In [1]:
import numpy as np
import pandas as pd
import fasttext
import fasttext.util
import nltk
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from scipy import spatial

In [2]:
# If it does not exist download the pretrained fasttext embeddings
! [[ ! -d "data" ]] && mkdir data
! [[ ! -f "data/wiki.simple.bin" ]] && wget "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm -r data/wiki.simple.vec

In [3]:
# Load in the fasttext pretrained embeddings and reduce to 100 dim
ft = fasttext.load_model('data/wiki.simple.bin')
fasttext.util.reduce_model(ft, 100)
ft.get_dimension()



100

In [4]:
# Load in the preprocessed clickbait dataset
df = pd.read_csv('data/cleaned_clickbait.csv')

In [5]:
nltk.download('punkt')
def doc_embedding(text):
    text=str(text)
    tokens = word_tokenize(text.lower())
    word_matrix = np.empty((1,100))
    for i in tokens:
        try:
            word_embed = np.array(ft[i]).reshape((1,100))
            word_matrix = np.append(word_matrix,word_embed,axis=0)
        except:
            pass
    sentence_embed = np.mean(word_matrix,axis=0).reshape(1,100)
    return sentence_embed

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
%%time
# Get the embeddings for the post text
df['postTextEmbed'] = [doc_embedding(text)[0] for text in df['postText']]

CPU times: user 3.49 s, sys: 36.2 ms, total: 3.52 s
Wall time: 3.52 s


In [7]:
%%time
# Get the embedding for the paragraphs
df['targetParagraphsEmbed'] = [doc_embedding(text)[0] for text in df['targetParagraphs']]

CPU times: user 2min 12s, sys: 166 ms, total: 2min 12s
Wall time: 2min 12s


In [8]:
%%time
# Get the embedding for the target title
df['targetTitleEmbed'] = [doc_embedding(text)[0] for text in df['targetTitle']]

CPU times: user 3.38 s, sys: 20.6 ms, total: 3.4 s
Wall time: 3.39 s


In [9]:
%%time
# Get the embedding for the target description
df['targetDescriptionEmbed'] = [doc_embedding(text)[0] for text in df['targetDescription']]

CPU times: user 4.68 s, sys: 83.3 ms, total: 4.76 s
Wall time: 4.72 s


In [10]:
df

Unnamed: 0,postText,id,targetParagraphs,targetTitle,postTimestamp,targetKeywords,targetDescription,Day,Hour,TimeClass,truthJudgments,truthMean,truthClass,truthMedian,truthMode,postTextEmbed,targetParagraphsEmbed,targetTitleEmbed,targetDescriptionEmbed
0,uk’ respon modern slaveri leav victim destitut...,858462320779026433,thousand modern slaveri victim xanot come forw...,‘inexcusable’ failur uk’ respon modern slaveri...,Sat Apr 29 23:25:41 +0000 2017,"modern slavery, Department For Work And Pensio...",“inexcusable” failur uk’ deal modern slaveri l...,Sat,23,Night,"[0.3333333333, 0.0, 0.3333333333, 0.0, 0.0]",0.133333,no-clickbait,0.000000,0.000000,"[-0.05044374526055022, 0.5679600010520186, -0....","[0.159912214368283, 0.46272698386615935, -0.70...","[0.05919554944643203, 0.48397695956587355, -0....","[0.1450776772954586, 0.4756475419655379, -0.61..."
1,good,858421020331560960,presid donald trump appoint xapro life advoc p...,donald trump appoint pro life advoc assist sec...,Sat Apr 29 20:41:34 +0000 2017,"Americans United for Life, Dr. Charmaine Yoest...",presid donald trump appoint pro life advoc pre...,Sat,20,Night,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.000000,clickbait,1.000000,1.000000,"[0.30677539110183716, -0.03660022094845772, -0...","[-0.15826625374092937, 0.5152577985333295, -0....","[-0.3652144213291732, 0.3495841439474713, -0.5...","[-0.3135107120668346, 0.4963127594779838, -0.6..."
2,forgotten trump roast reliv brutal thrash new ...,858368123753435136,xawhit hou correspondents’ dinner xamost enter...,‘forgotten’ trump roast reliv brutal thrash ne...,Sat Apr 29 17:11:23 +0000 2017,"trump whcd, whcd, white house correspondents d...",presid trump won year white hou correspond din...,Sat,17,Evening,"[0.3333333333, 1.0, 0.3333333333, 0.0, 0.66666...",0.466667,no-clickbait,0.333333,0.333333,"[-0.34043241666883906, 0.518682754852555, -0.7...","[-0.12473862715173127, 0.3709469592780704, -0....","[-0.3224421167710366, 0.4632100716788465, -0.7...","[-0.1863551214337349, 0.4779821344961723, -0.7..."
3,meet happiest dog world,858323428260139008,ador probabl understat ador huski goe maru sha...,meet happiest dog world maru huski look like p...,Sat Apr 29 14:13:46 +0000 2017,"Maru, husky, dogs, pandas, furball, instagram",articl maru huski dog uncanni resembl panda,Sat,14,Afternoon,"[1.0, 0.6666666666, 1.0, 1.0, 1.0]",0.933333,clickbait,1.000000,1.000000,"[-0.03793611228466034, 0.16673304066061972, -0...","[0.0350684677844639, 0.5266103358673198, -1.01...","[-0.05616750717163086, 0.3691286753863096, -0....","[-0.0992146972566843, 0.5849153376184404, -0.9..."
4,tokyo subwai shut amid fear immin north korean...,858283602626347008,tokyo major subwai sai shut line minut receiv ...,tokyo subwai shut amid fear immin north korean...,Sat Apr 29 11:35:31 +0000 2017,"Tokyo,subway,shut,fears,North,Korean,attack",temporari suspen minut affect peopl servic hal...,Sat,11,Morning,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.000000,no-clickbait,0.000000,0.000000,"[-0.1009528338521098, 0.7661230141917864, -0.4...","[-0.03832903854718263, 0.8769802561097524, -0....","[-0.1009528338521098, 0.7661230141917864, -0.4...","[0.22062385590239006, 0.6806483268737793, -0.4..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19533,brazil soccer team pilot final interview plane...,804250183642976256,watch live xajo biden honor senat floor brief ...,nbc new video brazil soccer team pilot’ final ...,Thu Dec 01 09:06:00 +0000 2016,,nbc new,Thu,9,Morning,"[0.3333333333, 0.0, 1.0, 0.6666666666, 0.0]",0.400000,no-clickbait,0.333333,0.000000,"[nan, 0.6642926335334778, -0.7709562588821758,...","[nan, 0.750538294785656, -0.712152223020131, 0...","[nan, 0.5136984319050009, -0.8786058175341728,...","[nan, 0.25603582461675006, -0.6364354391892751..."
19534,😱😱😱😱😱😱😱😱😱😱😱😱😱😱,804156272086020096,novemb politico report eric trump kill deer re...,politico scoop eric trump kill deer,Thu Dec 01 02:52:50 +0000 2016,Politico Scoop: Eric Trump Killed Two Deer,politico scoop eric trump kill deer,Thu,2,Night,"[1.0, 1.0, 1.0, 1.0, 0.0]",0.800000,clickbait,1.000000,1.000000,"[nan, 0.45645102858543396, -0.1882218867540359...","[-0.20182822051719995, 0.5217976072528674, -0....","[nan, 0.3419605953884976, -0.89663394008364, 0...","[nan, 0.3419605953884976, -0.89663394008364, 0..."
19535,french forest high school wai high rise build ...,804149798651588608,forest high school sydnei northern beach wai h...,french forest high school reloc wai high rise ...,Thu Dec 01 02:27:07 +0000 2016,"frenchs forest, northern beaches, sydney, rede...",forest high school sydnei northern beach wai h...,Thu,2,Night,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.000000,no-clickbait,0.000000,0.000000,"[nan, 0.883365980216435, -0.1979980251884886, ...","[-0.012076338886150293, 0.8479884563239066, -0...","[nan, 0.9141233201537814, -0.19972410864595855...","[-0.1634817014137904, 0.8815555612246195, -0.1..."
19536,jeff… bruh,804134698729385984,nfl coach lot inform rememb understand confu a...,angel ram jeff fisher think danni woodhead pla...,Thu Dec 01 01:27:06 +0000 2016,"Humor, Football, NFL, NFC West, Los Angeles Ra...",angel ram new rumor score schedul predict pick...,Thu,1,Night,"[0.0, 0.0, 0.0, 0.0, 0.6666666666]",0.133333,no-clickbait,0.000000,0.000000,"[nan, 0.23781178891658783, -0.6446726123491923...","[-0.30523523680555326, 0.4798598967738346, -0....","[nan, 0.3088175149168819, -0.892419520020485, ...","[nan, 0.40404493383624973, -0.9443311213570482..."


In [11]:
def embed_cosine(col1,col2):
    cos_similaritycol = []
    for i in range(len(col1)):
        #print(i)
        val1 = col1.iloc[i]
        val2 = col2.iloc[i]
        similarity = 1 - spatial.distance.cosine(val1, val2)
        cos_similaritycol.append(similarity)
    return cos_similaritycol

In [12]:
%%time
df['postText_Paragraph_Similarity'] = embed_cosine(df['postTextEmbed'],df['targetParagraphsEmbed'])
df['postText_Title_Similarity'] = embed_cosine(df['postTextEmbed'],df['targetTitleEmbed'])
df['postText_Description_Similarity'] = embed_cosine(df['postTextEmbed'],df['targetDescriptionEmbed'])
df['Paragraph_Title_Similarity'] = embed_cosine(df['targetParagraphsEmbed'],df['targetTitleEmbed'])
df['Paragraph_Description_Similarity'] = embed_cosine(df['targetParagraphsEmbed'],df['targetDescriptionEmbed'])
df['Title_Description_Similarity'] = embed_cosine(df['targetTitleEmbed'],df['targetDescriptionEmbed'])

  uv = np.average(u * v, weights=w)
  uu = np.average(np.square(u), weights=w)
  vv = np.average(np.square(v), weights=w)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


CPU times: user 4.2 s, sys: 7.69 ms, total: 4.21 s
Wall time: 4.21 s


In [13]:
df

Unnamed: 0,postText,id,targetParagraphs,targetTitle,postTimestamp,targetKeywords,targetDescription,Day,Hour,TimeClass,...,postTextEmbed,targetParagraphsEmbed,targetTitleEmbed,targetDescriptionEmbed,postText_Paragraph_Similarity,postText_Title_Similarity,postText_Description_Similarity,Paragraph_Title_Similarity,Paragraph_Description_Similarity,Title_Description_Similarity
0,uk’ respon modern slaveri leav victim destitut...,858462320779026433,thousand modern slaveri victim xanot come forw...,‘inexcusable’ failur uk’ respon modern slaveri...,Sat Apr 29 23:25:41 +0000 2017,"modern slavery, Department For Work And Pensio...",“inexcusable” failur uk’ deal modern slaveri l...,Sat,23,Night,...,"[-0.05044374526055022, 0.5679600010520186, -0....","[0.159912214368283, 0.46272698386615935, -0.70...","[0.05919554944643203, 0.48397695956587355, -0....","[0.1450776772954586, 0.4756475419655379, -0.61...",,,0.0,,,
1,good,858421020331560960,presid donald trump appoint xapro life advoc p...,donald trump appoint pro life advoc assist sec...,Sat Apr 29 20:41:34 +0000 2017,"Americans United for Life, Dr. Charmaine Yoest...",presid donald trump appoint pro life advoc pre...,Sat,20,Night,...,"[0.30677539110183716, -0.03660022094845772, -0...","[-0.15826625374092937, 0.5152577985333295, -0....","[-0.3652144213291732, 0.3495841439474713, -0.5...","[-0.3135107120668346, 0.4963127594779838, -0.6...",,,0.0,,,
2,forgotten trump roast reliv brutal thrash new ...,858368123753435136,xawhit hou correspondents’ dinner xamost enter...,‘forgotten’ trump roast reliv brutal thrash ne...,Sat Apr 29 17:11:23 +0000 2017,"trump whcd, whcd, white house correspondents d...",presid trump won year white hou correspond din...,Sat,17,Evening,...,"[-0.34043241666883906, 0.518682754852555, -0.7...","[-0.12473862715173127, 0.3709469592780704, -0....","[-0.3224421167710366, 0.4632100716788465, -0.7...","[-0.1863551214337349, 0.4779821344961723, -0.7...",0.0,,0.0,0.0,0.920387,0.0
3,meet happiest dog world,858323428260139008,ador probabl understat ador huski goe maru sha...,meet happiest dog world maru huski look like p...,Sat Apr 29 14:13:46 +0000 2017,"Maru, husky, dogs, pandas, furball, instagram",articl maru huski dog uncanni resembl panda,Sat,14,Afternoon,...,"[-0.03793611228466034, 0.16673304066061972, -0...","[0.0350684677844639, 0.5266103358673198, -1.01...","[-0.05616750717163086, 0.3691286753863096, -0....","[-0.0992146972566843, 0.5849153376184404, -0.9...",0.0,,0.0,0.0,0.883333,0.0
4,tokyo subwai shut amid fear immin north korean...,858283602626347008,tokyo major subwai sai shut line minut receiv ...,tokyo subwai shut amid fear immin north korean...,Sat Apr 29 11:35:31 +0000 2017,"Tokyo,subway,shut,fears,North,Korean,attack",temporari suspen minut affect peopl servic hal...,Sat,11,Morning,...,"[-0.1009528338521098, 0.7661230141917864, -0.4...","[-0.03832903854718263, 0.8769802561097524, -0....","[-0.1009528338521098, 0.7661230141917864, -0.4...","[0.22062385590239006, 0.6806483268737793, -0.4...",,,0.0,,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19533,brazil soccer team pilot final interview plane...,804250183642976256,watch live xajo biden honor senat floor brief ...,nbc new video brazil soccer team pilot’ final ...,Thu Dec 01 09:06:00 +0000 2016,,nbc new,Thu,9,Morning,...,"[nan, 0.6642926335334778, -0.7709562588821758,...","[nan, 0.750538294785656, -0.712152223020131, 0...","[nan, 0.5136984319050009, -0.8786058175341728,...","[nan, 0.25603582461675006, -0.6364354391892751...",,,,,,
19534,😱😱😱😱😱😱😱😱😱😱😱😱😱😱,804156272086020096,novemb politico report eric trump kill deer re...,politico scoop eric trump kill deer,Thu Dec 01 02:52:50 +0000 2016,Politico Scoop: Eric Trump Killed Two Deer,politico scoop eric trump kill deer,Thu,2,Night,...,"[nan, 0.45645102858543396, -0.1882218867540359...","[-0.20182822051719995, 0.5217976072528674, -0....","[nan, 0.3419605953884976, -0.89663394008364, 0...","[nan, 0.3419605953884976, -0.89663394008364, 0...",,,,,,
19535,french forest high school wai high rise build ...,804149798651588608,forest high school sydnei northern beach wai h...,french forest high school reloc wai high rise ...,Thu Dec 01 02:27:07 +0000 2016,"frenchs forest, northern beaches, sydney, rede...",forest high school sydnei northern beach wai h...,Thu,2,Night,...,"[nan, 0.883365980216435, -0.1979980251884886, ...","[-0.012076338886150293, 0.8479884563239066, -0...","[nan, 0.9141233201537814, -0.19972410864595855...","[-0.1634817014137904, 0.8815555612246195, -0.1...",,,,,0.000000,
19536,jeff… bruh,804134698729385984,nfl coach lot inform rememb understand confu a...,angel ram jeff fisher think danni woodhead pla...,Thu Dec 01 01:27:06 +0000 2016,"Humor, Football, NFL, NFC West, Los Angeles Ra...",angel ram new rumor score schedul predict pick...,Thu,1,Night,...,"[nan, 0.23781178891658783, -0.6446726123491923...","[-0.30523523680555326, 0.4798598967738346, -0....","[nan, 0.3088175149168819, -0.892419520020485, ...","[nan, 0.40404493383624973, -0.9443311213570482...",,,,,,


In [14]:
# Create new DF with similarity scores
X = df[['postText_Paragraph_Similarity',
'postText_Title_Similarity',
'postText_Description_Similarity',
'Paragraph_Title_Similarity',
'Paragraph_Description_Similarity',
'Title_Description_Similarity']]

y = df['truthClass']

In [15]:
%%time
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

CPU times: user 26.6 ms, sys: 980 µs, total: 27.5 ms
Wall time: 25.6 ms


In [16]:
xgbmodel = xgb.XGBClassifier( max_depth=5,learning_rate=0.1,n_estimators=50,random_state=1,objective='binary:logistic')
xgbmodel.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=16, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
ypred = xgbmodel.predict(X_test)

In [18]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

   clickbait       0.69      0.21      0.33      1428
no-clickbait       0.79      0.97      0.87      4434

    accuracy                           0.79      5862
   macro avg       0.74      0.59      0.60      5862
weighted avg       0.77      0.79      0.74      5862



In [19]:
print("train score:", xgbmodel.score(X_train, y_train))
print("test score:", xgbmodel.score(X_test, y_test))

train score: 0.7977478794969289
test score: 0.7852268850221767
