In [1]:
import numpy as np
import pandas as pd
import fasttext
import fasttext.util
import nltk
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from scipy import spatial

ModuleNotFoundError: No module named 'fasttext'

In [2]:
# If it does not exist download the pretrained fasttext embeddings
! [[ ! -d "data" ]] && mkdir data
! [[ ! -f "data/wiki.simple.bin" ]] && wget "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm -r data/wiki.simple.vec

In [3]:
# Load in the fasttext pretrained embeddings and reduce to 100 dim
ft = fasttext.load_model('data/wiki.simple.bin')
fasttext.util.reduce_model(ft, 100)
ft.get_dimension()



100

In [4]:
# Load in the preprocessed clickbait dataset
df = pd.read_csv('data/cleaned_clickbait.csv')

In [5]:
nltk.download('punkt')
def doc_embedding(text):
    text=str(text)
    tokens = word_tokenize(text.lower())
    word_matrix = np.empty((1,100))
    for i in tokens:
        try:
            word_embed = np.array(ft[i]).reshape((1,100))
            word_matrix = np.append(word_matrix,word_embed,axis=0)
        except:
            pass
    sentence_embed = np.mean(word_matrix,axis=0).reshape(1,100)
    return sentence_embed

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
%%time
# Get the embeddings for the post text
df['postTextEmbed'] = [doc_embedding(text)[0] for text in df['postText']]

CPU times: user 3.63 s, sys: 33.1 ms, total: 3.67 s
Wall time: 3.63 s


In [7]:
%%time
# Get the embedding for the paragraphs
df['targetParagraphsEmbed'] = [doc_embedding(text)[0] for text in df['targetParagraphs']]

CPU times: user 2min 36s, sys: 181 ms, total: 2min 36s
Wall time: 2min 36s


In [8]:
%%time
# Get the embedding for the target title
df['targetTitleEmbed'] = [doc_embedding(text)[0] for text in df['targetTitle']]

CPU times: user 3.52 s, sys: 40.7 ms, total: 3.56 s
Wall time: 3.54 s


In [9]:
%%time
# Get the embedding for the target description
df['targetDescriptionEmbed'] = [doc_embedding(text)[0] for text in df['targetDescription']]

CPU times: user 5.06 s, sys: 30.3 ms, total: 5.09 s
Wall time: 5.06 s


In [10]:
%%time
# Get the embedding for the target keywords
df['targetKeywordsEmbed'] = [doc_embedding(text)[0] for text in df['targetKeywords']]

CPU times: user 3.23 s, sys: 21.1 ms, total: 3.25 s
Wall time: 3.21 s


In [12]:
def embed_cosine(col1,col2):
    cos_similaritycol = []
    for i in range(len(col1)):
        val1 = col1.iloc[i]
        val2 = col2.iloc[i]
        similarity = 1 - spatial.distance.cosine(val1, val2)
        cos_similaritycol.append(similarity)
    return cos_similaritycol

In [13]:
%%time
# Embed cosine similarities
df['postText_Paragraphs_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetParagraphsEmbed'])
df['postText_Title_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetTitleEmbed'])
df['postText_Description_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetDescriptionEmbed'])
df['postText_Keywords_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetKeywordsEmbed'])
df['Paragraph_Title_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetTitleEmbed'])
df['Paragraphs_Description_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetDescriptionEmbed'])
df['Paragraphs_Keywords_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetKeywordsEmbed'])
df['Title_Description_Similarity'] = embed_cosine(df['targetTitleEmbed'], df['targetDescriptionEmbed'])
df['Title_Keywords_Similarity'] = embed_cosine(df['targetTitleEmbed'], df['targetKeywordsEmbed'])
df['Description_Keywords_Similarity'] = embed_cosine(df['targetDescriptionEmbed'], df['targetKeywordsEmbed'])

  uu = np.average(np.square(u), weights=w)
  vv = np.average(np.square(v), weights=w)
  uv = np.average(u * v, weights=w)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  dist = 1.0 - uv / np.sqrt(uu * vv)


CPU times: user 6.9 s, sys: 0 ns, total: 6.9 s
Wall time: 6.9 s


In [14]:
# Create new DF with similarity scores
X = df[[
'postText_Paragraphs_Similarity',
'postText_Title_Similarity',
'postText_Description_Similarity',
'postText_Keywords_Similarity',
'Paragraph_Title_Similarity',
'Paragraphs_Description_Similarity',
'Paragraphs_Keywords_Similarity',
'Title_Description_Similarity',
'Title_Keywords_Similarity',
'Description_Keywords_Similarity'
]]

y = df['truthClass']

In [15]:
%%time
# Split training/testing dataset
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

CPU times: user 22.7 ms, sys: 0 ns, total: 22.7 ms
Wall time: 21 ms


In [16]:
# Fit the XGB model
xgbmodel = xgb.XGBClassifier( max_depth=5,learning_rate=0.1,n_estimators=50,random_state=1,objective='binary:logistic')
xgbmodel.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=16, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
# Evaluate the xgb model
ypred = xgbmodel.predict(X_test)

In [18]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

   clickbait       0.67      0.12      0.20      1428
no-clickbait       0.78      0.98      0.87      4434

    accuracy                           0.77      5862
   macro avg       0.72      0.55      0.53      5862
weighted avg       0.75      0.77      0.70      5862



In [19]:
print("train score:", xgbmodel.score(X_train, y_train))
print("test score:", xgbmodel.score(X_test, y_test))

train score: 0.7799064053816905
test score: 0.7712384851586489
