In [1]:
import numpy as np
import pandas as pd
import fasttext
import fasttext.util
import nltk
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from scipy import spatial

In [2]:
# Download the pretrained fastText "wiki.simple" vectors unless data/wiki.simple.bin already exists.
# First command: create the data/ directory when it is missing.
# Second command: fetch the zip archive to data/temp.zip, extract it into data/, then delete the
# archive and data/wiki.simple.vec (presumably extracted from the archive; only the .bin file is
# loaded later in this notebook).
# NOTE(review): `[[ ... ]]` is not POSIX sh syntax -- confirm the shell behind `!` supports it.
! [[ ! -d "data" ]] && mkdir data
! [[ ! -f "data/wiki.simple.bin" ]] && wget "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm -r data/wiki.simple.vec

In [3]:
# Load the pretrained fastText binary model from data/ and reduce its vectors to 100 dimensions.
ft = fasttext.load_model('data/wiki.simple.bin')
# reduce_model is called for its effect on `ft`; its return value is not used here.
fasttext.util.reduce_model(ft, 100)
# Last expression of the cell: displays the model's vector dimension as a sanity check.
ft.get_dimension()



100

In [4]:
# Load the preprocessed clickbait dataset. Later cells read the columns postText,
# targetParagraphs, targetTitle, targetDescription, targetKeywords and truthClass.
df = pd.read_csv('data/cleaned_clickbait.csv')

In [5]:
nltk.download('punkt')
def doc_embedding(text):
    """Return the mean fastText word vector of ``text`` as a ``(1, dim)`` array.

    The text is coerced to ``str``, lower-cased and split with NLTK's
    ``word_tokenize``. Each token is looked up in the module-level fastText
    model ``ft`` and the per-token vectors are averaged.

    Parameters
    ----------
    text : any
        Document to embed. Non-string values are converted with ``str()``.
        NOTE(review): a missing value (NaN) therefore becomes the literal
        token "nan" -- confirm this is intended for the cleaned dataset.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(1, dim)``, where ``dim`` is
        ``ft.get_dimension()``. All zeros when no token yields a vector.
    """
    dim = ft.get_dimension()
    # Collect rows in a list and average once at the end. Seeding a matrix
    # with np.empty would add an uninitialised row to the mean, and calling
    # np.append per token copies the whole matrix each time (quadratic).
    vectors = []
    for token in word_tokenize(str(text).lower()):
        try:
            vectors.append(np.asarray(ft[token], dtype=np.float64).reshape(dim))
        except Exception:
            # Best effort: skip a token the model cannot embed. Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate so
            # a long embedding run can still be interrupted.
            continue
    if not vectors:
        # Nothing to average (empty text or every token failed).
        return np.zeros((1, dim))
    return np.mean(vectors, axis=0).reshape(1, dim)

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
%%time
# Embed every post text; doc_embedding returns a single-row 2-D array, so [0] keeps that row as a 1-D vector
df['postTextEmbed'] = [row[0] for row in map(doc_embedding, df['postText'])]

CPU times: user 3.56 s, sys: 0 ns, total: 3.56 s
Wall time: 3.55 s


In [7]:
%%time
# Embed every article body (targetParagraphs); [0] unwraps the single-row array returned by doc_embedding
df['targetParagraphsEmbed'] = [row[0] for row in map(doc_embedding, df['targetParagraphs'])]

CPU times: user 2min 39s, sys: 321 ms, total: 2min 40s
Wall time: 2min 40s


In [8]:
%%time
# Embed every target title; [0] unwraps the single-row array returned by doc_embedding
df['targetTitleEmbed'] = [row[0] for row in map(doc_embedding, df['targetTitle'])]

CPU times: user 3.41 s, sys: 20.1 ms, total: 3.43 s
Wall time: 3.42 s


In [9]:
%%time
# Embed every target description; [0] unwraps the single-row array returned by doc_embedding
df['targetDescriptionEmbed'] = [row[0] for row in map(doc_embedding, df['targetDescription'])]

CPU times: user 4.93 s, sys: 0 ns, total: 4.93 s
Wall time: 4.92 s


In [10]:
%%time
# Embed every target keyword string; [0] unwraps the single-row array returned by doc_embedding
df['targetKeywordsEmbed'] = [row[0] for row in map(doc_embedding, df['targetKeywords'])]

CPU times: user 3.17 s, sys: 9.7 ms, total: 3.18 s
Wall time: 3.17 s


In [11]:
def embed_cosine(col1,col2):
    """Row-wise cosine similarity between two columns of embedding vectors.

    Both arguments are read positionally through ``.iloc``. Entry ``k`` of the
    result is ``1 - spatial.distance.cosine(col1.iloc[k], col2.iloc[k])``.

    Returns a plain Python list with ``len(col1)`` entries.
    """
    n_rows = len(col1)
    return [
        1 - spatial.distance.cosine(col1.iloc[row], col2.iloc[row])
        for row in range(n_rows)
    ]

In [12]:
%%time
# Pairwise cosine similarities between the five embedded text fields.
# Each entry is (new feature column, left embedding column, right embedding column).
similarity_pairs = [
    ('postText_Paragraphs_Similarity', 'postTextEmbed', 'targetParagraphsEmbed'),
    ('postText_Title_Similarity', 'postTextEmbed', 'targetTitleEmbed'),
    ('postText_Description_Similarity', 'postTextEmbed', 'targetDescriptionEmbed'),
    ('postText_Keywords_Similarity', 'postTextEmbed', 'targetKeywordsEmbed'),
    ('Paragraph_Title_Similarity', 'targetParagraphsEmbed', 'targetTitleEmbed'),
    ('Paragraphs_Description_Similarity', 'targetParagraphsEmbed', 'targetDescriptionEmbed'),
    ('Paragraphs_Keywords_Similarity', 'targetParagraphsEmbed', 'targetKeywordsEmbed'),
    ('Title_Description_Similarity', 'targetTitleEmbed', 'targetDescriptionEmbed'),
    ('Title_Keywords_Similarity', 'targetTitleEmbed', 'targetKeywordsEmbed'),
    ('Description_Keywords_Similarity', 'targetDescriptionEmbed', 'targetKeywordsEmbed'),
]
for feature_name, left_col, right_col in similarity_pairs:
    df[feature_name] = embed_cosine(df[left_col], df[right_col])

  uu = np.average(np.square(u), weights=w)
  vv = np.average(np.square(v), weights=w)
  uv = np.average(u * v, weights=w)
  dist = 1.0 - uv / np.sqrt(uu * vv)


CPU times: user 6.81 s, sys: 0 ns, total: 6.81 s
Wall time: 6.81 s


In [13]:
# Feature matrix: the ten pairwise cosine-similarity columns. Target: the truthClass label.
similarity_features = [
    'postText_Paragraphs_Similarity',
    'postText_Title_Similarity',
    'postText_Description_Similarity',
    'postText_Keywords_Similarity',
    'Paragraph_Title_Similarity',
    'Paragraphs_Description_Similarity',
    'Paragraphs_Keywords_Similarity',
    'Title_Description_Similarity',
    'Title_Keywords_Similarity',
    'Description_Keywords_Similarity',
]
X = df[similarity_features]

y = df['truthClass']

In [14]:
%%time
# Split training/testing dataset: 30 % of the rows are held out, stratified on y.
# The splitter yields 5 shuffles but only one train/test partition is used
# downstream. Take the last one explicitly rather than overwriting the
# variables on every pass of a loop; with random_state=1 this is the same
# partition the loop-and-overwrite form ends with.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
train_index, test_index = list(sss.split(X, y))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 22.2 ms


In [15]:
%%time
# Hyper-parameter grid searched with 5-fold cross-validation, scored on accuracy
parameters = {
    'max_depth': [5, 6, 7, 8],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [150, 200, 300, 400],
}
xgbmodel = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=1, objective='binary:logistic'),
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































CPU times: user 57min 42s, sys: 2.81 s, total: 57min 45s
Wall time: 3min 44s


In [16]:
# Evaluate the xgb model: predict labels for the held-out test split with the fitted grid search
ypred = xgbmodel.predict(X_test)

In [17]:
# Per-class precision / recall / F1 of the predictions against the held-out labels
report_text = classification_report(y_test, ypred)
print(report_text)

              precision    recall  f1-score   support

   clickbait       0.68      0.12      0.20      1428
no-clickbait       0.78      0.98      0.87      4434

    accuracy                           0.77      5862
   macro avg       0.73      0.55      0.53      5862
weighted avg       0.75      0.77      0.70      5862



In [18]:
# Compare the grid search's score on the training split with the held-out split
for score_label, features, labels in (
    ("train score:", X_train, y_train),
    ("test score:", X_test, y_test),
):
    print(score_label, xgbmodel.score(features, labels))

train score: 0.778590231061714
test score: 0.7715796656431252
