In [1]:
import numpy as np
import pandas as pd
import nltk
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from scipy import spatial

In [2]:
# If it does not exist download the pretrained glove embeddings
! [[ ! -d "data" ]] && mkdir data
! [[ ! -f "data/glove.twitter.27B.100d.txt" ]] && wget "http://nlp.stanford.edu/data/glove.twitter.27B.zip" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm data/glove.twitter.27B.50d.txt data/glove.twitter.27B.200d.txt data/glove.twitter.27B.25d.txt

In [3]:
def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r',encoding="utf8")
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [4]:
# Load glove embeddings and Clickbait dataset
embeddings = loadGloveModel('data/glove.twitter.27B.100d.txt')
df=pd.read_csv('data/cleaned_clickbait.csv')

Loading Glove Model
1193514  words loaded!


In [5]:
def doc_embedding(text):
    text=str(text)
    tokens = word_tokenize(text.lower())
    word_matrix = np.empty((1,100))
    for i in tokens:
        try:
            word_embed = np.array(embeddings[i]).reshape((1,100))
            word_matrix = np.append(word_matrix,word_embed,axis=0)
        except:
            pass
    sentence_embed = np.mean(word_matrix,axis=0).reshape(1,100)
    return sentence_embed

In [6]:
%%time
# Get the embeddings for the post text
df['postTextEmbed'] = [doc_embedding(text)[0] for text in df['postText']]

CPU times: user 2.29 s, sys: 0 ns, total: 2.29 s
Wall time: 2.28 s


In [7]:
%%time
# Get the embedding for the paragraphs
df['targetParagraphsEmbed'] = [doc_embedding(text)[0] for text in df['targetParagraphs']]

CPU times: user 1min 40s, sys: 204 ms, total: 1min 40s
Wall time: 1min 40s


In [8]:
%%time
# Get the embedding for the target title
df['targetTitleEmbed'] = [doc_embedding(text)[0] for text in df['targetTitle']]

CPU times: user 2.22 s, sys: 0 ns, total: 2.22 s
Wall time: 2.21 s


In [9]:
%%time
# Get the embedding for the target description
df['targetDescriptionEmbed'] = [doc_embedding(text)[0] for text in df['targetDescription']]

CPU times: user 2.93 s, sys: 9.85 ms, total: 2.94 s
Wall time: 2.93 s


In [10]:
%%time
# Get the embedding for the target keywords
df['targetKeywordsEmbed'] = [doc_embedding(text)[0] for text in df['targetKeywords']]

CPU times: user 1.96 s, sys: 29.8 ms, total: 1.99 s
Wall time: 1.99 s


In [11]:
def embed_cosine(col1,col2):
    cos_similaritycol = []
    for i in range(len(col1)):
        val1 = col1.iloc[i]
        val2 = col2.iloc[i]
        similarity = 1 - spatial.distance.cosine(val1, val2)
        cos_similaritycol.append(similarity)
    return cos_similaritycol

In [12]:
%%time
# Embed cosine similarities
df['postText_Paragraphs_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetParagraphsEmbed'])
df['postText_Title_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetTitleEmbed'])
df['postText_Description_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetDescriptionEmbed'])
df['postText_Keywords_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetKeywordsEmbed'])
df['Paragraph_Title_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetTitleEmbed'])
df['Paragraphs_Description_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetDescriptionEmbed'])
df['Paragraphs_Keywords_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetKeywordsEmbed'])
df['Title_Description_Similarity'] = embed_cosine(df['targetTitleEmbed'], df['targetDescriptionEmbed'])
df['Title_Keywords_Similarity'] = embed_cosine(df['targetTitleEmbed'], df['targetKeywordsEmbed'])
df['Description_Keywords_Similarity'] = embed_cosine(df['targetDescriptionEmbed'], df['targetKeywordsEmbed'])

  uu = np.average(np.square(u), weights=w)


CPU times: user 6.69 s, sys: 9.7 ms, total: 6.7 s
Wall time: 6.7 s


In [13]:
# Create new DF with similarity scores
X = df[[
'postText_Paragraphs_Similarity',
'postText_Title_Similarity',
'postText_Description_Similarity',
'postText_Keywords_Similarity',
'Paragraph_Title_Similarity',
'Paragraphs_Description_Similarity',
'Paragraphs_Keywords_Similarity',
'Title_Description_Similarity',
'Title_Keywords_Similarity',
'Description_Keywords_Similarity'
]]

y = df['truthClass']

In [14]:
%%time
# Split training/testing dataset
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

CPU times: user 25.2 ms, sys: 102 µs, total: 25.3 ms
Wall time: 23.6 ms


In [15]:
# Fit the XGB model
xgbmodel = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=50, random_state=1, objective='binary:logistic')
xgbmodel.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=16, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
# Evaluate the xgb model
ypred = xgbmodel.predict(X_test)

In [17]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

   clickbait       0.67      0.23      0.35      1428
no-clickbait       0.80      0.96      0.87      4434

    accuracy                           0.78      5862
   macro avg       0.73      0.60      0.61      5862
weighted avg       0.76      0.78      0.74      5862



In [18]:
print("train score:", xgbmodel.score(X_train, y_train))
print("test score:", xgbmodel.score(X_test, y_test))

train score: 0.798917812225797
test score: 0.7848857045377005
