In [29]:
import numpy as np
import pandas as pd
import nltk
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from scipy import spatial
from sklearn.model_selection import GridSearchCV

In [2]:
# If it does not exist download the pretrained glove embeddings
! [[ ! -d "data" ]] && mkdir data
! [[ ! -f "data/glove.twitter.27B.100d.txt" ]] && wget "http://nlp.stanford.edu/data/glove.twitter.27B.zip" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm data/glove.twitter.27B.50d.txt data/glove.twitter.27B.200d.txt data/glove.twitter.27B.25d.txt

In [2]:
def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r',encoding="utf8")
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [17]:
# Load glove embeddings and Clickbait dataset
embeddings = loadGloveModel('E:\TU Delft\Q3\Information Retrieval\Applied NLP Project\glove.twitter.27B/glove.twitter.27B.100d.txt')
df=pd.read_csv('cleaned_clickbait.csv')

Loading Glove Model
1193514  words loaded!


In [None]:
embeddings = loadGloveModel('data/glove.twitter.27B.100d.txt')
df=pd.read_csv('data/cleaned_clickbait.csv')

In [18]:
def doc_embedding(text):
    text=str(text)
    tokens = word_tokenize(text.lower())
    word_matrix = np.empty((1,100))
    for i in tokens:
        try:
            word_embed = np.array(embeddings[i]).reshape((1,100))
            word_matrix = np.append(word_matrix,word_embed,axis=0)
        except:
            pass
    sentence_embed = np.mean(word_matrix,axis=0).reshape(1,100)
    return sentence_embed

In [19]:
%%time
# Get the embeddings for the post text
df['postTextEmbed'] = [doc_embedding(text)[0] for text in df['postText']]

Wall time: 4.04 s


In [20]:
%%time
# Get the embedding for the paragraphs
df['targetParagraphsEmbed'] = [doc_embedding(text)[0] for text in df['targetParagraphs']]

Wall time: 8min 44s


In [21]:
%%time
# Get the embedding for the target title
df['targetTitleEmbed'] = [doc_embedding(text)[0] for text in df['targetTitle']]

Wall time: 4.32 s


In [22]:
%%time
# Get the embedding for the target description
df['targetDescriptionEmbed'] = [doc_embedding(text)[0] for text in df['targetDescription']]

Wall time: 6.58 s


In [23]:
%%time
# Get the embedding for the target keywords
df['targetKeywordsEmbed'] = [doc_embedding(text)[0] for text in df['targetKeywords']]

Wall time: 4.25 s


In [96]:
np.save('postTextEmbed.npy', df['postTextEmbed']) 
np.save('targetParagraphsEmbed.npy', df['targetParagraphsEmbed']) 
np.save('targetTitleEmbed.npy', df['targetTitleEmbed']) 
np.save('targetDescriptionEmbed.npy', df['targetDescriptionEmbed']) 
np.save('targetKeywordsEmbed.npy', df['targetKeywordsEmbed']) 

In [97]:
df['postTextEmbed'] = np.load('postTextEmbed.npy',allow_pickle='TRUE')
df['targetParagraphsEmbed'] = np.load('targetParagraphsEmbed.npy',allow_pickle='TRUE')
df['targetTitleEmbed'] = np.load('targetTitleEmbed.npy',allow_pickle='TRUE')
df['targetDescriptionEmbed'] = np.load('targetDescriptionEmbed.npy',allow_pickle='TRUE')
df['targetKeywordsEmbed'] = np.load('targetKeywordsEmbed.npy',allow_pickle='TRUE')

In [24]:
def embed_cosine(col1,col2):
    cos_similaritycol = []
    for i in range(len(col1)):
        #print(i)
        val1 = col1.iloc[i]
        val2 = col2.iloc[i]
        similarity = 1 - spatial.distance.cosine(val1, val2)
        cos_similaritycol.append(similarity)
    return cos_similaritycol

In [25]:
%%time
# Embed cosine similarities
df['postText_Paragraphs_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetParagraphsEmbed'])
df['postText_Title_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetTitleEmbed'])
df['postText_Description_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetDescriptionEmbed'])
df['postText_Keywords_Similarity'] = embed_cosine(df['postTextEmbed'], df['targetKeywordsEmbed'])
df['Paragraph_Title_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetTitleEmbed'])
df['Paragraphs_Description_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetDescriptionEmbed'])
df['Paragraphs_Keywords_Similarity'] = embed_cosine(df['targetParagraphsEmbed'], df['targetKeywordsEmbed'])
df['Title_Description_Similarity'] = embed_cosine(df['targetTitleEmbed'], df['targetDescriptionEmbed'])
df['Title_Keywords_Similarity'] = embed_cosine(df['targetTitleEmbed'], df['targetKeywordsEmbed'])
df['Description_Keywords_Similarity'] = embed_cosine(df['targetDescriptionEmbed'], df['targetKeywordsEmbed'])

Wall time: 12.1 s


In [26]:
df.columns

Index(['postText', 'id', 'targetParagraphs', 'targetTitle', 'postTimestamp',
       'targetKeywords', 'targetDescription', 'Day', 'Hour', 'TimeClass',
       'truthJudgments', 'truthMean', 'truthClass', 'truthMedian', 'truthMode',
       'postTextEmbed', 'targetParagraphsEmbed', 'targetTitleEmbed',
       'targetDescriptionEmbed', 'targetKeywordsEmbed',
       'postText_Paragraphs_Similarity', 'postText_Title_Similarity',
       'postText_Description_Similarity', 'postText_Keywords_Similarity',
       'Paragraph_Title_Similarity', 'Paragraphs_Description_Similarity',
       'Paragraphs_Keywords_Similarity', 'Title_Description_Similarity',
       'Title_Keywords_Similarity', 'Description_Keywords_Similarity'],
      dtype='object')

In [27]:
# Create new DF with similarity scores
X = df[[
'postText_Paragraphs_Similarity',
'postText_Title_Similarity',
'postText_Description_Similarity',
'postText_Keywords_Similarity',
'Paragraph_Title_Similarity',
'Paragraphs_Description_Similarity',
'Paragraphs_Keywords_Similarity',
'Title_Description_Similarity',
'Title_Keywords_Similarity',
'Description_Keywords_Similarity'
]]

y = df['truthClass']

In [28]:
%%time
# Split training/testing dataset
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

Wall time: 322 ms


In [61]:
%%time
parameters = {'max_depth':[5,6,7,8], 'learning_rate':[0.01,0.1,1], 'n_estimators' : [150,200,300,400] }
xgbmodel = GridSearchCV(estimator = xgb.XGBClassifier(random_state=1, objective='binary:logistic'),param_grid = parameters, cv = 5, scoring='accuracy').fit(X_train,y_train)



Wall time: 24min 43s


In [63]:
xgbmodel.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=150, n_jobs=4, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [92]:
# Fit the XGB model
xgbmodel = xgb.XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=200, random_state=1, objective='binary:logistic')
xgbmodel.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [93]:
# Evaluate the xgb model
ypred = xgbmodel.predict(X_test)

In [94]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

   clickbait       0.67      0.24      0.35      1428
no-clickbait       0.80      0.96      0.87      4434

    accuracy                           0.79      5862
   macro avg       0.73      0.60      0.61      5862
weighted avg       0.77      0.79      0.75      5862



In [95]:
print("train score:", xgbmodel.score(X_train, y_train))
print("test score:", xgbmodel.score(X_test, y_test))

train score: 0.8016964024568587
test score: 0.785568065506653
