In [1]:
import numpy as np
import pandas as pd
import nltk
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from scipy import spatial

In [2]:
# If it does not exist download the pretrained glove embeddings
# Step 1: create the local "data" directory when it is not already there.
# Step 2: only when the 100-dimensional vectors file is absent, fetch the
# GloVe Twitter 27B archive, unzip it into "data", delete the archive, and
# delete the 25d/50d/200d files so that only the 100d file is kept.
# NOTE(review): "[[ ... ]]" is not POSIX sh syntax — this presumably relies on
# the shell used by "!" being bash-compatible; confirm on the target machine.
! [[ ! -d "data" ]] && mkdir data
! [[ ! -f "data/glove.twitter.27B.100d.txt" ]] && wget "http://nlp.stanford.edu/data/glove.twitter.27B.zip" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm data/glove.twitter.27B.50d.txt data/glove.twitter.27B.200d.txt data/glove.twitter.27B.25d.txt

In [3]:
def loadGloveModel(File):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are parsed as floats.

    Args:
        File: Path of the text file to read (opened as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``numpy.ndarray`` of floats.
        If a token occurs on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    gloveModel = {}
    # The context manager guarantees the handle is closed, even when a
    # malformed line makes float() raise half-way through the file.
    with open(File, 'r', encoding="utf8") as f:
        for line in f:
            splitLines = line.split()
            if not splitLines:
                # Blank / whitespace-only line: there is no token to index.
                continue
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [4]:
# Load glove embeddings and Clickbait dataset
# `embeddings` is the token -> vector dict returned by loadGloveModel for the
# 100-dimensional GloVe Twitter file; `df` is the clickbait table read from
# CSV (later cells read its postText, targetParagraphs, targetTitle,
# targetDescription, targetKeywords and truthClass columns).
embeddings = loadGloveModel('data/glove.twitter.27B.100d.txt')
df=pd.read_csv('data/cleaned_clickbait.csv')

Loading Glove Model
1193514  words loaded!


In [5]:
def doc_embedding(text):
    """Return the mean GloVe vector of the known tokens in ``text``.

    The text is lower-cased and tokenised with ``word_tokenize``; every token
    present in the module-level ``embeddings`` mapping contributes its vector
    to an unweighted average.

    Args:
        text: The document to embed. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as an empty document
            so they are not embedded as the literal words "none"/"nan".

    Returns:
        ``numpy.ndarray`` of shape ``(1, 100)``. It is all zeros when no token
        of ``text`` has a usable embedding.
    """
    dim = 100  # width of the vectors in glove.twitter.27B.100d.txt

    # Missing values carry no text; do not let str() turn them into words.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return np.zeros((1, dim))

    tokens = word_tokenize(str(text).lower())

    # Collect vectors in a list and average once. Starting from
    # np.empty((1, dim)) would mix an uninitialised row into the mean, and
    # np.append inside the loop re-copies the whole matrix for every token.
    vectors = []
    for token in tokens:
        try:
            vector = np.asarray(embeddings[token])
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average.
            continue
        if vector.size != dim:
            # Malformed entry (wrong width): skip it rather than crash.
            continue
        vectors.append(vector.reshape(dim))

    if not vectors:
        return np.zeros((1, dim))

    sentence_embed = np.mean(vectors, axis=0).reshape(1, dim)
    return sentence_embed

In [6]:
%%time
# Embed every post text; doc_embedding returns a (1, 100) array, so keep row 0
df['postTextEmbed'] = [vec[0] for vec in map(doc_embedding, df['postText'])]

CPU times: user 2.29 s, sys: 0 ns, total: 2.29 s
Wall time: 2.28 s


In [None]:
%%time
# Embed every target paragraph text; keep row 0 of each (1, 100) result
df['targetParagraphsEmbed'] = [vec[0] for vec in map(doc_embedding, df['targetParagraphs'])]

In [None]:
%%time
# Embed every target title; keep row 0 of each (1, 100) result
df['targetTitleEmbed'] = [vec[0] for vec in map(doc_embedding, df['targetTitle'])]

In [None]:
%%time
# Embed every target description; keep row 0 of each (1, 100) result
df['targetDescriptionEmbed'] = [vec[0] for vec in map(doc_embedding, df['targetDescription'])]

In [None]:
%%time
# Embed every target keywords string; keep row 0 of each (1, 100) result
df['targetKeywordsEmbed'] = [vec[0] for vec in map(doc_embedding, df['targetKeywords'])]

In [None]:
def embed_cosine(col1,col2):
    """Row-wise cosine similarity between two columns of vectors.

    Args:
        col1: Positionally indexable (``.iloc``) column of vectors; its
            length decides how many rows are compared.
        col2: Column of vectors compared position-by-position with ``col1``.

    Returns:
        list with one similarity per row of ``col1``, computed as
        ``1 - scipy.spatial.distance.cosine(a, b)``.
    """
    return [
        1 - spatial.distance.cosine(col1.iloc[row], col2.iloc[row])
        for row in range(len(col1))
    ]

In [None]:
%%time
# Embed cosine similarities
# Each entry is (new similarity column, first embedding column, second
# embedding column); the columns are added to df in this order.
similarity_pairs = [
    ('postText_Paragraphs_Similarity', 'postTextEmbed', 'targetParagraphsEmbed'),
    ('postText_Title_Similarity', 'postTextEmbed', 'targetTitleEmbed'),
    ('postText_Description_Similarity', 'postTextEmbed', 'targetDescriptionEmbed'),
    ('postText_Keywords_Similarity', 'postTextEmbed', 'targetKeywordsEmbed'),
    ('Paragraph_Title_Similarity', 'targetParagraphsEmbed', 'targetTitleEmbed'),
    ('Paragraphs_Description_Similarity', 'targetParagraphsEmbed', 'targetDescriptionEmbed'),
    ('Paragraphs_Keywords_Similarity', 'targetParagraphsEmbed', 'targetKeywordsEmbed'),
    ('Title_Description_Similarity', 'targetTitleEmbed', 'targetDescriptionEmbed'),
    ('Title_Keywords_Similarity', 'targetTitleEmbed', 'targetKeywordsEmbed'),
    ('Description_Keywords_Similarity', 'targetDescriptionEmbed', 'targetKeywordsEmbed'),
]
for similarity_name, left_embed, right_embed in similarity_pairs:
    df[similarity_name] = embed_cosine(df[left_embed], df[right_embed])

In [None]:
# Feature matrix: the ten pairwise similarity scores, in this column order
similarity_features = [
    'postText_Paragraphs_Similarity',
    'postText_Title_Similarity',
    'postText_Description_Similarity',
    'postText_Keywords_Similarity',
    'Paragraph_Title_Similarity',
    'Paragraphs_Description_Similarity',
    'Paragraphs_Keywords_Similarity',
    'Title_Description_Similarity',
    'Title_Keywords_Similarity',
    'Description_Keywords_Similarity',
]
X = df[similarity_features]

# Target labels
y = df['truthClass']

In [None]:
%%time
# Split training/testing dataset (70/30, stratified on y)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
sss.get_n_splits(X, y)  # return value is not used
# All five shuffles are generated but only the final one is kept, which is
# what overwriting the same variables on every loop pass amounts to.
train_index, test_index = list(sss.split(X, y))[-1]
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [None]:
# Train a gradient-boosted binary classifier on the similarity features
xgbmodel = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=50,
    max_depth=5,
    learning_rate=0.1,
    random_state=1,
)
xgbmodel.fit(X_train, y_train)

In [None]:
# Evaluate the xgb model
# Predict labels for the held-out test rows; `ypred` is compared against
# `y_test` in the classification report printed in a later cell.
ypred = xgbmodel.predict(X_test)

In [None]:
# Print the scikit-learn classification report comparing the true test
# labels (y_test) with the model's predictions (ypred).
print(classification_report(y_test,ypred))

In [None]:
# Report the model's score on the training split, then on the test split
for label, features, target in (
    ("train score:", X_train, y_train),
    ("test score:", X_test, y_test),
):
    print(label, xgbmodel.score(features, target))