In [83]:
import pandas as pd
import numpy as np
import re
import json_lines
import os
import gzip
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy import spatial

In [18]:
def load_data(path):
    instance_file = open(os.path.join(path, 'instances.jsonl'),'rb')
    truth_file = open(os.path.join(path, 'truth.jsonl'),'rb')
    j_instance = []
    j_truth = []
    for item in json_lines.reader(instance_file):
        j_instance.append(item)
    
    for item in json_lines.reader(truth_file):
        j_truth.append(item)
    
    df_instance = pd.DataFrame(j_instance)
    df_truth = pd.DataFrame(j_truth)
    instance_file.close()
    truth_file.close()
    return df_instance, df_truth

In [19]:
%%time
df_instance , df_truth = load_data('clickbait17-validation-170630/')

Wall time: 21.5 s


In [9]:
def preprocess(text, ps):
    filtered_words = [ps.stem(word) for word in words if word not in stopwords.words('english')]

In [140]:
df_instance = df_instance.drop('postMedia',1)
ps = PorterStemmer()

In [141]:
df_instance.head()

Unnamed: 0,postText,id,targetCaptions,targetParagraphs,targetTitle,postTimestamp,targetKeywords,targetDescription
0,[UK’s response to modern slavery leaving victi...,858462320779026433,[modern-slavery-rex.jpg],[Thousands of modern slavery victims have not ...,‘Inexcusable’ failures in UK’s response to mod...,Sat Apr 29 23:25:41 +0000 2017,"modern slavery, Department For Work And Pensio...",“Inexcusable” failures in the UK’s system for ...
1,[this is good],858421020331560960,"[In this July 1, 2010 file photo, Dr. Charmain...",[President Donald Trump has appointed the pro-...,Donald Trump Appoints Pro-Life Advocate as Ass...,Sat Apr 29 20:41:34 +0000 2017,"Americans United for Life, Dr. Charmaine Yoest...",President Donald Trump has appointed pro-life ...
2,"[The ""forgotten"" Trump roast: Relive his bruta...",858368123753435136,[President Trump will not attend this year's W...,[When the White House correspondents’ dinner i...,The ‘forgotten’ Trump roast: Relive his brutal...,Sat Apr 29 17:11:23 +0000 2017,"trump whcd, whcd, white house correspondents d...",President Trump won't be at this year's White ...
3,[Meet the happiest #dog in the world!],858323428260139008,"[Maru , Maru, Maru, Maru, Maru]",[Adorable is probably an understatement. This ...,"Meet The Happiest Dog In The World, Maru The H...",Sat Apr 29 14:13:46 +0000 2017,"Maru, husky, dogs, pandas, furball, instagram","The article is about Maru, a husky dog who has..."
4,[Tokyo's subway is shut down amid fears over a...,858283602626347008,[All nine lines of Tokyo's subway system were ...,[One of Tokyo's major subways systems says it ...,Tokyo's subway is shut down amid fears over an...,Sat Apr 29 11:35:31 +0000 2017,"Tokyo,subway,shut,fears,North,Korean,attack","The temporary suspension, which lasted ten min..."


In [228]:
df_truth[df_truth['id']=='858323428260139008']
df_truth.head()

Unnamed: 0,truthJudgments,truthMean,id,truthClass,truthMedian,truthMode
0,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,858464162594172928,clickbait,1.0,1.0
1,"[0.3333333333, 0.0, 0.3333333333, 0.0, 0.0]",0.133333,858462320779026433,no-clickbait,0.0,0.0
2,"[0.3333333333, 0.6666666666, 1.0, 0.0, 0.0]",0.4,858460992073863168,no-clickbait,0.333333,0.0
3,"[0.0, 0.6666666666, 0.0, 0.3333333333, 0.33333...",0.266667,858459539296980995,no-clickbait,0.333333,0.333333
4,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,858455355948384257,no-clickbait,0.0,0.0


In [25]:
def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r',encoding="utf8")
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel


In [26]:
embeddings = loadGloveModel(r'glove.twitter.27B/glove.twitter.27B.100d.txt')

Loading Glove Model
1193514  words loaded!


In [34]:
np.save('word_embeddings_dict.npy', embeddings) 

In [2]:
embedding_index = np.load('word_embeddings_dict.npy',allow_pickle='TRUE').item()


In [9]:
embedding_index['hello']

array([ 0.55793  ,  0.10748  , -0.57491  ,  0.4877   , -0.37792  ,
       -0.036457 ,  1.0581   ,  0.059584 , -0.19582  , -0.41366  ,
        0.054969 ,  0.10674  , -2.7076   , -0.50818  , -0.47456  ,
        0.32746  ,  0.41643  , -0.53607  , -0.24822  , -0.63456  ,
       -0.075781 , -1.1904   , -0.72504  ,  0.19499  ,  0.029645 ,
       -0.98157  ,  0.27081  ,  0.32472  ,  0.51154  , -0.86702  ,
       -0.36342  ,  0.14098  , -0.44251  ,  0.24804  ,  0.14021  ,
       -0.042186 ,  0.10408  ,  0.23267  ,  0.26663  ,  0.40316  ,
       -0.91011  ,  0.049339 ,  0.14842  ,  0.70496  , -0.013448 ,
        0.35591  , -0.23494  , -0.83828  ,  0.0069803,  0.44702  ,
       -0.27031  ,  0.0032742,  0.13265  , -0.68583  ,  0.90147  ,
        0.60725  , -0.1849   ,  0.086123 , -0.1693   , -0.48741  ,
        0.33445  , -0.10119  , -0.054273 , -0.35999  , -0.48967  ,
       -0.36699  , -0.91001  , -0.38762  ,  0.14981  ,  0.14092  ,
        0.6064   , -0.2507   ,  0.1582   , -0.33841  , -0.0256

In [172]:
df= df_instance.copy()


In [174]:
df['postText'] = [re.sub('[^a-zA-Z0-9 ]','',str(x)) for x in df['postText']]
df['postText'] = [re.sub('xa0',' ',str(x)) for x in df['postText']]


In [175]:
df['targetParagraphs'] = [re.sub('[^a-zA-Z0-9 ]','',str(x)) for x in df['targetParagraphs']]
df['targetParagraphs'] = [re.sub('xa0',' ',str(x)) for x in df['targetParagraphs']]

In [176]:
df['targetTitle'] = [re.sub('[^a-zA-Z0-9 ]',' ',str(x)) for x in df['targetTitle']]
df['targetTitle'] = [re.sub('xa0',' ',str(x)) for x in df['targetTitle']]
df.iloc[0]['targetTitle']

' Inexcusable  failures in UK s response to modern slavery leaving victims destitute while abusers go free  report warns'

In [178]:
df['targetDescription'] = [re.sub('[^a-zA-Z0-9 ]',' ',str(x)) for x in df['targetDescription']]
df['targetDescription'] = [re.sub('xa0',' ',str(x)) for x in df['targetDescription']]
df.iloc[0]['targetDescription']

' Inexcusable  failures in the UK s system for dealing with modern slavery are leaving victims reduced to destitution while their abusers go free because they are not adequately supported to testify against them  an alarming report has warned '

In [198]:
df['Day'] = [x[:3] for x in df['postTimestamp']]

In [207]:
df['Hour'] = [int(x[11:13]) for x in df['postTimestamp']]

In [94]:
df.to_csv('cleaned_clickbait.csv',index=False)

In [13]:
def doc_embedding(text):
    text=str(text)
    tokens = word_tokenize(text.lower())
    word_matrix = np.empty((1,100))
    for i in tokens:
        try:
            word_embed = np.array(embedding_index[i]).reshape((1,100))
            word_matrix = np.append(word_matrix,word_embed,axis=0)
            #print(i)
        except:
            pass
    sentence_embed = np.mean(word_matrix,axis=0).reshape(1,100)
    return sentence_embed
    

In [11]:
#df=pd.read_csv('cleaned_clickbait.csv')ClickbaitCombined
df=pd.read_csv('ClickbaitCombined.csv')

In [149]:
df.columns

Index(['postText', 'id', 'targetParagraphs', 'targetTitle', 'postTimestamp',
       'targetKeywords', 'targetDescription', 'Day', 'Hour', 'postTextEmbed',
       'targetParagraphsEmbed', 'TimeClass', 'targetTitleEmbed',
       'truthJudgments', 'truthMean', 'truthClass', 'truthMedian',
       'truthMode'],
      dtype='object')

In [51]:
df.head()

Unnamed: 0,postText,id,targetParagraphs,targetTitle,postTimestamp,targetKeywords,targetDescription,Day,Hour,postTextEmbed,targetParagraphsEmbed,TimeClass
0,UKs response to modern slavery leaving victims...,858462320779026433,Thousands of modern slavery victims have not c...,Inexcusable failures in UK s response to mod...,Sat Apr 29 23:25:41 +0000 2017,"modern slavery, Department For Work And Pensio...",Inexcusable failures in the UK s system for ...,Sat,23,[[ 8.58502226e-02 4.12963883e-02 -1.34337450e...,[[ 2.01461506e-01 1.68251195e-01 8.60023832e...,Night
1,this is good,858421020331560960,President Donald Trump has appointed the proli...,Donald Trump Appoints Pro Life Advocate as Ass...,Sat Apr 29 20:41:34 +0000 2017,"Americans United for Life, Dr. Charmaine Yoest...",President Donald Trump has appointed pro life ...,Sat,20,[[-3.89420000e-02 -1.41010000e-02 -6.18507500e...,[[ 2.08973929e-01 1.42481420e-01 8.55740227e...,Night
2,The forgotten Trump roast Relive his brutal 20...,858368123753435136,When the White House correspondents dinner is ...,The forgotten Trump roast Relive his brutal...,Sat Apr 29 17:11:23 +0000 2017,"trump whcd, whcd, white house correspondents d...",President Trump won t be at this year s White ...,Sat,17,[[-1.67562667e-02 2.03187467e-01 1.98795933e...,[[ 1.41164871e-01 1.61774987e-01 1.61848726e...,Evening
3,Meet the happiest dog in the world,858323428260139008,Adorable is probably an understatement This ad...,Meet The Happiest Dog In The World Maru The H...,Sat Apr 29 14:13:46 +0000 2017,"Maru, husky, dogs, pandas, furball, instagram",The article is about Maru a husky dog who has...,Sat,14,[[-0.0864585 0.08173725 0.30983925 0.04832...,[[ 2.07431418e-01 8.05960102e-02 1.64349608e...,Afternoon
4,Tokyos subway is shut down amid fears over an ...,858283602626347008,One of Tokyos major subways systems says it sh...,Tokyo s subway is shut down amid fears over an...,Sat Apr 29 11:35:31 +0000 2017,"Tokyo,subway,shut,fears,North,Korean,attack",The temporary suspension which lasted ten min...,Sat,11,[[ 1.78068687e-01 1.80688687e-01 -2.93053125e...,[[ 2.79123260e-01 2.09590337e-01 6.90849727e...,Morning


In [78]:
%%time
df['postTextEmbed'] = [doc_embedding(text)[0] for text in df['postText']]

Wall time: 6.93 s


In [107]:
%%time
df['targetParagraphsEmbed'] = [doc_embedding(text)[0] for text in df['targetParagraphs']]

Wall time: 53min 49s


In [110]:
%%time
df['targetTitleEmbed'] = [doc_embedding(text)[0] for text in df['targetTitle']]

Wall time: 6.72 s


In [14]:
%%time
df['targetDescriptionEmbed'] = [doc_embedding(text)[0] for text in df['targetDescription']]

Wall time: 1min 5s


In [53]:
#df.to_csv('cleaned_clickbait.csv',index=False)
#df.to_csv('ClickbaitCombined.csv',index=False)

In [42]:
def time_class(time):
    if time >= 4 and time <12:
        return 'Morning'
    elif time >= 12 and time <17:
        return 'Afternoon'
    elif time >= 17 and time <20:
        return 'Evening'
    else:
        return 'Night'

In [46]:
%%time
df['TimeClass'] = [time_class(hour) for hour in df['Hour']]

Wall time: 79 ms


In [91]:
np.save('postTextEmbed.npy', df['postTextEmbed']) 

In [109]:
np.save('targetParagraphsEmbed.npy', df['targetParagraphsEmbed']) 

In [111]:
np.save('targetTitleEmbed.npy', df['targetTitleEmbed']) 

In [15]:
np.save('targetDescriptionEmbed.npy', df['targetDescriptionEmbed']) 

In [28]:
df['postTextEmbed'] = np.load('postTextEmbed.npy',allow_pickle='TRUE')

In [29]:
df['targetParagraphsEmbed'] = np.load('targetParagraphsEmbed.npy',allow_pickle='TRUE')

In [30]:
df['targetTitleEmbed'] = np.load('targetTitleEmbed.npy',allow_pickle='TRUE')

In [31]:
df['targetDescriptionEmbed'] = np.load('targetDescriptionEmbed.npy',allow_pickle='TRUE')

In [119]:
df_truth.to_csv('df_truth',index=False)

In [122]:
df1 =df.copy()

In [142]:
df2 = pd.merge(df1,df_true,left_on='id',right_on='id')

In [144]:
df2.to_csv('ClickbaitCombined.csv',index=False)

In [134]:
df.isnull().sum()

postText                   70
id                          0
targetParagraphs            0
targetTitle                 0
postTimestamp               0
targetKeywords           7229
targetDescription         943
Day                         0
Hour                        0
postTextEmbed               0
targetParagraphsEmbed       0
TimeClass                   0
targetTitleEmbed            0
dtype: int64

In [26]:
nan_rows = df[df['targetDescription'].isnull()]
nan_rows.head()

Unnamed: 0,postText,id,targetParagraphs,targetTitle,postTimestamp,targetKeywords,targetDescription,Day,Hour,postTextEmbed,targetParagraphsEmbed,TimeClass,targetTitleEmbed,truthJudgments,truthMean,truthClass,truthMedian,truthMode,targetDescriptionEmbed
23,This Beyonc menu meme is the funniest thing yo...,857647139127558144,Log in or sign up to create your own posts I a...,This Beyonc Menu Meme Is The Funniest Thing Y...,Thu Apr 27 17:26:26 +0000 2017,,,Thu,17,[ 9.58535385e-02 2.42280015e-01 1.83646923e-...,[ 8.73885562e-02 1.67657784e-01 1.28278459e-...,Evening,[ 1.21077857e-01 1.68097871e-01 9.45985714e-...,"[0.0, 1.0, 0.0, 0.0, 0.3333333333]",0.266667,no-clickbait,0.0,0.0,"[0.34157350000000003, -0.27102350000000003, -0..."
25,14 strangely satisfying videos of melting cheese,857587134491893761,Log in or sign up to create your own posts Thi...,14 Strangely Satisfying Videos Of Melting Cheese,Thu Apr 27 13:28:00 +0000 2017,,,Thu,13,[ 4.73442857e-02 -1.34551429e-02 -6.32954000e-...,[ 7.36610571e-02 1.39061914e-01 -1.50948000e-...,Afternoon,[ 4.73442857e-02 -1.34551429e-02 -6.32954000e-...,"[1.0, 1.0, 0.3333333333, 0.6666666666, 1.0]",0.8,clickbait,1.0,1.0,"[0.028444999999999998, -0.41862, -0.764495, 0...."
40,Cunards Queen Victoria cruise ship is getting ...,857251677904228352,26 Apr 2017 There are three Cunard ocean liner...,Cruises Cunard s Queen Victoria cruise ship is...,Wed Apr 26 15:15:01 +0000 2017,"cruising,cunard_cruises,cunard_cruise,travel,c...",,Wed,15,[-9.69809357e-02 2.27808214e-02 2.31903000e-...,[-8.45518667e-02 4.74529511e-02 1.22065178e-...,Afternoon,[-1.16170728e-01 2.86223056e-02 1.54530667e-...,"[0.0, 0.0, 0.6666666666, 0.0, 0.3333333333]",0.2,no-clickbait,0.0,0.0,"[0.2315, -0.1832185, -0.31141500000000005, 0.5..."
76,16 bad vagina habits you should ditch ASAP,856017796370747392,Log in or sign up to create your own posts Are...,16 Bad Vagina Habits You Should Ditch ASAP,Sun Apr 23 05:32:01 +0000 2017,,,Sun,5,[ 3.89128750e-02 -4.22000000e-02 5.82421250e-...,[ 9.28142694e-02 1.06846034e-02 1.05367077e-...,Morning,[ 1.06649125e-01 -1.81672500e-01 6.04633750e-...,"[0.0, 0.6666666666, 0.6666666666, 1.0, 1.0]",0.666667,clickbait,0.666667,0.666667,"[0.22552500000000003, -0.14725500000000002, -0..."
91,Heres why you really shouldnt try the new Star...,855496610448646146,Log in or sign up to create your own posts Is ...,Here s Why You Really Shouldn t Try The New St...,Fri Apr 21 19:01:00 +0000 2017,,,Fri,19,[-8.03022500e-02 1.92881866e-01 1.92693500e-...,[ 2.48145000e-02 1.17317718e-01 7.35730357e-...,Evening,[-1.04439714e-01 2.48446599e-01 3.34500000e-...,"[0.3333333333, 1.0, 1.0, 1.0, 1.0]",0.866667,clickbait,1.0,1.0,"[0.16346000000000002, -0.063605, -0.2032000000..."


In [23]:
df_instance.iloc[23]

postMedia                         [media/photo_857647119347310592.jpg]
postText             [This Beyoncé menu meme is the funniest thing ...
id                                                  857647139127558144
targetCaptions       [Julie Gerstein, Instagram, Although, come on,...
targetParagraphs     [Log in or sign up to create your own posts., ...
targetTitle          This Beyoncé Menu Meme Is The Funniest Thing Y...
postTimestamp                           Thu Apr 27 17:26:26 +0000 2017
targetKeywords                                                        
targetDescription                                                     
Name: 23, dtype: object

In [61]:
def embed_cosine(col1,col2):
    cos_similaritycol = []
    for i in range(len(col1)):
        #print(i)
        val1 = col1.iloc[i]
        val2 = col2.iloc[i]
        similarity = 1 - spatial.distance.cosine(val1, val2)
        cos_similaritycol.append(similarity)
    return cos_similaritycol
    

In [64]:
%%time
df['postText_Paragraph_Similarity'] = embed_cosine(df['postTextEmbed'],df['targetParagraphsEmbed'])
df['postText_Title_Similarity'] = embed_cosine(df['postTextEmbed'],df['targetTitleEmbed'])
df['postText_Description_Similarity'] = embed_cosine(df['postTextEmbed'],df['targetDescriptionEmbed'])
df['Paragraph_Title_Similarity'] = embed_cosine(df['targetParagraphsEmbed'],df['targetTitleEmbed'])
df['Paragraph_Description_Similarity'] = embed_cosine(df['targetParagraphsEmbed'],df['targetDescriptionEmbed'])
df['Title_Description_Similarity'] = embed_cosine(df['targetTitleEmbed'],df['targetDescriptionEmbed'])

Wall time: 7.75 s


In [66]:
df.to_csv('ClickbaitCombined.csv',index=False)

In [177]:
def stacked_embeds(col):
    stacked_embeds = []
    for i in col:
        stacked_embeds.append(i)
    stacked_embeds = np.asarray(stacked_embeds)
    return stacked_embeds

In [188]:
title_stacked_embeds = stacked_embeds(df['targetTitleEmbed'])
title_stacked_embeds.shape


(19538, 100)

In [189]:
np.save('Title Stacked Embeds.npy', title_stacked_embeds) 

In [3]:
title_stacked_embeds = np.load('Title Stacked Embeds.npy',allow_pickle='TRUE')

In [6]:
title_stacked_embeds.shape

(19538, 100)

In [190]:
paragraph_stacked_embeds = stacked_embeds(df['targetParagraphsEmbed'])
paragraph_stacked_embeds.shape

(19538, 100)

In [191]:
np.save('Paragraph Stacked Embeds.npy', paragraph_stacked_embeds) 

In [67]:
df.columns

Index(['postText', 'id', 'targetParagraphs', 'targetTitle', 'postTimestamp',
       'targetKeywords', 'targetDescription', 'Day', 'Hour', 'postTextEmbed',
       'targetParagraphsEmbed', 'TimeClass', 'targetTitleEmbed',
       'truthJudgments', 'truthMean', 'truthClass', 'truthMedian', 'truthMode',
       'targetDescriptionEmbed', 'postText_Paragraph_Similarity',
       'postText_Title_Similarity', 'postText_Description_Similarity',
       'Paragraph_Title_Similarity', 'Paragraph_Description_Similarity',
       'Title_Description_Similarity'],
      dtype='object')

In [8]:
paragraph_stacked_embeds = np.load('Paragraph Stacked Embeds.npy',allow_pickle='TRUE')

In [84]:
#X = np.append(title_stacked_embeds,paragraph_stacked_embeds,axis=1)
X = df[['postText_Paragraph_Similarity',
'postText_Title_Similarity',
'postText_Description_Similarity',
'Paragraph_Title_Similarity',
'Paragraph_Description_Similarity',
'Title_Description_Similarity']]

In [3]:
df=pd.read_csv('ClickbaitCombined.csv')

In [85]:
y = df['truthClass']

In [93]:
%%time
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

Wall time: 329 ms


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [341]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameters to tune
parameters = { 
    'kernel' : ('linear','poly','rbf','sigmoid'),
    'C': [0.01,0.1,1,10]
}
# Tune yyperparameters  using Grid Search and a SVM model
#model = GridSearchCV(SVC(), param_grid = parameters, cv=10, n_jobs=-1,scoring='accuracy').fit(X_train, y_train)
model = SVC(kernel='poly',C=0.1).fit(X_train, y_train)

Wall time: 3min 1s


In [345]:
y_pred = model.predict(X_test)

In [346]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   clickbait       0.67      0.18      0.29      1428
no-clickbait       0.79      0.97      0.87      4434

    accuracy                           0.78      5862
   macro avg       0.73      0.58      0.58      5862
weighted avg       0.76      0.78      0.73      5862



In [347]:
print("train score:", model.score(X_train, y_train))
print("test score:", model.score(X_test, y_test))

train score: 0.7812225797016672
test score: 0.7789150460593655


In [None]:
model.best_params_
# C=10 and kernel ='poly'

In [302]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
clf = RandomForestClassifier(n_estimators=400,max_depth=15)
clf.fit(X_train, y_train)

Wall time: 17.1 s


RandomForestClassifier(max_depth=15, n_estimators=400)

In [303]:
y_pred = clf.predict(X_test)

In [304]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   clickbait       0.64      0.22      0.33      1428
no-clickbait       0.79      0.96      0.87      4434

    accuracy                           0.78      5862
   macro avg       0.72      0.59      0.60      5862
weighted avg       0.76      0.78      0.74      5862



In [305]:
print("train score:", clf.score(X_train, y_train))
print("test score:", clf.score(X_test, y_test))

train score: 0.8661158233401579
test score: 0.7802797679972705


In [298]:
%%time
from sklearn.linear_model import LogisticRegression

parameters ={'C':[0.01,0.1,1,10.0]}
#lr = GridSearchCV(LogisticRegression(max_iter=10000), param_grid = parameters, cv=10, n_jobs=-1,scoring='accuracy').fit(X_train, y_train)
lr = LogisticRegression(C=10,max_iter=20000).fit(X_train, y_train)

Wall time: 264 ms


In [95]:
lr.best_params_

{'C': 10.0}

In [299]:
y_pred = lr.predict(X_test)

In [300]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   clickbait       0.64      0.16      0.25      1428
no-clickbait       0.78      0.97      0.87      4434

    accuracy                           0.77      5862
   macro avg       0.71      0.56      0.56      5862
weighted avg       0.75      0.77      0.72      5862



In [301]:
print("train score:", lr.score(X_train, y_train))
print("test score:", lr.score(X_test, y_test))

train score: 0.7775665399239544
test score: 0.7732855680655066


In [310]:
%%time
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(X_train, y_train)

Wall time: 3.2 s


AdaBoostClassifier(n_estimators=100, random_state=0)

In [311]:
y_pred = ada.predict(X_test)

In [312]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   clickbait       0.61      0.20      0.30      1428
no-clickbait       0.79      0.96      0.87      4434

    accuracy                           0.77      5862
   macro avg       0.70      0.58      0.58      5862
weighted avg       0.74      0.77      0.73      5862



In [313]:
print("train score:", ada.score(X_train, y_train))
print("test score:", ada.score(X_test, y_test))

train score: 0.7838549283416204
test score: 0.7736267485499829


In [326]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0,
      max_depth=5, random_state=0).fit(X_train, y_train)


Wall time: 3.03 s


In [327]:
y_predg = gb.predict(X_test)

In [328]:
print(classification_report(y_test,y_predg))

              precision    recall  f1-score   support

   clickbait       0.42      0.27      0.33      1428
no-clickbait       0.79      0.88      0.83      4434

    accuracy                           0.73      5862
   macro avg       0.61      0.57      0.58      5862
weighted avg       0.70      0.73      0.71      5862



In [329]:
print("train score:", gb.score(X_train, y_train))
print("test score:", gb.score(X_test, y_test))

train score: 0.8746709564200058
test score: 0.7326850904128284


In [271]:
import xgboost as xgb
xgbmodel = xgb.XGBClassifier( max_depth=5,learning_rate=0.1,n_estimators=50,random_state=1,objective='binary:logistic')
xgbmodel.fit(X_train,y_train)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=4, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [272]:
ypred = xgbmodel.predict(X_test)

In [273]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

   clickbait       0.67      0.23      0.35      1428
no-clickbait       0.80      0.96      0.87      4434

    accuracy                           0.78      5862
   macro avg       0.73      0.60      0.61      5862
weighted avg       0.76      0.78      0.74      5862



In [274]:
print("train score:", xgbmodel.score(X_train, y_train))
print("test score:", xgbmodel.score(X_test, y_test))

train score: 0.7999415033635566
test score: 0.7848857045377005
