In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

In [2]:
def parse_dataset():
    clickbait = []
    non_clickbait = []
    with open("data/clickbait_data", 'rt') as data_in:
        for line in data_in:
            if line.strip():
                clickbait.append(line.strip())
                
                
    with open("data/non_clickbait_data", 'rt') as data_in:
        for line in data_in:
            if line.strip():
                non_clickbait.append(line.strip())

    return clickbait, non_clickbait

In [3]:
clickbait, non_clickbait = parse_dataset()

In [28]:
def preprocess_titles(titles): 
    return list(map(lambda x: x.lower(), titles))
    
def create_dataframe(clickbait=clickbait, non_clickbait=non_clickbait):
    cb_df = pd.DataFrame({'clickbait': np.ones(len(clickbait)), 'title': preprocess_titles(clickbait)})
    n_cb_df = pd.DataFrame({'clickbait': np.zeros(len(non_clickbait)), 'title': preprocess_titles(non_clickbait)})
    return pd.concat([cb_df, n_cb_df], ignore_index=True)

In [30]:
titles = create_dataframe()

In [31]:
titles[:10]

Unnamed: 0,clickbait,title
0,1.0,should i get bings
1,1.0,which tv female friend group do you belong in
2,1.0,"the new ""star wars: the force awakens"" trailer..."
3,1.0,"this vine of new york on ""celebrity big brothe..."
4,1.0,a couple did a stunning photo shoot with their...
5,1.0,how to flirt with queer girls without making a...
6,1.0,32 cute things to distract from your awkward t...
7,1.0,if disney princesses were from florida
8,1.0,what's a quote or lyric that best describes yo...
9,1.0,natalie dormer and sam claflin play a game to ...


In [32]:
idf_tokenizer = TfidfVectorizer(max_features=30000, stop_words='english').fit(titles['title'])

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titles['title'], titles['clickbait'],
                                                    stratify=titles['clickbait'], 
                                                    test_size=0.25)

In [35]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix,classification_report

def show_scores(model, x, y):
    cv_score = cross_val_score(model, x, y, cv=5, n_jobs=-1)
    print('Cross val score', cv_score, cv_score.mean())
    predict = cross_val_predict(model, x, y, cv=5, n_jobs=-1)
    print(classification_report(y_pred =predict, y_true=y))
    print("Confusion Matrix:")
    print(confusion_matrix(y, predict))

In [37]:
show_scores(LogisticRegression(), idf_tokenizer.transform(X_train), y_train)

Cross val score [0.94771923 0.94458333 0.94895833 0.94395833 0.94748906] 0.9465416570764609
             precision    recall  f1-score   support

        0.0       0.92      0.97      0.95     12001
        1.0       0.97      0.92      0.95     11999

avg / total       0.95      0.95      0.95     24000

Confusion Matrix:
[[11679   322]
 [  961 11038]]


In [38]:
from sklearn.svm import LinearSVC
show_scores(LinearSVC(), idf_tokenizer.transform(X_train), y_train)

Cross val score [0.95667569 0.95       0.95604167 0.95375    0.95686601] 0.9546666745967162
             precision    recall  f1-score   support

        0.0       0.94      0.97      0.96     12001
        1.0       0.97      0.94      0.95     11999

avg / total       0.95      0.95      0.95     24000

Confusion Matrix:
[[11597   404]
 [  684 11315]]


In [45]:
svc = LinearSVC()

train_tokenized = idf_tokenizer.transform(X_train)
test_tokenized = idf_tokenizer.transform(X_test)

svc.fit(train_tokenized, y_train)

predict = svc.predict(test_tokenized)

In [46]:
print(classification_report(y_pred=predict, y_true=y_test))
print(confusion_matrix(y_test, predict))

             precision    recall  f1-score   support

        0.0       0.95      0.96      0.96      4000
        1.0       0.96      0.95      0.96      4000

avg / total       0.96      0.96      0.96      8000

[[3853  147]
 [ 202 3798]]
