# Clickbait vs Non-Clickbait

Dataset contatining 16000 of each  Clickbait and Non-Clickbait headlines was first published in the following [paper](http://cse.iitkgp.ac.in/~abhijnan/papers/chakraborty_clickbait_asonam16.pdf)  I simply resolve to do the classification in a much easier way. 




###  Learning about the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
clickbait = pd.read_csv('dataset/clickbait_data', sep="\n", header=None, names=['title'])

In [3]:
clickbait['bait'] = 1

In [4]:
clickbait[:5]

Unnamed: 0,title,bait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [5]:
clickbait.shape

(15999, 2)

In [6]:
non_clickbait = pd.read_csv('dataset/non_clickbait_data', sep="\n", header=None, names=['title'] )

In [7]:
non_clickbait['bait'] = 0

In [8]:
non_clickbait[:5]

Unnamed: 0,title,bait
0,Bill Changing Credit Card Rules Is Sent to Oba...,0
1,"In Hollywood, the Easy-Money Generation Toughe...",0
2,1700 runners still unaccounted for in UK's Lak...,0
3,Yankees Pitchers Trade Fielding Drills for Put...,0
4,Large earthquake rattles Indonesia; Seventh in...,0


In [9]:
non_clickbait.shape

(16001, 2)

In [10]:
clickbait.shape, non_clickbait.shape

((15999, 2), (16001, 2))

In [11]:
dataset = pd.concat([clickbait,non_clickbait])

In [12]:
dataset.shape

(32000, 2)

In [13]:
dataset.head(), dataset.tail()

(                                               title  bait
 0                                 Should I Get Bings     1
 1      Which TV Female Friend Group Do You Belong In     1
 2  The New "Star Wars: The Force Awakens" Trailer...     1
 3  This Vine Of New York On "Celebrity Big Brothe...     1
 4  A Couple Did A Stunning Photo Shoot With Their...     1,
                                                    title  bait
 15996  To Make Female Hearts Flutter in Iraq, Throw a...     0
 15997  British Liberal Democrat Patsy Calton, 56, die...     0
 15998  Drone smartphone app to help heart attack vict...     0
 15999  Netanyahu Urges Pope Benedict, in Israel, to D...     0
 16000  Computer Makers Prepare to Stake Bigger Claim ...     0)

### Shuffling the dataset

In [14]:
from sklearn.utils import shuffle
dataset = shuffle(dataset, random_state=27).reset_index(drop=True)

In [15]:
dataset.head()

Unnamed: 0,title,bait
0,33 Healthy Things To Eat After You Work Out,1
1,"Calm returns to Salt, Jordan after riots over ...",0
2,"23 ""X-Files"" Gifts That Are Out Of This World",1
3,Which Iconic Britney Spears Song Are You Based...,1
4,Can You Identify The Valentine's Chocolates (W...,1


In [16]:
dataset.describe()

Unnamed: 0,bait
count,32000.0
mean,0.499969
std,0.500008
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [17]:
# checking for Nulls
dataset.isnull().sum()

title    0
bait     0
dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
X = dataset.title
y = dataset.bait
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pipe1 = Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('mnb', MultinomialNB())])

In [43]:
pipe1.fit(X, y)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [44]:
predicted = pipe1.predict(X_test)

In [45]:
print (np.mean(predicted == y_test))

0.980375


In [46]:
from sklearn.metrics import classification_report, confusion_matrix

In [47]:
print (classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.99      0.97      0.98      4032
          1       0.97      0.99      0.98      3968

avg / total       0.98      0.98      0.98      8000



In [25]:
confusion_matrix(y_test, predicted)

array([[3901,  131],
       [  26, 3942]])

In [26]:
clf.predict(["When you find out what these kids are jumping into, your jaw will drop"])[0]

1

In [27]:
clf.predict_proba(["When you find out what these kids are jumping into, your jaw will drop"])[0]

array([ 0.00213608,  0.99786392])

## PIPELINE 2 - using SGDClassifier

In [30]:
from sklearn.linear_model import SGDClassifier

In [48]:
pipe2 = Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('sgd', SGDClassifier())])
pipe2.fit(X, y)
predicted2 = pipe2.predict(X_test)
print (np.mean(predicted2 == y_test))

0.984875


In [49]:
print (classification_report(y_test, predicted2))

             precision    recall  f1-score   support

          0       0.98      0.99      0.99      4032
          1       0.99      0.98      0.98      3968

avg / total       0.98      0.98      0.98      8000



In [50]:
confusion_matrix(y_test, predicted2)

array([[3993,   39],
       [  82, 3886]])

## PIPELINE 3 - using randomforst

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
pipe3 = Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('rf', RandomForestClassifier())])
pipe3.fit(X, y)
predicted3 = pipe3.predict(X_test)
print (np.mean(predicted3 == y_test))

0.997375


In [36]:
print (classification_report(y_test, predicted3))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      4032
          1       1.00      1.00      1.00      3968

avg / total       1.00      1.00      1.00      8000



In [37]:
confusion_matrix(y_test, predicted3)

array([[4026,    6],
       [  15, 3953]])

In [38]:
pipe3.predict(['UP civic polls: 48 per cent turnout in phase-II, voting peaceful'])[0]

0

In [39]:
pipe3.predict_proba(['UP civic polls: 48 per cent turnout in phase-II, voting peaceful'])[0]

array([ 1.,  0.])

## serialising objects using joblib

In [51]:
from sklearn.externals import joblib

In [52]:
joblib.dump(pipe1, 'model_1.pkl') 

['model_1.pkl']

In [53]:
model_1 = joblib.load('model_1.pkl')

In [63]:
pred_score = model_1.predict_proba(['UP civic polls: 48 per cent turnout in phase-II, voting peaceful'])[0]

In [64]:
prob = {'Clickbate': pred_score[1], 'Not-Clickbate': pred_score[0]}

In [68]:
def scoring(headline):
    """
    scoring function - takes in a headline and returns a dictionary
    :param headline:
    :return:
    """
    pred_score = model_1.predict_proba([headline])[0]
    if ((pred_score[1]*100 > 40) & ((pred_score[1]*100 < 60))):
        tag = 'Maybe Baity'
        color = 'is-warning'
    elif (pred_score[1]*100 > 60):
        tag = 'Looks Safe'
        color = 'is-primary'
    else:
        tag = 'Sounds Baity'
        color = 'is-danger'

    prob = {'P-clickbate': pred_score[1], 'P-notClickbate': pred_score[0], 'tag': tag, 'color':color}
    return prob

In [70]:
a = scoring("UP civic polls: 48 per cent turnout in phase-II, voting peaceful")

In [71]:
a

{'P-clickbate': 0.080821024441829453,
 'P-notClickbate': 0.91917897555817052,
 'color': 'is-danger',
 'tag': 'Sounds Baity'}

# Reference 

Abhijnan Chakraborty, Bhargavi Paranjape, Sourya Kakarla, and Niloy Ganguly. "Stop Clickbait: Detecting and Preventing Clickbaits in Online News Media”. In Proceedings of the 2016 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM), San Fransisco, US, August 2016.