In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv("featurextacted.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clickbaits,bait,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj,polarity,subjectivity
0,0,25 Adorable Animals To Brighten Your Day,1,40.0,4.857143,0.285714,1.0,7.0,1.0,0.0,0.0,0.5,1.0
1,1,Myanmar Junta threatened with sanctions by UN ...,0,50.0,5.375,0.25,1.0,8.0,2.0,0.0,0.0,0.0,0.0
2,2,Fannie Mae Says It Needs Another $15 Billion,0,44.0,4.625,0.25,3.0,8.0,4.0,0.0,0.0,0.0,0.0
3,3,World's most-spammed man,0,24.0,7.333333,0.0,3.0,3.0,5.0,0.0,0.0,0.0,0.0
4,4,Yankees power surge blacks out Sox relief in 8...,0,52.0,4.3,0.2,5.0,10.0,6.0,0.0,0.0,0.8,0.4


In [4]:
df.drop(["Unnamed: 0"],axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,Clickbaits,bait,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj,polarity,subjectivity
0,25 Adorable Animals To Brighten Your Day,1,40.0,4.857143,0.285714,1.0,7.0,1.0,0.0,0.0,0.5,1.0
1,Myanmar Junta threatened with sanctions by UN ...,0,50.0,5.375,0.25,1.0,8.0,2.0,0.0,0.0,0.0,0.0
2,Fannie Mae Says It Needs Another $15 Billion,0,44.0,4.625,0.25,3.0,8.0,4.0,0.0,0.0,0.0,0.0
3,World's most-spammed man,0,24.0,7.333333,0.0,3.0,3.0,5.0,0.0,0.0,0.0,0.0
4,Yankees power surge blacks out Sox relief in 8...,0,52.0,4.3,0.2,5.0,10.0,6.0,0.0,0.0,0.8,0.4


In [6]:
#Splitting the Data into Train and Test with 0.25 as test sets
X_train,X_test,y_train,y_test=train_test_split(df.Clickbaits,df.bait,test_size=0.20,random_state=0)

In [7]:
#Using the Tfid Vectorizer because the size of the dataset is Large
vectorizer = TfidfVectorizer()
tfidf_train_x = vectorizer.fit_transform(X_train)

In [8]:
#Now using the Logistic Regression Algorithm
classifier = LogisticRegression()
classifier.fit(tfidf_train_x, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
#After transform the test data will be stored as a float in case of TFID
tfidf_test_x = vectorizer.transform(X_test)
print(tfidf_test_x.shape)
tfidf_test_x

(6400, 20715)


<6400x20715 sparse matrix of type '<class 'numpy.float64'>'
	with 54315 stored elements in Compressed Sparse Row format>

In [10]:
#Finding the Accuracy Score for the Algorithm
accuracy = cross_val_score(classifier, tfidf_test_x, y_test, cv=5)
acc = accuracy.mean()
print(acc * 100)

94.46863749497221




In [11]:
#Enter anything to check its validity
print("Enter the sentence or phrase you want to validate: \n\n")
inpPhrase = [input()]
output = classifier.predict(vectorizer.transform(inpPhrase))
print("---------------------------------------------------------")
if output == 1:
    print("CLICKBAIT")
else:
    print("NOT a CLICKBAIT")
print("---------------------------------------------------------")

Enter the sentence or phrase you want to validate: 


 lkjsa dlaksj lksjd ls
---------------------------------------------------------
NOT a CLICKBAIT
---------------------------------------------------------


In [12]:
df.head(10)

Unnamed: 0,Clickbaits,bait,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj,polarity,subjectivity
0,25 Adorable Animals To Brighten Your Day,1,40.0,4.857143,0.285714,1.0,7.0,1.0,0.0,0.0,0.5,1.0
1,Myanmar Junta threatened with sanctions by UN ...,0,50.0,5.375,0.25,1.0,8.0,2.0,0.0,0.0,0.0,0.0
2,Fannie Mae Says It Needs Another $15 Billion,0,44.0,4.625,0.25,3.0,8.0,4.0,0.0,0.0,0.0,0.0
3,World's most-spammed man,0,24.0,7.333333,0.0,3.0,3.0,5.0,0.0,0.0,0.0,0.0
4,Yankees power surge blacks out Sox relief in 8...,0,52.0,4.3,0.2,5.0,10.0,6.0,0.0,0.0,0.8,0.4
5,US congressmen sue Obama for military action i...,0,53.0,5.0,0.333333,5.0,9.0,7.0,0.0,0.0,0.0,0.1
6,"In Sri Lanka, Kouchner and Miliband Urge Truce",0,46.0,4.875,0.25,5.0,8.0,7.0,0.0,1.0,0.0,0.0
7,NCAA Football: Three Gopher players arrested f...,0,71.0,7.0,0.111111,6.0,9.0,9.0,0.0,1.0,0.0,0.1
8,A Girl Snapchatted Herself As Disney Princesse...,1,65.0,5.0,0.545455,6.0,11.0,10.0,1.0,2.0,1.0,1.0
9,Plan to Shift Military Spending Faces Skepticism,0,48.0,6.0,0.142857,6.0,7.0,13.0,1.0,2.0,-0.1,0.1


## Training Data

In [13]:
X = df.drop(["Clickbaits","bait"],axis=1)
y = df.bait

In [14]:
#Splitting the Data into Train and Test with 0.25 as test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

In [15]:
logmodel = LogisticRegression()

In [16]:
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
predicted = logmodel.predict(X_test)

In [18]:
classification_report(y_test,predicted)

'              precision    recall  f1-score   support\n\n           0       0.78      0.79      0.79      3157\n           1       0.79      0.79      0.79      3243\n\n    accuracy                           0.79      6400\n   macro avg       0.79      0.79      0.79      6400\nweighted avg       0.79      0.79      0.79      6400\n'

In [19]:
confusion_matrix(y_test,predicted)

array([[2498,  659],
       [ 695, 2548]])

In [20]:
accuracy_score(y_test,predicted)

0.7884375

In [21]:
# Save the trained model as a pickle string. 
saved_model = pickle.dumps(logmodel) 
  
# Load the pickled model 
logmodelPickled = pickle.loads(saved_model) 
  
# Use the loaded pickled model to make predictions 
logmodelPickled.predict(X_test) 
logmodelPickled.score(X_test,y_test)

0.7884375

In [22]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(logmodel, open(filename, 'wb'))

# some time later...

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
# classify = loaded_model.predict("get a chance to win $10,000 by clicking this link")
print(result)

0.7884375


In [23]:
X_test.head()

Unnamed: 0,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj,polarity,subjectivity
31330,39.0,4.714286,0.571429,12110.0,7.0,37934.0,10389.0,3592.0,0.0,0.0
3514,70.0,4.916667,0.416667,1342.0,12.0,4288.0,1188.0,416.0,-0.155556,0.288889
12363,45.0,4.75,0.375,4703.0,8.0,14996.0,4052.0,1389.0,0.0,0.0
25927,55.0,4.6,0.3,10000.0,10.0,31345.0,8562.0,2957.0,1.0,1.0
31886,62.0,5.3,0.3,12297.0,10.0,38638.0,10581.0,3654.0,0.136364,0.454545


In [24]:
df["Clickbaits"][9]

'Plan to Shift Military Spending Faces Skepticism'