In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [8]:
df = pd.read_csv("featurextacted.csv")

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clickbaits,bait,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj
0,0,Declassified records show American inaction du...,0,77.0,6.8,0.2,0.0,10.0,1.0,0.0,0.0
1,1,Indian Maoists blamed for the deaths of sixtee...,0,57.0,5.444444,0.333333,1.0,9.0,2.0,0.0,0.0
2,2,Yahoo!7 creates joint venture with Xtra,0,39.0,5.666667,0.166667,1.0,6.0,3.0,0.0,0.0
3,3,Only Solve This Puzzle If You Love Board Games,1,46.0,4.222222,0.444444,1.0,9.0,5.0,0.0,0.0
4,4,Three dead in murder-suicide shooting at South...,0,81.0,6.454545,0.272727,2.0,11.0,5.0,0.0,0.0


In [10]:
df.drop(["Unnamed: 0"],axis=1,inplace=True)

In [11]:
df.head()

Unnamed: 0,Clickbaits,bait,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj
0,Declassified records show American inaction du...,0,77.0,6.8,0.2,0.0,10.0,1.0,0.0,0.0
1,Indian Maoists blamed for the deaths of sixtee...,0,57.0,5.444444,0.333333,1.0,9.0,2.0,0.0,0.0
2,Yahoo!7 creates joint venture with Xtra,0,39.0,5.666667,0.166667,1.0,6.0,3.0,0.0,0.0
3,Only Solve This Puzzle If You Love Board Games,1,46.0,4.222222,0.444444,1.0,9.0,5.0,0.0,0.0
4,Three dead in murder-suicide shooting at South...,0,81.0,6.454545,0.272727,2.0,11.0,5.0,0.0,0.0


In [34]:
#Splitting the Data into Train and Test with 0.25 as test sets
X_train,X_test,y_train,y_test=train_test_split(df.Clickbaits,df.bait,test_size=0.20,random_state=0)

In [35]:
#Using the Tfid Vectorizer because the size of the dataset is Large
vectorizer = TfidfVectorizer()
tfidf_train_x = vectorizer.fit_transform(X_train)

In [36]:
#Now using the Logistic Regression Algorithm
classifier = LogisticRegression()
classifier.fit(tfidf_train_x, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
#After transform the test data will be stored as a float in case of TFID
tfidf_test_x = vectorizer.transform(X_test)
print(tfidf_test_x.shape)
tfidf_test_x

(6400, 20649)


<6400x20649 sparse matrix of type '<class 'numpy.float64'>'
	with 54130 stored elements in Compressed Sparse Row format>

In [38]:
#Finding the Accuracy Score for the Algorithm
accuracy = cross_val_score(classifier, tfidf_test_x, y_test, cv=5)
acc = accuracy.mean()
print(acc * 100)

95.06263312347907




In [41]:
#Enter anything to check its validity
print("Enter the sentence or phrase you want to validate: \n\n")
inpPhrase = [input()]
output = classifier.predict(vectorizer.transform(inpPhrase))
print("---------------------------------------------------------")
if output == 1:
    print("CLICKBAIT")
else:
    print("NOT a CLICKBAIT")
print("---------------------------------------------------------")

Enter the sentence or phrase you want to validate: 



---------------------------------------------------------
NOT a CLICKBAIT
---------------------------------------------------------


In [48]:
df.head(3)

Unnamed: 0,Clickbaits,bait,Length,AvgWordLength,StoptoContent,Cardinality,WordCount,Verb,Auxiliary,CoorConj
0,Declassified records show American inaction du...,0,77.0,6.8,0.2,0.0,10.0,1.0,0.0,0.0
1,Indian Maoists blamed for the deaths of sixtee...,0,57.0,5.444444,0.333333,1.0,9.0,2.0,0.0,0.0
2,Yahoo!7 creates joint venture with Xtra,0,39.0,5.666667,0.166667,1.0,6.0,3.0,0.0,0.0


## Training Data

In [50]:
X = df.drop(["Clickbaits","bait"],axis=1)
y = df.bait

In [51]:
#Splitting the Data into Train and Test with 0.25 as test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

In [52]:
logmodel = LogisticRegression()

In [53]:
logmodel.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
predicted = logmodel.predict(X_test)

In [56]:
classification_report(y_test,predicted)

'              precision    recall  f1-score   support\n\n           0       0.75      0.77      0.76      3179\n           1       0.77      0.75      0.76      3221\n\n    accuracy                           0.76      6400\n   macro avg       0.76      0.76      0.76      6400\nweighted avg       0.76      0.76      0.76      6400\n'

In [59]:
confusion_matrix(y_test,predicted)

array([[2445,  734],
       [ 810, 2411]])

In [62]:
accuracy_score(y_test,predicted)

0.75875