In [56]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('/content/drive/MyDrive/news_articles.csv')

In [3]:
df.head(5)

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [4]:
df.shape

(2096, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author                   2096 non-null   object 
 1   published                2096 non-null   object 
 2   title                    2096 non-null   object 
 3   text                     2050 non-null   object 
 4   language                 2095 non-null   object 
 5   site_url                 2095 non-null   object 
 6   main_img_url             2095 non-null   object 
 7   type                     2095 non-null   object 
 8   label                    2095 non-null   object 
 9   title_without_stopwords  2094 non-null   object 
 10  text_without_stopwords   2046 non-null   object 
 11  hasImage                 2095 non-null   float64
dtypes: float64(1), object(11)
memory usage: 196.6+ KB


In [6]:
df.isnull().sum()

author                      0
published                   0
title                       0
text                       46
language                    1
site_url                    1
main_img_url                1
type                        1
label                       1
title_without_stopwords     2
text_without_stopwords     50
hasImage                    1
dtype: int64

In [7]:
new_df = pd.DataFrame(df['text_without_stopwords'])

In [8]:
new_df['labels'] = df['label']

In [9]:
new_df = new_df.dropna()

In [10]:
new_df.head(5)

Unnamed: 0,text_without_stopwords,labels
0,print pay back money plus interest entire fami...,Real
1,attorney general loretta lynch plead fifth bar...,Real
2,red state fox news sunday reported morning ant...,Real
3,email kayla mueller prisoner tortured isis cha...,Real
4,email healthcare reform make america great sin...,Real


In [11]:
new_df['labels'] = new_df['labels'].map({'Real': 1, 'Fake': 0})

In [12]:
new_df.head(5)

Unnamed: 0,text_without_stopwords,labels
0,print pay back money plus interest entire fami...,1
1,attorney general loretta lynch plead fifth bar...,1
2,red state fox news sunday reported morning ant...,1
3,email kayla mueller prisoner tortured isis cha...,1
4,email healthcare reform make america great sin...,1


In [13]:
new_df.shape

(2046, 2)

In [14]:
new_df['text_without_stopwords'] = new_df['text_without_stopwords'].apply(lambda x: word_tokenize(str(x)))

In [15]:
new_df.head(5)

Unnamed: 0,text_without_stopwords,labels
0,"[print, pay, back, money, plus, interest, enti...",1
1,"[attorney, general, loretta, lynch, plead, fif...",1
2,"[red, state, fox, news, sunday, reported, morn...",1
3,"[email, kayla, mueller, prisoner, tortured, is...",1
4,"[email, healthcare, reform, make, america, gre...",1


In [16]:
Vectorizer = TfidfVectorizer()

In [17]:
X = Vectorizer.fit_transform(new_df['text_without_stopwords'].astype(str))
y = new_df['labels'].values

y

array([1, 1, 1, ..., 1, 1, 1])

In [18]:
X

<2046x46794 sparse matrix of type '<class 'numpy.float64'>'
	with 386269 stored elements in Compressed Sparse Row format>

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33, 
    shuffle=False
)


In [20]:
# Logistic Regression

clf = LogisticRegression()

In [21]:
clf.fit(X_train, y_train)

LogisticRegression()

In [22]:
y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.2958579881656805


In [47]:
# KNN Neighbors

klf = KNeighborsClassifier(n_neighbors = 5)

In [48]:
klf.fit(X_train, y_train)

KNeighborsClassifier()

In [49]:
yk_pred = klf.predict(X_test)

In [50]:
print(accuracy_score(y_test, yk_pred))

0.3343195266272189


In [52]:
# Random Forest

rlf = RandomForestClassifier()

In [53]:
rlf.fit(X_train, y_train)

RandomForestClassifier()

In [54]:
yr_pred = rlf.predict(X_test)

In [55]:
print(accuracy_score(y_test, yr_pred))

0.2958579881656805


In [68]:
# GridSearchCV
n_neighbors = [5,7,10,15]
weights = ['uniform','distance']
leaf_size = [5,10,30,50]
p = [1,2]
n_jobs = [-1]

params = dict(n_neighbors = n_neighbors,
              weights = weights,
              leaf_size = leaf_size,
              p = p,
              n_jobs = n_jobs)

glf = GridSearchCV(klf, params, cv=5, verbose=1, n_jobs=-1)

In [71]:
a = glf.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [75]:
a.best_params_

{'leaf_size': 5, 'n_jobs': -1, 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}

In [76]:
a.best_score_

0.8094890510948904