In [59]:
from google.colab import drive
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, confusion_matrix

import pickle

# Data Preprocessing

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
news = "/content/drive/MyDrive/Colab Notebooks/News Data/News.csv"
df = pd.read_csv(news)
low_memory=False

Shuffle so the dataset isn't all Fake and then all Real articles

In [30]:
from sklearn.utils import shuffle
df = shuffle(df)
df = df.reset_index(drop=True)

Remove the unnamed column to clean up the dataframe

In [31]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

Check for any null values that would upset the processing

In [32]:
print (df.isnull().sum())
df.shape

text    0
ID      0
dtype: int64


(1000, 2)

In [33]:
train_df = df.drop("ID", axis = 1)
train_df.head()

Unnamed: 0,text
0,WASHINGTON (Reuters) - U.S. President Donald T...
1,WASHINGTON (Reuters) - Top White House officia...
2,"ISE-SHIMA, Japan (Reuters) - U.S. President Ba..."
3,If you ve ever watched the Hunger Games movies...
4,The draft version of the Democratic Party plat...


In [34]:
train_label = df.drop("text", axis = 1)
train_label.head()

Unnamed: 0,ID
0,0
1,0
2,0
3,1
4,1


# Data Cleaning

Replace all digits and punctuation

In [35]:
df.replace('\d+', '', regex=True)
df['text'] = df['text'].str.replace('[^\w\s]', '')
df.head()

  


Unnamed: 0,text,ID
0,WASHINGTON Reuters US President Donald Trump ...,0
1,WASHINGTON Reuters Top White House officials ...,0
2,ISESHIMA Japan Reuters US President Barack Ob...,0
3,If you ve ever watched the Hunger Games movies...,1
4,The draft version of the Democratic Party plat...,1


Make all the text lowercase

In [36]:
df['text'] = df['text'].apply(lambda x: x.lower())

# Remove Stop Words

In [39]:
nltk.download('stopwords')
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,ID
0,washington reuters us president donald trump r...,0
1,washington reuters top white house officials m...,0
2,iseshima japan reuters us president barack oba...,0
3,ever watched hunger games movies know effie tr...,1
4,draft version democratic party platform releas...,1


# **Tokenization**

In [19]:
import re
def tokenize(text):
    split = re.split("\W+",text) 
    return split
df['text_token']= df['text'].apply(lambda x: tokenize(x.lower()))
df.head()

Unnamed: 0,text,ID
0,"[abuja, reuters, nigeria, financial, crimes, a...",0
1,"[warsaw, reuters, european, commission, decisi...",0
2,"[editor, note, recognize, following, eloquent,...",1
3,"[nyc, mayor, bill, de, blasio, really, pushing...",1
4,"[washington, reuters, looming, tax, reform, in...",0


# New Dataframe from the cleaning

In [40]:
train_df.to_csv("cleaned_train_df.csv")
train_label.to_csv("cleaned_train_label.csv")

train = "/content/cleaned_train_df.csv"
train = pd.read_csv(train)

test = "/content/cleaned_train_label.csv"
test = pd.read_csv(test)

train.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,WASHINGTON (Reuters) - U.S. President Donald T...
1,1,WASHINGTON (Reuters) - Top White House officia...
2,2,"ISE-SHIMA, Japan (Reuters) - U.S. President Ba..."
3,3,If you ve ever watched the Hunger Games movies...
4,4,The draft version of the Democratic Party plat...


Train/Test split and converting each row to strings

In [42]:
x = train_df[:1000]
y = train_label[:1000]

data = x['text'].apply(lambda x: np.str_(x))
target = y['ID'].apply(lambda x: np.str_(x))
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3, random_state= 0)
X_train.shape

(700,)

# **Vectorization**

In [46]:
tfidf_v = TfidfVectorizer()
tfidf_X_train = tfidf_v.fit_transform(X_train)
tfidf_X_test = tfidf_v.transform(X_test)
tfidf_X_train.shape

(700, 19133)

In [47]:
tfidf_X_train.toarray()

array([[0.        , 0.0352985 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02183615, 0.        , ..., 0.        , 0.        ,
        0.        ]])

# Classification Model

Logistic Regression

In [48]:
logreg = LogisticRegression(class_weight = 'balanced')
logreg.fit(tfidf_X_train, Y_train)
Accuracy = logreg.score(tfidf_X_test, Y_test)
print(f'Accuracy: {round(Accuracy*100,2)}%')

Accuracy: 93.67%


Naive Bayes

In [49]:
NB = MultinomialNB()
NB.fit(tfidf_X_train, Y_train)
Accuracy2 = NB.score(tfidf_X_test, Y_test)
print(f'Accuracy: {round(Accuracy2*100,2)}%')

Accuracy: 91.67%


Decision Tree

In [50]:
clf = DecisionTreeClassifier()
clf.fit(tfidf_X_train, Y_train)
Accuracy3 = clf.score(tfidf_X_test, Y_test)
print(f'Accuracy: {round(Accuracy3*100,2)}%')

Accuracy: 98.33%


Passive Aggressive Classifier

In [51]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_X_train,Y_train)
y_pred=pac.predict(tfidf_X_test)
score=accuracy_score(Y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 96.33%


Decision Tree Confusion Matrix

In [58]:
clf = DecisionTreeClassifier()
clf.fit(tfidf_X_train, Y_train)
y_pred=clf.predict(tfidf_X_test)

print(classification_report(Y_test, y_pred))
print('\n')
print(confusion_matrix(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       145
           1       0.97      1.00      0.99       155

    accuracy                           0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300



[[141   4]
 [  0 155]]


# **Save The Model**

In [60]:
pickle.dump(clf, open('./fake_news_model.pkl', 'wb'))