In [37]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

DATA COLLECTION 

In [38]:
data_fake = pd.read_csv('Fake.csv')
data_true = pd.read_csv('True.csv')

In [39]:
data_fake["class"] = 0
data_true["class"] = 1

In [40]:
data_fake.shape, data_true.shape

((23481, 5), (21417, 5))

DATA PREPROCESSING 

In [41]:
data_merge = pd.concat([data_fake, data_true], axis=0)
data_merge.head(10)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",0


In [42]:
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [43]:
data = data_merge.drop(['title','subject','date'], axis = 1)

In [44]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [45]:
data = data.sample(frac = 1)

In [46]:
data.reset_index(inplace = True)
data.drop(['index'], axis = 1, inplace = True)

In [47]:
data.columns

Index(['text', 'class'], dtype='object')

In [48]:
data.head()

Unnamed: 0,text,class
0,MOSCOW/WASHINGTON (Reuters) - A Russian bank ...,1
1,Bill Maher made an appearance on The Tonight S...,0
2,MEXICO CITY (Reuters) - The foreign ministers ...,1
3,ANKARA (Reuters) - Turkish jets struck a targe...,1
4,OSLO (Reuters) - A survivor of the Hiroshima a...,1


DATA CLEANING 

In [49]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

def preprocessing(text):
    text = text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub('<.*?>+','',text)
    text = re.sub('[%s]' % re.escape(string.punctuation) ,'',text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('[%s]' % re.escape(string.punctuation) ,'',text)  #Removes all punctuation characters using string.punctuation
    words = word_tokenize(text)                 #Tokenize the words
    stop_words = set(stopwords.words('english'))    #stores all stopwords in english as a set stop_words
    words = [word for word in words if word not in stop_words]  #remove stopwords
    stemmer = PorterStemmer()               #porterstemmer initializes to stemmer 
    words = [stemmer.stem(word) for word in words]  #stemming process
    text = ' '.join(words)                  #Joins all the words after stopwords removal and stemmming as a text
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nandu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nandu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
data['text'] = data['text'].apply(preprocessing)
data.head(10)

Unnamed: 0,text,class
0,moscow washington reuter russian bank western ...,1
1,bill maher made appear tonight show jimmi fall...,0
2,mexico citi reuter foreign minist mexico hondu...,1
3,ankara reuter turkish jet struck target northe...,1
4,oslo reuter survivor hiroshima atom bomb leade...,1
5,trump bring churchil back white hous obama emb...,0
6,vilniu kiev reuter ukrainian presid petro poro...,1
7,donald trump complet disappoint inappropri rea...,0
8,ginni thoma one hard work patriot women americ...,0
9,berlin reuter german polic tuesday detain six ...,1


FEATURE SELECTION 

In [51]:
x = data['text']
y = data['class']

In [70]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25)
print(x_test.info())

<class 'pandas.core.series.Series'>
Index: 11225 entries, 2315 to 17177
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
11225 non-null  object
dtypes: object(1)
memory usage: 175.4+ KB
None


In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
print(x_test.head(10))
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)
print(xv_test)
from joblib import dump
dump(vectorization, 'f.joblib')

28424    spite senat john mccain promis constitu month ...
41959    crook hillari alway crook better anyon america...
9275     ankara reuter presid tayyip erdogan said wedne...
1405     great wonder see posit event push back obama p...
4407     cairo reuter egypt militari begun conduct air ...
6721     washington reuter white hous tuesday declin ru...
11087    washington reuter iraq top sunni muslim politi...
43395    good remind import prevent obama anoint anoth ...
16334    washington reuter senat bob corker republican ...
20617    reuter delawar governor sign law bill ensur ab...
Name: text, dtype: object
  (0, 69492)	0.05498535662602347
  (0, 69416)	0.050880346205020625
  (0, 69177)	0.03299115423856293
  (0, 68397)	0.07182282238549048
  (0, 68233)	0.07696139237144699
  (0, 67984)	0.03163512894241113
  (0, 67826)	0.0252378408010521
  (0, 67467)	0.03677590273701299
  (0, 67461)	0.08619763712093355
  (0, 66229)	0.058170280987005156
  (0, 65673)	0.024654648710071173
  (0, 64796)	0.0478

['f.joblib']

TRAINING AND TESTING OF THE MODEL 

Random Forest -

In [55]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)

In [56]:
from joblib import dump
dump(RF, 'RF.pk1')

['RF.pk1']

In [57]:
pred_rf = RF.predict(xv_test)
print(pred_rf)

[0 0 1 ... 1 1 0]


In [58]:
print(" Random Forest Accuracy = ",RF.score(xv_test, y_test))

 Random Forest Accuracy =  0.9882405345211581


In [59]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm_rf = confusion_matrix(y_test, pred_rf)
print("Confusion Matrix for Random Forest:")
print(cm_rf)

# Assuming cm_rf is the confusion matrix from your random forest predictions
tn, fp, fn, tp = cm_rf.ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Confusion Matrix for Random Forest:
[[5826   80]
 [  52 5267]]
Precision: 0.9850383392556574
Recall: 0.9902237262643354
F1 Score: 0.9876242265141572


In [60]:
from sklearn.ensemble import RandomForestClassifier

def predict_news_rf(input_text):
    input_text = preprocessing(input_text)
    input_vector = vectorization.transform([input_text])
    prediction = RF.predict(input_vector)
    return "Fake" if prediction[0] == 0 else "Real"

input_text = ""

prediction_rf = predict_news_rf(input_text)
print(f"The input text is predicted as: {prediction_rf}")

The input text is predicted as: Fake


LOGISTIC REGRESSION -

In [61]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train, y_train)


In [62]:
dump(LR, 'LR.pk1')

['LR.pk1']

In [63]:
pred_lr = LR.predict(xv_test)
print(xv_test)
print(pred_lr)

  (0, 69492)	0.05498535662602347
  (0, 69416)	0.050880346205020625
  (0, 69177)	0.03299115423856293
  (0, 68397)	0.07182282238549048
  (0, 68233)	0.07696139237144699
  (0, 67984)	0.03163512894241113
  (0, 67826)	0.0252378408010521
  (0, 67467)	0.03677590273701299
  (0, 67461)	0.08619763712093355
  (0, 66229)	0.058170280987005156
  (0, 65673)	0.024654648710071173
  (0, 64796)	0.04784992782285016
  (0, 64089)	0.018647760066533552
  (0, 63852)	0.056543013313439514
  (0, 63174)	0.08215583171393669
  (0, 62854)	0.021519488581824897
  (0, 62688)	0.14614273133807085
  (0, 62662)	0.04962087187718861
  (0, 61820)	0.04403935442435043
  (0, 59281)	0.01868990375264088
  (0, 59264)	0.032574859881669756
  (0, 58826)	0.07115640788508745
  (0, 58301)	0.0488356816154823
  (0, 57470)	0.04491721874816853
  (0, 57358)	0.08672867828537212
  :	:
  (11224, 10998)	0.03491748835396672
  (11224, 10526)	0.35944516124908227
  (11224, 9499)	0.03900015719978601
  (11224, 9496)	0.028243679281596344
  (11224, 9079)	0

In [64]:
print(" Logistic Regression Accuracy = ",LR.score(xv_test, y_test))

 Logistic Regression Accuracy =  0.985478841870824


In [65]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm_rf = confusion_matrix(y_test, pred_lr)
print("Confusion Matrix for Logistic Regression :")
print(cm_rf)

# Assuming cm_rf is the confusion matrix from your random forest predictions
tn, fp, fn, tp = cm_rf.ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Confusion Matrix for Logistic Regression :
[[5810   96]
 [  67 5252]]
Precision: 0.9820493642483171
Recall: 0.9874036473021245
F1 Score: 0.98471922752414


In [66]:
def predict_news(input_text):
    input_text = preprocessing(input_text)
    input_vector = vectorization.transform([input_text])
    prediction = LR.predict(input_vector)
    return "Fake" if prediction[0] == 0 else "Real"

input_text = " "
prediction = predict_news(input_text)
print(f"The input text is predicted as: {prediction}")


The input text is predicted as: Fake
