In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [6]:
data_fake= pd.read_csv('Fake.csv')
data_true= pd.read_csv('True.csv')

In [7]:
data_fake['class']= 0
data_true['class']=1

In [10]:
data_fake.shape, data_true.shape  #dimensions of the dataset

((23481, 5), (21417, 5))

In [11]:
# removing the last 10 rows of data from each dataset for manual testing
data_fake_manual_testing= data_fake.tail(10)
for i in range(23480, 23470, -1):
    data_fake.drop([i], axis= 0, inplace=True)

data_true_manual_testing= data_true.tail(10)
for i in range(21416, 21406, -1):
    data_true.drop([i], axis= 0, inplace=True)

In [12]:
data_fake.shape, data_true.shape

((23471, 5), (21407, 5))

In [14]:
data_fake_manual_testing.head()

Unnamed: 0,title,text,subject,date,class
23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
23472,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
23473,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
23474,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0
23475,Hillary Clinton: ‘Israel First’ (and no peace ...,Robert Fantina CounterpunchAlthough the United...,Middle-east,"January 18, 2016",0


In [16]:
data_merge= pd.concat([data_fake, data_true], axis=0) #merging the real and fake datasets

In [17]:
data_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [17]:
data_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [17]:
data_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [18]:
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [19]:
# removing unnecessary columns
data= data_merge.drop(['title', 'subject', 'date'], axis=1)

In [20]:
# shuffling the dataframes
data= data.sample(frac=1)

In [21]:
data.head()

Unnamed: 0,text,class
8363,Republicans often claim Jesus as one of their ...,0
15391,HONG KONG (Reuters) - Some activists in Hong K...,1
4468,Monday night s debate between Donald Trump and...,0
11298,In light of Republican failure to pass the Ame...,0
16649,It s great to see one of the Justices question...,0


In [27]:
#reordering the index (rather creating new index)
data.reset_index(inplace= True)
#getting rid of the index column
data.drop(['index'], axis=1, inplace=True)

In [31]:
def wordopt(text):
    text= text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters with space
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newline characters
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing digits
    return text
    

In [32]:
data['text']= data['text'].apply(wordopt)

In [33]:
data.head()

Unnamed: 0,level_0,text,class
0,0,republicans often claim jesus as one of their ...,0
1,1,hong kong reuters some activists in hong k...,1
2,2,monday night s debate between donald trump and...,0
3,3,in light of republican failure to pass the ame...,0
4,4,it s great to see one of the justices question...,0


In [34]:
x= data['text'] # independent variable
y= data['class'] # dependent variable

In [35]:
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=0.25)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization= TfidfVectorizer()
xv_train= vectorization.fit_transform(x_train)
xv_test= vectorization.transform(x_test)


In [41]:
from sklearn.linear_model import LogisticRegression
# training the model here
LR= LogisticRegression()
LR.fit(xv_train, y_train)

In [42]:
pred_lr= LR.predict(xv_test)

In [43]:
LR.score(xv_test, y_test)

0.9876114081996435

In [45]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5900
           1       0.99      0.99      0.99      5320

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [46]:
from sklearn.tree import DecisionTreeClassifier
# training the model here
DT= DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [47]:
pred_dt= DT.predict(xv_test)

In [48]:
DT.score(xv_test, y_test)

0.9951871657754011

In [50]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5900
           1       1.00      0.99      0.99      5320

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [52]:
from sklearn.ensemble import GradientBoostingClassifier
# training the model here
GB= GradientBoostingClassifier(random_state=0)
GB.fit(xv_train, y_train)

In [53]:
pred_gb= GB.predict(xv_test)

In [54]:
GB.score(xv_test, y_test)

0.9950980392156863

In [55]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5900
           1       0.99      1.00      0.99      5320

    accuracy                           1.00     11220
   macro avg       0.99      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [56]:
from sklearn.ensemble import RandomForestClassifier
# training the model here
RF= RandomForestClassifier(random_state=0)
RF.fit(xv_train, y_train)

In [57]:
pred_rf= RF.predict(xv_test)

In [58]:
RF.score(xv_test, y_test)

0.9906417112299465

In [59]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5900
           1       0.99      0.99      0.99      5320

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [60]:
def output_label(n):
    if n==0:
        return "Fake News"
    elif n==1:
        return "Not a Fake News"

In [62]:
def manual_testing(news):
    testing_news= {"text": [news]}
    new_def_test= pd.DataFrame(testing_news)
    new_def_test["text"]= new_def_test["text"].apply(wordopt)
    new_x_test= new_def_test["text"]
    new_xv_test= vectorization.transform(new_x_test)
    pred_LR= LR.predict(new_xv_test)
    pred_DT= DT.predict(new_xv_test)
    pred_GB= GB.predict(new_xv_test)
    pred_RF= RF.predict(new_xv_test)
    return print("\n\n LR Prediction: {} \n DT Prediction: {} \n GBC Prediction: {} \n RFC Prediction: {}".format(output_label(pred_LR[0]),output_label(pred_DT[0]),output_label(pred_GB[0]),output_label(pred_RF[0])))

In [65]:
news= str(input())
manual_testing(news)

 Around 10 equity mutual funds turned Rs 20,000 monthly SIP investments to Rs 1 crore in 10 years, according to data by ACE MF. There were around 96 equity schemes in the market that have completed 10 years of presence.




 LR Prediction: Fake News 
 DT Prediction: Fake News 
 GBC Prediction: Fake News 
 RFC Prediction: Fake News
