# Problem Statement 

# Fake News Detector:

***Build a tool that uses natural language processing and machine learning to identify and flag potentially fake news articles or misinformation.***

##  Importing Required Libraries

In [104]:
import pandas as pd 
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Data Gathering

In [105]:
# Loading the DataSet 
df = pd.read_csv('dataset.csv')

In [106]:
# Printing first 5 Rows of the Dataset
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## Data Analysis

In [107]:
# Describing the Data in the CSV File
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [108]:
# Getting the Shape of the Dataset (Rows,Columns)
df.shape

(20800, 5)

In [109]:
# Information about the Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [110]:
# Checking for Missing Values in the Dataset 
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

*The data above indicates the value of missing values present in each column of the dataset. For example , title has 558 missing  values , author has 1957 missing values*

In [111]:
# Replacing the missing values with Nan 
df = df.fillna(' ')

In [112]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [113]:
# Checking for duplicate values present in the Dataset
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
20795    False
20796    False
20797    False
20798    False
20799    False
Length: 20800, dtype: bool

In [114]:
# Names of the columns present in the dataset
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [115]:
# Dropping the columns which are not in use
df = df.drop(['id','author', 'text'] , axis = 1)

In [116]:
df.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


## Data Pre Processing

***Printing out the Stopwords for an overview***


In [117]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\my
[nltk_data]     pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [118]:
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [119]:
# Choosing a sample for demostration of the stemming process
sample = 'Hi my name IS SHASHank MIsalA'
sample = sample.split() # Splitting of the above string
sample

['Hi', 'my', 'name', 'IS', 'SHASHank', 'MIsalA']

In [120]:
sample = [data.lower() for data in sample] # Lowering the data present in the sample variable
sample

['hi', 'my', 'name', 'is', 'shashank', 'misala']

In [121]:
sample = [data for data in sample if data not in stop_words] # Removing the Stopwords from the variable sample
sample

['hi', 'name', 'shashank', 'misala']

### Stemming Process

In [122]:
ps = PorterStemmer()

In [123]:
# Defining a function named stem which stems the textual value in df['title']
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y) 

In [124]:
df['title'] = df['title'].apply(stem)

In [125]:
x = df['title']
y = df['label']

In [126]:
x.shape

(20800,)

In [127]:
y.shape

(20800,)

## Model Building & Model Evaluation

In [128]:
# Splitting the Data into train and test 
x_train , x_test , y_train, y_test = train_test_split(x, y, test_size=0.20)

### Vectorizing the Textual Data 

In [129]:
vect=TfidfVectorizer()
x_train=vect.fit_transform(x_train)
x_test=vect.transform(x_test)

In [130]:
# Using the Random Forest Classifier 
RFC = RandomForestClassifier()
RFC.fit(x_train, y_train)
prediction=RFC.predict(x_test)
print("Accuracy of Random Forest classifier: ", RFC.score(x_test, y_test))


Accuracy of Random Forest classifier:  0.9334134615384615


In [131]:
# Confusion Matrix of the Model
confusion_matrix(y_test, prediction)

array([[1833,  246],
       [  31, 2050]], dtype=int64)

In [132]:
# Naive Bayes Algorithm 
nb = MultinomialNB()
nb.fit(x_train, y_train)
nbpredictions = nb.predict(x_test)
print("Accuracy of Naive Bayes : ", nb.score(x_test, y_test))



Accuracy of Naive Bayes :  0.8923076923076924


In [133]:
confusion_matrix(y_test, nbpredictions)

array([[1985,   94],
       [ 354, 1727]], dtype=int64)

In [147]:
svm = SVC(kernel='linear')
svm.fit(x_train, y_train)
svmpredictions = svm.predict(x_test)
print("Accuracy of SVM : ", svm.score(x_test, y_test))


Accuracy of SVM :  0.942548076923077


In [148]:
confusion_matrix(y_test, svmpredictions)

array([[1879,  200],
       [  39, 2042]], dtype=int64)

In [149]:
lr= LogisticRegression()
lr.fit(x_train, y_train)
lrpredictions = lr.predict(x_test)
print("Accuracy of Logistic Regression : ", lr.score(x_test, y_test))


Accuracy of Logistic Regression :  0.9290865384615384


In [150]:
confusion_matrix(y_test, lrpredictions)

array([[1822,  257],
       [  38, 2043]], dtype=int64)

In [160]:
pickle.dump(vect, open('vector.pkl', 'wb'))
pickle.dump(svm, open('model.pkl', 'wb'))

In [161]:
vectorization=pickle.load(open('vector.pkl', 'rb'))
model_selec=pickle.load(open('model.pkl', 'rb'))
pickle.dump(df.to_dict(),open('data_dict.pkl','wb'))

### Defining the Function For The Prediction

In [162]:
def truth(news):
    news=stem(news)
    input_data=[news]
    vector=vectorization.transform(input_data)
    prediction = model_selec.predict(vector)
    return prediction

## Test 1

In [163]:
df['title'][3]

'15 civilian kill in singl us airstrik have been identifi'

In [164]:
val = truth('15 civilian kill in singl us airstrik have been identifi')

In [165]:
if val ==[0]:
    print('The News is Not Reliable')
else:
    print('The News is Reliable')

The News is Reliable


## Test 2

In [166]:
df['title'][5]

'jacki mason: hollywood would love trump if he bomb north korea over lack of tran bathroom (exclus video) - breitbart'

In [167]:
newss = truth('jacki mason: hollywood would love trump if he bomb north korea over lack of tran bathroom (exclus video) - breitbart')

In [168]:
if newss == [0]:
    print('The News is NOT Reliable')
else:
    print('The News is Reliable')

The News is NOT Reliable
