### Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# downloading the package
# Stop words are a set of commonly used words in any language.
# In NLP and text mining applications, stop words are used to eliminate unimportant words, allowing applications to focus on the important words instead.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Priya_Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Pre-processing

In [4]:
# loading the dataset using pandas dataframe
news_dataset=pd.read_csv('FakeNewsNet.csv')

In [5]:
# name of the columns
news_dataset.columns

Index(['title', 'news_url', 'source_domain', 'tweet_num', 'real'], dtype='object')

#### Dataset Description
##### title - it tells about the title of news 
##### news_url - url of the news
##### source_domain - source of the news article
##### tweet_num - no. of times the data is retweeted 
##### real - labels the data 1 means real and 0 means fake

In [6]:
# shape of the dataset
news_dataset.shape
# we have 23196 news articles and 5 featues

(23196, 5)

In [7]:
# types of the column
news_dataset.dtypes

title            object
news_url         object
source_domain    object
tweet_num         int64
real              int64
dtype: object

In [8]:
# printing the first 5 dataset values or dataframe
news_dataset.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [9]:
# renaming the column
news_dataset.rename(columns={'real':'label'},inplace=True)

In [10]:
news_dataset.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,label
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [11]:
#counting the missing values in each columns
news_dataset.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
label              0
dtype: int64

In [12]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna("")

In [13]:
# merging the author name and news title
news_dataset['content'] = news_dataset['title']+' '+news_dataset['source_domain']

In [14]:
print(news_dataset['content'])

0        Kandi Burruss Explodes Over Rape Accusation on...
1        People's Choice Awards 2018: The best red carp...
2        Sophia Bush Sends Sweet Birthday Message to 'O...
3        Colombian singer Maluma sparks rumours of inap...
4        Gossip Girl 10 Years Later: How Upper East Sid...
                               ...                        
23191    Pippa Middleton wedding: In case you missed it...
23192    Zayn Malik & Gigi Hadid’s Shocking Split: Why ...
23193    Jessica Chastain Recalls the Moment Her Mother...
23194    Tristan Thompson Feels "Dumped" After Khloé Ka...
23195    Kelly Clarkson Performs a Medley of Kendrick L...
Name: content, Length: 23196, dtype: object


In [15]:
# separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [16]:
print(X)
print(Y)

                                                   title  \
0      Kandi Burruss Explodes Over Rape Accusation on...   
1      People's Choice Awards 2018: The best red carp...   
2      Sophia Bush Sends Sweet Birthday Message to 'O...   
3      Colombian singer Maluma sparks rumours of inap...   
4      Gossip Girl 10 Years Later: How Upper East Sid...   
...                                                  ...   
23191  Pippa Middleton wedding: In case you missed it...   
23192  Zayn Malik & Gigi Hadid’s Shocking Split: Why ...   
23193  Jessica Chastain Recalls the Moment Her Mother...   
23194  Tristan Thompson Feels "Dumped" After Khloé Ka...   
23195  Kelly Clarkson Performs a Medley of Kendrick L...   

                                                news_url  \
0      http://toofab.com/2017/05/08/real-housewives-a...   
1      https://www.today.com/style/see-people-s-choic...   
2      https://www.etonline.com/news/220806_sophia_bu...   
3      https://www.dailymail.co.uk/news

### Stemming:
#### Stemming is the process of reducing a word to its Root word
#### example: actor, actress, acting --> act

In [17]:
port_stem = PorterStemmer()

In [18]:
# applying stemming to title column
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [19]:
#applying the stemming procedure
# news_dataset['title']=news_dataset['title'].apply(stemming)
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [20]:
print(news_dataset['content'])

0        kandi burruss explod rape accus real housew at...
1        peopl choic award best red carpet look www tod...
2        sophia bush send sweet birthday messag one tre...
3        colombian singer maluma spark rumour inappropr...
4        gossip girl year later upper east sider shock ...
                               ...                        
23191    pippa middleton wed case miss pippa marri lace...
23192    zayn malik gigi hadid shock split chanc reunit...
23193    jessica chastain recal moment mother boyfriend...
23194    tristan thompson feel dump khlo kardashian ref...
23195    kelli clarkson perform medley kendrick lamar h...
Name: content, Length: 23196, dtype: object


In [21]:
# separating the data and labels
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [22]:
print(X)
print(Y)

['kandi burruss explod rape accus real housew atlanta reunion video toofab com'
 'peopl choic award best red carpet look www today com'
 'sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva www etonlin com'
 ...
 'jessica chastain recal moment mother boyfriend slap kick genit www justjar com'
 'tristan thompson feel dump khlo kardashian refus let move la home exclus www intouchweekli com'
 'kelli clarkson perform medley kendrick lamar humbl hit billboard music award www billboard com']
[1 1 1 ... 1 0 1]


In [23]:
# converting textual data to numerical data
# tf stands for term frequency and idf stands for inverse document frequency
# It finds out the number of times a particular words repeating in the document
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [24]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 246417 stored elements and shape (23196, 14435)>
  Coords	Values
  (0, 59)	0.2597097788538541
  (0, 651)	0.30737565657585053
  (0, 1707)	0.38749399933436923
  (0, 2460)	0.0508218665886508
  (0, 4193)	0.35506164978733906
  (0, 5891)	0.26124620267491605
  (0, 6647)	0.38749399933436923
  (0, 10223)	0.302158398468634
  (0, 10276)	0.23128301802698967
  (0, 10569)	0.2543303205934995
  (0, 12980)	0.2946734912504806
  (0, 13681)	0.21108934405813046
  (1, 729)	0.3083771474680274
  (1, 1101)	0.33844294892026705
  (1, 1916)	0.3875096512688879
  (1, 2219)	0.42251489448694257
  (1, 2460)	0.08159478582438673
  (1, 7432)	0.33284454859410584
  (1, 9476)	0.23697414575567807
  (1, 10347)	0.37702288594702094
  (1, 12946)	0.36817044163275126
  (1, 14274)	0.10189043319535993
  (2, 1201)	0.19486226938688445
  (2, 1530)	0.3779018214766305
  (2, 1711)	0.3321987353251551
  :	:
  (23194, 4387)	0.2692478125672807
  (23194, 5800)	0.2632849856372537
  (

### Cross Validation  

In [25]:
x_train , x_test , y_train , y_test = train_test_split(X,Y,test_size=0.2,random_state=2,stratify=Y)

## Modeling the Data

In [26]:
# applying Logistic Regression
model = LogisticRegression()

In [27]:
# fitting the training data
model.fit(x_train,y_train)

### Evaluation and Accuracy score 

In [28]:
# accuracy score of training data
x_train_pred=model.predict(x_train)
train_data_accuracy=accuracy_score(x_train_pred,y_train)
print('Accuracy score of the training data : ', train_data_accuracy)

Accuracy score of the training data :  0.8827333477042466


In [29]:
# accuracy score of testing data
x_test_pred=model.predict(x_test)
test_data_accuracy=accuracy_score(x_test_pred,y_test)
print('Accuracy score of the testing data : ', test_data_accuracy)

Accuracy score of the testing data :  0.8525862068965517


### Making the Predictive system 

In [30]:
x_new=x_test[1]

prediction = model.predict(x_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Fake')
else:
  print('The news is Real')

[0]
The news is Fake


In [31]:
print(y_test[1])

0


In [32]:
# applying random forest classification 
from sklearn.ensemble import RandomForestClassifier

In [34]:
# making classifier and fitting the data
classifier = RandomForestClassifier(n_estimators=100, criterion='gini')
classifier.fit(x_train, y_train)

In [36]:
classifier.score(x_test, y_test)

0.85625

In [37]:
x_new=x_test[1]

prediction = classifier.predict(x_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Fake')
else:
  print('The news is Real')

[0]
The news is Fake


In [38]:
print(y_test[1])

0


In [39]:
# applying decision tree classification 
from sklearn.tree import DecisionTreeClassifier

In [43]:
# fitting the model
dtclassifier = DecisionTreeClassifier(criterion='gini')
dtclassifier.fit(x_train,y_train)

In [44]:
# checking the score
dtclassifier.score(x_test,y_test)

0.8051724137931034

In [45]:
# applying naive bayes algorithm
from sklearn.naive_bayes import GaussianNB

In [49]:
# fitting the model
nbclassifier = GaussianNB()
nbclassifier.fit(x_train.toarray(), y_train)

In [51]:
# checking the score
classifier.score(x_test, y_test)

0.85625

In [53]:
# creating a pickle file using serialization
import pickle
pickle_out = open("classifier.pkl","wb")
pickle.dump(classifier,pickle_out)
pickle_out.close()