## Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# One-time setup: fetch the NLTK 'stopwords' corpus that stopwords.words() reads later on.
# Only needs to run once per machine; the corpus is stored in the local nltk_data folder.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sharm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Print NLTK's English stop-word list; these are the words the stemming step filters out
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Load & Label Dataset

In [24]:
# Read the two source files: one holds fabricated articles, the other genuine ones
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Tag every row with its class before the frames are merged
fake["label"] = 1   # 1 -> fake news
true["label"] = 0   # 0 -> real news

# Stack both frames, shuffle the rows reproducibly (fixed seed),
# and renumber the index so it runs 0..n-1 again
news_dataset = (
    pd.concat([fake, true], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [8]:
# (rows, columns) of the combined, shuffled dataset
news_dataset.shape

(44898, 5)

In [9]:
# Preview the first rows of the shuffled dataset (fake and real articles are now interleaved)
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


## Inspect & Handle Missing Values

In [11]:
# Count the missing values in each column
news_dataset.isnull().sum()


title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [12]:
# Replace any missing values with an empty string so the text steps below always receive a str
news_dataset = news_dataset.fillna('')

## Create Content Column

In [13]:
# Merge the headline and the article body into a single text field, separated by a space
news_dataset['content'] = news_dataset['title'] + ' ' + news_dataset['text']



In [14]:
# Preview the combined title + text column
news_dataset['content'].head()


0    Ben Stein Calls Out 9th Circuit Court: Committ...
1    Trump drops Steve Bannon from National Securit...
2    Puerto Rico expects U.S. to lift Jones Act shi...
3     OOPS: Trump Just Accidentally Confirmed He Le...
4    Donald Trump heads for Scotland to reopen a go...
Name: content, dtype: object

In [15]:
# Separate the feature (combined article text) from the target (fake/real label).
# NOTE: both names are rebound from the stemmed text in the TF-IDF section further down.
X = news_dataset['content']
Y = news_dataset['label']

In [16]:
# Inspect the feature Series and the label Series
print(X)
print(Y)

0        Ben Stein Calls Out 9th Circuit Court: Committ...
1        Trump drops Steve Bannon from National Securit...
2        Puerto Rico expects U.S. to lift Jones Act shi...
3         OOPS: Trump Just Accidentally Confirmed He Le...
4        Donald Trump heads for Scotland to reopen a go...
                               ...                        
44893    UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...
44894    PM May seeks to ease Japan's Brexit fears duri...
44895    Merkel: Difficult German coalition talks can r...
44896     Trump Stole An Idea From North Korean Propaga...
44897    BREAKING: HILLARY CLINTON’S STATE DEPARTMENT G...
Name: content, Length: 44898, dtype: object
0        1
1        0
2        0
3        1
4        0
        ..
44893    1
44894    0
44895    0
44896    1
44897    1
Name: label, Length: 44898, dtype: int64


# Stemming:

### Stemming is the process of reducing a word to its root form (its stem)

### example:
### acting, acted, acts --> act

In [17]:
# Shared stemmer and stop-word lookup, created once so every call to stemming() reuses them.
# The stop words are held in a set so the membership test below is a hash lookup.
port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))


def stemming(content):
    """Normalise raw article text into a space-separated string of word stems.

    Every character other than a-z / A-Z is replaced with a space, the text is
    lower-cased and split on whitespace, English stop words are dropped, and
    each remaining token is reduced with the Porter stemmer.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if nothing survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content).lower()
    stems = (
        port_stem.stem(token)
        for token in letters_only.split()
        if token not in stop_words
    )
    return ' '.join(stems)


In [18]:
# Apply stemming() to every article.
# NOTE: this overwrites 'content' in place, so re-running this cell feeds
# already-stemmed text back through stemming(); reload the data before re-running.

news_dataset['content'] = news_dataset['content'].apply(stemming)

# Preview the cleaned, stemmed text
news_dataset['content'].head()


0    ben stein call th circuit court commit coup ta...
1    trump drop steve bannon nation secur council w...
2    puerto rico expect u lift jone act ship restri...
3    oop trump accident confirm leak isra intellig ...
4    donald trump head scotland reopen golf resort ...
Name: content, dtype: object

## TF-IDF Vectorization

In [None]:
# Rebind X / Y to plain NumPy arrays built from the stemmed text and the labels
X = news_dataset['content'].values
Y = news_dataset['label'].values


# Hold out 20% of the articles for testing; stratify=Y splits by class label and
# random_state fixes the split so it is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


# Fit the TF-IDF vocabulary on the training texts only, then reuse that fitted
# vectorizer on the test texts, so nothing from the test set influences the features.
# max_features=5000 caps the size of the vocabulary.
# From here on X_train / X_test hold the vectorised matrices instead of raw strings.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [20]:
# Train a logistic-regression classifier on the TF-IDF training matrix;
# max_iter=1000 sets the solver's iteration limit
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)


## Evaluation

Accuracy score on the training and test sets

In [21]:
# Accuracy on the data the model was fitted on (an optimistic figure; compare with the test score)
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

print('Accuracy score of the training data : ', training_data_accuracy)


Accuracy score of the training data :  0.9912578651372571


In [22]:
# Accuracy on the held-out test split, which the model did not see during fitting
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

print('Accuracy score of the test data : ', test_data_accuracy)


Accuracy score of the test data :  0.9878619153674832


## Making a Predictive System

In [23]:
# Take one already-vectorised sample from the test split and classify it
X_new = X_test[3]
prediction = model.predict(X_new)

# Label 0 means real news; any other predicted value is reported as fake
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

# Show the ground-truth label of the same sample for comparison
print("Actual label:", Y_test[3])


The news is Fake
Actual label: 1
