#### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

import warnings
warnings.filterwarnings("ignore")

#### Loading the Dataset

In [2]:
dataset = pd.read_csv("train.csv/train.csv")

#### checking top 5 documents

In [3]:
dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
dataset["title"][2]

'Why the Truth Might Get You Fired'

#### Checking the unique values for Label

In [5]:
dataset["label"].unique()

array([1, 0], dtype=int64)

#### Checking the shape of the dataset

In [6]:
dataset.shape

(20800, 5)

#### checking for the null values

In [7]:
dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

#### Removing the NAN Values from the dataset

In [8]:
dataset = dataset.dropna()

In [9]:
dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [10]:
dataset.shape

(18285, 5)

#### Creating a new datset with "title" and "label" columns

In [11]:
df = dataset[["title", "label"]]

In [12]:
df.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [13]:
df.shape

(18285, 2)

In [14]:
df.isnull().sum()

title    0
label    0
dtype: int64

In [15]:
df["title"][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18285 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   18285 non-null  object
 1   label   18285 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


#### Import nltk library and getting all the stopwords from English

In [17]:
import nltk

In [18]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [19]:
from nltk.corpus import stopwords

In [20]:
stop_words = print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### converting all the text to lower case

In [21]:
df["title"] = df["title"].apply(lambda x:x.lower())

In [22]:
df["title"][0]

'house dem aide: we didn’t even see comey’s letter until jason chaffetz tweeted it'

#### removing all the punctuations and symbols

In [23]:
df["title"] = df["title"].apply(lambda x:re.sub('[^a-zA-Z]', " ", x))

In [24]:
df["title"][0]

'house dem aide  we didn t even see comey s letter until jason chaffetz tweeted it'

#### loading the stopwords

In [25]:
stop_words = set(stopwords.words("english"))

#### removing the stop words from the dataset

In [26]:
df["title"] = df["title"].apply(lambda x:' '.join([word for word in x.split() if word not in (stop_words)]))

#### tokenizing our document

In [27]:
max_features = 5000
tokenizer = Tokenizer(num_words = max_features, split = ' ')
tokenizer.fit_on_texts(df["title"].values)

In [28]:
print(df["title"][0])

house dem aide even see comey letter jason chaffetz tweeted


In [29]:
print(df["title"][1])

flynn hillary clinton big woman campus breitbart


#### converting our tokens or text to sequences

In [30]:
seq = tokenizer.texts_to_sequences(df["title"].values)

In [31]:
print(seq[0])

[24, 472, 883, 215, 166, 65, 515, 1773, 3358]


In [32]:
print(seq[1])

[705, 6, 7, 74, 95, 971, 5]


In [33]:
print(len(seq[0]))

9


#### Doing the padding operation so that all the documents are of same length 


In [34]:
seq = pad_sequences(seq)

In [35]:
print(seq[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   24  472  883  215  166   65  515 1773 3358]


In [36]:
print(seq[1])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0 705   6   7  74  95
 971   5]


In [37]:
seq.shape

(18285, 38)

In [38]:
y = df["label"]

In [39]:
y[0]

1

In [40]:
y.shape

(18285,)

In [41]:
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64

#### splitting the data into train and test

In [42]:
X_train, X_test, y_train, y_test = train_test_split(seq, y, test_size = 0.20, random_state = 0)

In [43]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(14628, 38) (3657, 38) (14628,) (3657,)


#### giving the embedding dimension, the size of the dimension which we want to convert our word into

In [44]:
embed_dimension = 128

#### Building the model 

In [45]:
model = Sequential()
model.add(Embedding(max_features, embed_dimension,input_length = seq.shape[1]))
model.add(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))

In [46]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 38, 128)           640000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


#### compiling the model

In [47]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ["accuracy"])

#### training our model and checking the validation accuracy

In [48]:
model.fit(X_train, y_train, validation_data = (X_test, y_test), batch_size =100, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2443a4c19a0>

#### Performance Metrics and Accuracy

In [49]:
y_pred = model.predict(X_test)
y_pred = (y_pred>0.5)

In [50]:
y_pred

array([[ True],
       [ True],
       [ True],
       ...,
       [False],
       [False],
       [ True]])

In [51]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [52]:
print("Confuion Matrix: ", confusion_matrix(y_pred, y_test))
print("Accuracy Score", accuracy_score(y_pred,y_test))

Confuion Matrix:  [[1916  162]
 [ 124 1455]]
Accuracy Score 0.9217938200710966


In [53]:
from sklearn.metrics import classification_report
print("Classification Report: ", classification_report(y_test, y_pred))

Classification Report:                precision    recall  f1-score   support

           0       0.92      0.94      0.93      2040
           1       0.92      0.90      0.91      1617

    accuracy                           0.92      3657
   macro avg       0.92      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657



In [54]:
from tensorflow.keras.models import load_model

model.save('model_fake_news.h5')

In [81]:
dataset.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0
10,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0
11,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0


In [132]:
dataset["title"][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [131]:
dataset["label"][0]

1

In [136]:
x = ["The world is scared of china"]
from nltk import word_tokenize
nltk.download('punkt')
tokenizer.fit_on_texts(x)
filtered_x = []
for word in x:
  if word not in stop_words:
    filtered_x.append(word)
seq1 = tokenizer.texts_to_sequences(filtered_x)
seq1 = pad_sequences(seq1, maxlen = 22)
y_pred = model.predict_classes(seq1)
y_pred

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sambi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


array([[1]])