In [2]:
from google.colab import drive

# Colab-only step: mount Google Drive at the path below so later cells can
# read and write files stored there.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import numpy as np
import pandas as pd

import nltk
import re
import string 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing import text,sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional

# CSV that later cells read the 'title' and 'label' columns from.
# NOTE(review): this is an absolute Windows path, while the first cell mounts
# Google Drive in Colab -- confirm which environment the notebook targets.
DATASET_CSV_PATH = r'C:/Users/SHEEL/Desktop/Projects/Fake-News-Detector-Application-1/final_data.csv'
dataset = pd.read_csv(DATASET_CSV_PATH)

In [10]:
from nltk.corpus import stopwords
# Fetch the NLTK resources the cleaning helpers below rely on, in the same
# order as before: stop-word lists, the punkt tokenizer data, and WordNet.
for nltk_resource in ('stopwords', "punkt", 'wordnet'):
    nltk.download(nltk_resource)

def remove_punctuations(text):
    """Delete every square-bracketed span (brackets included) from ``text``.

    Despite the name, general punctuation is left untouched here; only
    segments of the form ``[...]`` are removed. An opening bracket with no
    matching closing bracket is left as-is.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each ``[...]`` span replaced by the empty string.
    """
    # Raw string literal: in a plain literal "\[" and "\]" are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

def remove_characters(text):
    """Replace each character that is not an ASCII letter with one space.

    Digits, punctuation, whitespace and non-ASCII letters are all turned into
    a space character; the length of the string is preserved.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)
 
# Lazily-built resources shared by every call (see _stopwords_and_lemmatizer).
_ENGLISH_STOPWORDS = None
_WORDNET_LEMMATIZER = None


def _stopwords_and_lemmatizer():
    """Return the English stop-word set and a WordNet lemmatizer, creating each on first use."""
    global _ENGLISH_STOPWORDS, _WORDNET_LEMMATIZER
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    if _WORDNET_LEMMATIZER is None:
        _WORDNET_LEMMATIZER = nltk.WordNetLemmatizer()
    return _ENGLISH_STOPWORDS, _WORDNET_LEMMATIZER


def remove_stopwords_and_lemmatization(text):
    """Lower-case ``text``, drop English stop words and lemmatize what remains.

    The text is lower-cased, tokenized with ``nltk.word_tokenize``, filtered
    against NLTK's English stop-word list, and each surviving token is passed
    through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The string to process.

    Returns:
        The lemmatized, stop-word-free tokens joined by single spaces.
    """
    # The stop-word set and the lemmatizer do not depend on the token being
    # processed, so they are obtained once instead of being rebuilt per token.
    stop_words, lemmatizer = _stopwords_and_lemmatizer()
    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

def cleaning(text):
    """Run the full text-cleaning pipeline on ``text`` and return the result.

    The steps are applied in this order: ``remove_punctuations``,
    ``remove_characters``, ``remove_stopwords_and_lemmatization``.
    """
    pipeline = (
        remove_punctuations,
        remove_characters,
        remove_stopwords_and_lemmatization,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean every entry of the 'title' column; the column is overwritten with
# its cleaned form.
cleaned_titles = dataset['title'].apply(cleaning)
dataset['title'] = cleaned_titles

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHEEL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHEEL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SHEEL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Split titles and labels into train/test parts using train_test_split's
# default proportions; the fixed random_state keeps the split repeatable.
titles, labels = dataset['title'], dataset['label']
X_train, X_test, y_train, y_test = train_test_split(titles, labels, random_state=42)

In [12]:
# max_features: vocabulary cap passed to the tokenizer as num_words.
# maxlen: length every sequence is padded/truncated to.
max_features, maxlen = 5000, 150

# Fit the vocabulary on the training titles only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# Encode the training titles as integer sequences and pad them to maxlen.
# NOTE: X_train is rebound from raw titles to padded sequences here, so
# re-run the split cell before running this cell a second time.
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(
    tokenized_train,
    maxlen=maxlen,
)

In [13]:

# Encode the held-out titles with the tokenizer fitted on the training titles,
# then pad them to the same length as the training sequences.
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(
    tokenized_test,
    maxlen=maxlen,
)

In [14]:
# Binary classifier: a trainable embedding, three stacked LSTM layers with
# dropout between them, a ReLU dense layer and a single sigmoid output unit.
embedding_vector_features = 70  # dimensionality of each token embedding

model = Sequential([
    Embedding(max_features, embedding_vector_features, input_length=maxlen, trainable=True),
    # return_sequences=True hands the full sequence to the next LSTM layer.
    LSTM(50, return_sequences=True),
    Dropout(0.2),
    LSTM(100, return_sequences=True),
    Dropout(0.3),
    # The last LSTM returns only its final output, which feeds the dense head.
    LSTM(64),
    Dense(units=50, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 70)           350000    
_________________________________________________________________
lstm (LSTM)                  (None, 150, 50)           24200     
_________________________________________________________________
dropout (Dropout)            (None, 150, 50)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 100)          60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 50)                3

In [16]:
# Train the network; the test split doubles as the validation set that is
# evaluated after each epoch.
fit_options = dict(epochs=20, batch_size=256, shuffle=True)
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    **fit_options,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20

KeyboardInterrupt: 

In [38]:
# model.evaluate returns [loss, accuracy] for the metrics compiled above;
# index 1 is the accuracy, reported here as a percentage.
train_accuracy = model.evaluate(X_train, y_train)[1]
print("Accuracy of the model on Training Data is - " , train_accuracy*100 , "%")

test_accuracy = model.evaluate(X_test, y_test)[1]
print("Accuracy of the model on Testing Data is - " , test_accuracy*100 , "%")

Accuracy of the model on Training Data is -  99.87464547157288 %
Accuracy of the model on Testing Data is -  92.291259765625 %


In [39]:
# Save the trained model to a file in the current working directory.
MODEL_FILE = 'model_15-6_2.h5'
model.save(MODEL_FILE)

In [41]:
import joblib

# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
# The relative file name is resolved against the current working directory.
TOKENIZER_FILE = 'tokenizer.pkl'
joblib.dump(tokenizer, TOKENIZER_FILE)

['/content/drive/My Drive/Fake News/Project/Saved_model/tokenizer.pkl']