In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential

# **Loading Data**

In [None]:
# Load the real-news and fake-news CSVs, in that order, into two DataFrames.
true_data, fake_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fake-and-real-news-dataset/True.csv',
        '/kaggle/input/fake-and-real-news-dataset/Fake.csv',
    )
)

In [None]:
# Print the (rows, columns) tuple of the real-news frame, then preview its first rows.
n_true_rows, n_true_cols = true_data.shape
print((n_true_rows, n_true_cols))
true_data.head()

In [None]:

# Same summary for the fake-news frame: shape first, then the leading rows.
fake_dims = fake_data.shape
print(fake_dims)
fake_data.head()

In [None]:
# Attach the target column: 0 marks an article from the real-news file,
# 1 marks an article from the fake-news file.
true_data = true_data.assign(label=0)
fake_data = fake_data.assign(label=1)

Here I introduce a new column, 'label' (1 = fake, 0 = true), so that the two classes can still be told apart after fake_data and true_data are concatenated.

In [None]:
# Preview the first five rows to confirm the 'label' column is present.
fake_data.head(5)

In [None]:
# Stack the real articles on top of the fake ones and renumber the rows 0..n-1.
frames = [true_data, fake_data]
data = pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# Size of the combined frame, then its first rows (these come from true_data,
# which was concatenated first).
combined_shape = data.shape
print(combined_shape)
data.head()

In [None]:
# Last five rows of the combined frame (these come from fake_data, which was
# concatenated last).
data.tail(5)

In [None]:
# Work on an independent (deep) copy so later steps never touch `data` itself.
x = data.copy(deep=True)

In [None]:
# Split the combined frame into the target (y) and the feature columns (x).
# Both are derived from `data`, not from `x`, so this cell can be re-run safely:
# dropping 'label' from `x` itself would raise a KeyError on a second run
# because the column would already be gone.
y = data['label'].copy()
x = data.drop(columns='label')

In [None]:
# Count missing values per feature column (isna is the same check as isnull).
x.isna().sum()

# **Text-Data Preprocessing**

In [None]:
ps = PorterStemmer()

# Build the stop-word set and the regex once, outside the loop. Calling
# stopwords.words('english') for every single word re-reads the stop-word list
# each time and makes this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

# corpus[i] is the cleaned form of the i-th title: non-letters replaced by
# spaces, lowercased, stop words dropped, remaining words stemmed and
# re-joined with single spaces.
corpus = []
for title in x['title']:
    tokens = non_letters.sub(' ', title).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

* `re.sub('[^a-zA-Z]', ' ', ...)` replaces every character that is not an English letter (digits, punctuation and other special characters) with a space. Note that links are not removed as a whole; only their non-letter characters are replaced
* `.lower()` converts all capital letters to lowercase
* `.split()` splits the title into a list of words on whitespace
* the list comprehension with `ps.stem(word)` and `stopwords.words('english')` drops English stopwords and stems every remaining word


In [None]:
# Show only a few cleaned titles; displaying the whole list would flood the
# notebook output with one entry per article.
corpus[:5]

The next cell encodes each title with Keras `one_hot`, which maps every word to an integer index using a vocabulary size of 10000.

In [None]:
# Size of the hashing space used to turn words into integer indices.
voc_size = 10000

# Each element of `corpus` is a whole cleaned title; one_hot splits it into
# words and maps every word to an integer index, giving one list per title.
# NOTE(review): per the Keras docs one_hot is hash-based, so different words can
# collide and the indices may differ between kernel sessions — confirm if
# reproducibility across runs matters.
onehot_repr = [one_hot(title, voc_size) for title in corpus]

# Preview a few encoded titles instead of dumping the entire list.
onehot_repr[:5]

In the next cell I apply pad_sequences so that every sentence has a length of 20: sentences shorter than 20 get zeros in front of them because we use padding='pre', and sentences longer than 20 are truncated to 20.

In [None]:
# Fixed number of tokens per title fed to the network.
sentlen = 20

# Bring every encoded title to exactly `sentlen` entries; with padding='pre'
# the zeros are inserted in front of shorter sequences.
embedding_doc = pad_sequences(onehot_repr, maxlen=sentlen, padding='pre')
embedding_doc

# **Model Building**

* We build a sequential model whose first layer is a trainable word embedding with 60 features; a more advanced pre-trained word embedding such as word2vec or GloVe could be used here instead
* BatchNormalization() (which also stabilizes training) and Dropout(0.5) are used to reduce overfitting

In [None]:
# Dimensionality of the learned word vectors.
embedding_feature = 60

# Two stacked LSTMs on top of a trainable embedding, each followed by batch
# normalization and dropout, ending in a single sigmoid unit that outputs the
# probability of label 1.
#
# The sigmoid reads the 32 LSTM features directly. A one-unit ReLU layer in
# front of it would squeeze everything through a single non-negative scalar and
# can get stuck at 0 (a "dead" ReLU), leaving the model with a constant
# prediction, so no such layer is used.
model = Sequential([
    Embedding(voc_size, embedding_feature, input_length=sentlen),
    LSTM(64, return_sequences=True),  # full sequence is needed by the next LSTM
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    LSTM(32),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Feature matrix for the model: a fresh array copy of the padded index sequences.
x_final = np.array(embedding_doc, copy=True)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; random_state fixes the shuffle so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Stop training once the validation loss has failed to improve by at least
# 0.001 for 5 consecutive epochs, and roll back to the best weights seen.
# 'val_loss' is the Keras default monitor, spelled out here for clarity.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Validate on a slice of the TRAINING data rather than on the test set.
# Early stopping (with restore_best_weights) picks the weights that score best
# on the validation data; if that were the test set, the test accuracy reported
# afterwards would be optimistically biased.
# validation_split holds out the last 20% of the arrays passed in; the labels
# are converted to a plain NumPy array so they can be sliced the same way as
# the features.
history = model.fit(
    x_train,
    np.asarray(y_train),
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping],
)

# Training vs. validation loss per epoch.
history_df = pd.DataFrame(history.history)
ax = history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
ax.set(xlabel="epoch", ylabel="loss");


In [None]:
# Score the trained model on the held-out test set. evaluate() returns the loss
# followed by the metrics passed to compile(), i.e. [loss, accuracy] here.
result = model.evaluate(x_test, y_test)
loss, accuracy = result[0], result[1]

print(f"[+] Accuracy: {accuracy*100:.2f}%")