In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the two source CSVs into separate frames; they are labelled and merged below.
df_fake = pd.read_csv(
    '/kaggle/input/fake-and-real-news-dataset/Fake.csv'
)
df_true = pd.read_csv(
    '/kaggle/input/fake-and-real-news-dataset/True.csv'
)

### Data Preprocessing

In [None]:
# Tag every row of the fake-news frame with class 0.
df_fake = df_fake.assign(label=0)
df_fake.head()

In [None]:
# Tag every row of the true-news frame with class 1.
df_true = df_true.assign(label=1)
df_true.head()

In [None]:
# Stack the fake and true frames row-wise into a single dataset.
df = pd.concat(objs=[df_fake, df_true], axis=0)

In [None]:
# Shuffle all rows so the two classes are interleaved. The fixed seed (same value
# the train/test split uses) makes the row order, and therefore every downstream
# result, repeatable across kernel restarts.
df = df.sample(frac=1, random_state=0)
# Renumber rows 0..n-1; the previous index is kept as an 'index' column here and
# is removed by the following cell.
df.reset_index(inplace=True)

In [None]:
# Remove the leftover 'index' column created by reset_index. errors='ignore'
# keeps this cell safe to re-run once the column is already gone (the original
# raised KeyError on a second execution).
df = df.drop(columns='index', errors='ignore')

In [None]:
# Count missing values per column.
df.isna().sum()

### Exploratory Data Analysis

In [None]:
import seaborn as sns
# Use seaborn's dark grid theme for the plots in this notebook.
sns.set_style(style='darkgrid')

In [None]:
# Bar chart of how many rows carry each label value.
label_counts = df['label'].value_counts()
label_counts.plot(kind='bar', color=['blue', 'red'])

In [None]:
# Distinct values present in the 'subject' column.
df.loc[:, 'subject'].unique()

In [None]:
# Bar chart of how many rows fall under each subject, one colour per bar.
subject_counts = df['subject'].value_counts()
subject_counts.plot(
    kind='bar',
    color=['red', 'blue', 'green', 'pink', 'orange', 'yellow', 'cyan', 'black'],
)

### Text Cleaning

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
# Separate the target column from the remaining columns.
y = df['label']
x = df.drop(columns=['label'])

In [None]:
# Work on an independent copy so text cleaning never touches `x`.
news = x.copy(deep=True)

In [None]:
# Show the title stored at row label 0.
news.loc[0, 'title']

In [None]:
import nltk
import re
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK resources used by the cleaning step below: the English stopword
# list and the WordNet corpus that WordNetLemmatizer looks words up in. Only
# 'stopwords' was fetched before, so lemmatizing failed with LookupError on
# machines where WordNet was not already installed.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer

# Build these once. The original re-read the stopword list and constructed a new
# lemmatizer for every single word, which made the loop needlessly slow; a set
# also gives constant-time membership checks.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Clean each title: keep letters only, lowercase, drop stopwords, lemmatize.
corpus = []
for title in news['title']:
    # removing everything except alphabets
    text = re.sub('[^a-zA-Z]', ' ', title).lower()
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    corpus.append(' '.join(kept_words))

In [None]:
# First five cleaned titles.
corpus[0:5]

### Word Embedding

In [None]:
# Map every word of every cleaned title to an integer index below vocab_size.
# NOTE(review): Keras documents one_hot as hash-based, so indices may not be
# stable across interpreter sessions — confirm before reusing a saved model.
vocab_size = 50000
onehot_corpus = list(map(lambda doc: one_hot(doc, vocab_size), corpus))
onehot_corpus[:5]

In [None]:
# Bring every encoded title to exactly max_length entries, padding at the front.
max_length = 20
embedded_docs = pad_sequences(
    sequences=onehot_corpus,
    maxlen=max_length,
    padding='pre',
)
embedded_docs[:5]

In [None]:
# Length of the vector the embedding layer produces for each word index.
features = 50

# Embedding -> LSTM -> single sigmoid unit for binary classification.
model = Sequential([
    # Embedding layer
    Embedding(vocab_size, features, input_length=max_length),
    # Long Short Term Memory layer
    LSTM(100),
    # Output layer
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# added a stray "None" line to the output.
model.summary()

In [None]:
# Rebind `x` to a NumPy copy of the padded sequences and show its dimensions.
x = np.array(embedded_docs, copy=True)
x.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

### Model Training

In [None]:
# Train for 10 epochs with batches of 64, evaluating on the held-out split each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

### Model Evaluation

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single sigmoid
# output the equivalent is to threshold the predicted probability at 0.5, which
# yields the same (n_samples, 1) int32 array of 0/1 labels.
y_pred = (model.predict(x_test) > 0.5).astype('int32')
y_pred[:5]

In [None]:
# First five true labels of the held-out split, for comparison with y_pred.
y_test.iloc[:5]

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# scikit-learn expects (y_true, y_pred). The arguments were swapped, which
# exchanges precision with recall and reports support counts of the predictions
# instead of the true labels.
print(classification_report(y_test, y_pred))

In [None]:
# confusion_matrix takes (y_true, y_pred): rows are true classes, columns are
# predicted classes. The swapped call produced the transposed matrix.
# fmt='d' prints the cell counts as plain integers instead of scientific notation.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');