In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the real-news articles and preview the first rows.
true_csv_path = '/kaggle/input/fake-and-real-news-dataset/True.csv'
news_data_true = pd.read_csv(true_csv_path)
news_data_true.head()

In [None]:
news_data_true.shape  # (rows, columns) of the real-news frame

In [None]:
news_data_true.info()  # column names, dtypes and non-null counts for the real-news frame

* No null values in the dataset.

In [None]:
# The publication date is not used as a feature, so drop it.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
news_data_true = news_data_true.drop(columns='date', errors='ignore')

In [None]:
news_data_true['subject'].value_counts(normalize=True)  # share of real-news articles per subject (normalize=True yields fractions, not counts)

In [None]:
# Label column: every row of the real-news frame gets status 0.
news_data_true = news_data_true.assign(status=0)

In [None]:
# Load the fake-news articles and preview the first rows.
fake_csv_path = '/kaggle/input/fake-and-real-news-dataset/Fake.csv'
news_data_fake = pd.read_csv(fake_csv_path)
news_data_fake.head()

In [None]:
news_data_fake.shape  # (rows, columns) of the fake-news frame

In [None]:
news_data_fake.info()  # column names, dtypes and non-null counts for the fake-news frame

In [None]:
# The publication date is not used as a feature, so drop it.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
news_data_fake = news_data_fake.drop(columns='date', errors='ignore')

In [None]:
news_data_fake['subject'].value_counts(normalize=True)  # share of fake-news articles per subject (normalize=True yields fractions, not counts)

In [None]:
# Label column: every row of the fake-news frame gets status 1.
news_data_fake = news_data_fake.assign(status=1)

In [None]:
# Stack the real-news rows on top of the fake-news rows and renumber the index from 0.
labelled_frames = [news_data_true, news_data_fake]
news_data = pd.concat(labelled_frames, ignore_index=True)

In [None]:
news_data  # combined frame: real-news rows (status 0) first, then fake-news rows (status 1)

In [None]:
import sklearn
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so the permutation is the same on every run.
# NOTE: shuffle() reorders rows but keeps each row's original index label, so any later
# label-based lookup (e.g. series[i]) still resolves to the row that was labelled i
# before shuffling.
news_data_final = shuffle(news_data, random_state=42)

In [None]:
news_data_final  # preview the shuffled frame

In [None]:
import nltk

# Download the stop-word corpus into the working directory and register that directory
# with NLTK. The current directory is not on NLTK's default search path, so without the
# append the freshly downloaded copy would never be found.
nltk_download_dir = './'
nltk.download('stopwords', download_dir=nltk_download_dir)
if nltk_download_dir not in nltk.data.path:  # avoid duplicate entries when the cell is re-run
    nltk.data.path.append(nltk_download_dir)

In [None]:
nltk.data.path  # directories NLTK searches when loading corpora

In [None]:
import tensorflow as tf
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#NLTK_DATA='/usr/share/nltk_data'

# English stop words; the set copy gives O(1) membership tests inside the per-word loop.
stoplist = stopwords.words('english')
stopword_set = set(stoplist)

vocab_size = 10000  # number of hash buckets used when titles are turned into integer ids
ps = PorterStemmer()  # reduces each word to its stem (e.g. plural -> singular form)
non_letters = re.compile('[^a-zA-Z]')  # compiled once instead of once per title


def _clean_title(title):
    """Return `title` lower-cased, stripped of non-letters and stop words, and stemmed.

    Every character outside a-z/A-Z is replaced by a space, the text is lower-cased and
    split on whitespace, stop words are removed, the remaining words are stemmed, and the
    result is joined back into one space-separated string.
    """
    words = non_letters.sub(' ', title).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stopword_set)


# Titles are processed in index-label order, so corpus[i] belongs to the row labelled i
# no matter how the rows of the frame have been shuffled.
corpus = [_clean_title(title) for title in news_data_final['title'].sort_index()]



In [None]:
corpus[:5]  # preview a few cleaned titles; displaying the full list floods the notebook output

In [None]:
import tensorflow.keras ## import keras library
# Import from the same tensorflow.keras namespace the model is built with;
# one_hot stays importable for any later use.
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot

# Map every word of every cleaned title to an integer id in [1, vocab_size).
# one_hot() hashes with Python's built-in hash(), which is salted per process, so the ids
# would differ after every kernel restart. hashing_trick with md5 applies the same
# tokenisation but produces stable ids. Distinct words can still share an id (collisions).
onehot_rep = [hashing_trick(sentence, vocab_size, hash_function='md5') for sentence in corpus]

In [None]:
onehot_rep[:5]  # preview the integer ids of a few titles; displaying the full list floods the notebook output

In [None]:
# Import from the same tensorflow.keras namespace the model is built with.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad with zeros (and, for titles longer than 20 ids, cut from the left) so every
# sequence has exactly 20 entries and the batch forms a rectangular array.
# 20 must match the Embedding layer's input_length.
embedded_data = pad_sequences(onehot_rep, maxlen=20, padding='pre')

In [None]:
embedded_data  # padded id matrix: one fixed-length row per title

In [None]:
## Model design

from tensorflow.keras.layers import Embedding,Dense,LSTM,Dropout
from tensorflow.keras.models import Sequential

## Creating model
embedding_vector_features = 40  # length of the vector learned for each word id

model = Sequential([
    # Turns each of the 20 word ids in a title into a 40-dimensional trainable vector.
    Embedding(vocab_size, embedding_vector_features, input_length=20),
    Dropout(0.3),  # regularisation before the recurrent layer
    LSTM(100),     # reads the 20 vectors in order and emits a single 100-unit summary
    Dropout(0.3),  # regularisation before the classifier
    Dense(1, activation='sigmoid'),  # probability that status == 1
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

In [None]:
# Feature matrix: the padded token-id sequences.
X_final=np.array(embedded_data)
# Labels taken from the `status` column (0 for the True.csv rows, 1 for the Fake.csv rows).
# NOTE(review): the labels are read from `news_data`, not from the shuffled
# `news_data_final`. This is only correct if the rows of `embedded_data` are in
# `news_data` row order — confirm against the cell that builds `corpus` before
# changing either side.
y_final=np.array(news_data['status'])

In [None]:
X_final.shape,y_final.shape  # sanity check: both should report the same number of rows


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the titles for evaluation.
# random_state makes the split reproducible; stratify keeps the real/fake ratio the same
# in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=y_final,
)

In [None]:
### Finally Training
# The held-out split is passed as validation data, so its loss/accuracy are reported
# after each of the 10 epochs; verbose=2 prints one line per epoch.
fit_options = dict(
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=2,
    batch_size=8,
)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single sigmoid output
# the equivalent is to threshold the predicted probability at 0.5.
y_pred = (model.predict(X_test) > 0.5).astype('int32')  # 1 = fake, 0 = real


In [None]:
from sklearn.metrics import confusion_matrix ## import confusion matrix


In [None]:
confusion_matrix(y_test,y_pred)  # rows = actual class, columns = predicted class (0 = real, 1 = fake)


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)  # fraction of held-out titles whose predicted class matches the label

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

* This is our final result.