In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Root directory of the Kaggle input dataset; CSV file names are appended to it directly,
# so the trailing slash is required.
home = '/kaggle/input/fake-and-real-news-dataset/'

In [None]:
# Read the real-news and fake-news tables from the dataset directory.
true_data, fake_data = (pd.read_csv(home + csv_name) for csv_name in ('True.csv', 'Fake.csv'))

In [None]:
# (rows, columns) of the real table followed by the fake table.
tuple(table.shape for table in (true_data, fake_data))

In [None]:
# Show the first rows of each table under its own heading.
for heading, table in (('True Data:', true_data), ('Fake Data:', fake_data)):
    print(heading)
    display(table.head())

In [None]:
# Attach the binary target column: 0 for rows of the real table, 1 for rows of the fake table.
true_data['label'], fake_data['label'] = 0, 1

In [None]:
# Stack both tables and keep a shuffled 30% subsample (presumably to keep training time manageable).
# The sample is seeded so the chosen rows -- and therefore the train/validation split and every
# score computed from it -- are the same after a kernel restart; unseeded, each run used different rows.
train_data = (
    pd.concat([true_data, fake_data], axis=0)
    .sample(frac=0.3, random_state=42)
    .reset_index(drop=True)
)
print(train_data.label.value_counts())
train_data.head()

In [None]:
from keras.preprocessing import sequence, text
from sklearn.model_selection import train_test_split

In [None]:
# Shuffle the sampled articles and split texts/labels into training and validation arrays.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training -- confirm this is intended.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train_data['text'].values,
    train_data['label'].values,
    test_size=0.8,
    shuffle=True,
    random_state=42,
)

In [None]:
# Report how many labels ended up in the training and validation splits.
print(*(labels.shape for labels in (ytrain, yvalid)))

In [None]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)  # num_words=None: keep every word seen while fitting
max_len = 1000  # every article is padded or truncated to this many token ids

# Build the vocabulary from the training texts only, so nothing about the validation
# articles leaks into preprocessing. Words that appear only in the validation texts have
# no index and are skipped by texts_to_sequences (no oov_token is configured).
token.fit_on_texts(list(xtrain))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad (or truncate) the sequences to a common length
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

# word -> integer index mapping; its size drives the embedding layers' input dimension below
word_index = token.word_index

In [None]:
import tensorflow as tf
# Fetch the distribution strategy that is currently in effect; the model-building cells
# below create and compile their models inside `strategy.scope()`.
strategy = tf.distribute.get_strategy()

In [None]:
# Number of distinct words the tokenizer indexed (the embedding layers use this value + 1).
len(word_index)

In [None]:
##RNN
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Activation, Dropout

%time
with strategy.scope():
    # A simpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
model.summary()

In [None]:
# plot_model is exposed publicly from keras.utils; the keras.utils.vis_utils module path
# is an internal location that newer Keras releases no longer provide.
from keras.utils import plot_model
# Writes a diagram of the current model (with layer names and tensor shapes) to model_plot.png.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Count the GPUs TensorFlow can see, then turn on device-placement logging.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
tf.debugging.set_log_device_placement(True)

In [None]:
# Train the SimpleRNN for 3 epochs in batches of 32, evaluating on the validation split after each epoch.
history = model.fit(
    xtrain_pad,
    ytrain,
    validation_data=(xvalid_pad, yvalid),
    batch_size=32,
    epochs=3,
)

In [None]:
from sklearn import metrics
def roc_auc(predictions,target):
    """Return the area under the ROC curve for a set of scores.

    Args:
        predictions: scores produced by the model, passed to sklearn as `y_score`.
        target: ground-truth labels, passed to sklearn as `y_true`.

    Returns:
        The area under the ROC curve computed from the false/true positive rates.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

# Score the validation split with the current model and report ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction in [0, 1]; scale by 100 so the '%' in the message is truthful
# (previously a 0.98 AUC was printed as "0.98%").
print("Auc: %.2f%%" % (100 * roc_auc(scores, yvalid)))

In [None]:
##Inference
# Run the current model on one row of train_data and compare with its stored label.
idx = 23
original = train_data.loc[idx, 'label']
print('Original: ', original)

# To classify custom text instead, wrap it the same way, e.g. np.array(['Tsunami hit Asia']).
news = np.array([train_data.loc[idx, 'text']])

# Same preprocessing as training: token ids, then pad/truncate to max_len.
xtest_seq = token.texts_to_sequences(news)
xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)

fake_prob = model.predict(xtest_pad)
isfake = round(fake_prob[0, 0])  # label 1 means "fake", so a rounded score of 1 flags the article
print('Fake: ', isfake)
print('Prob: ', fake_prob[0, 0])

In [None]:
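##Word Embedding - Optional
# (Disabled: everything below is commented out, so the models in this notebook train their
#  own embeddings. Re-enabling it also requires `from tqdm import tqdm` and passing
#  `embedding_matrix` to the Embedding layers.)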

# embeddings_index = {}
# f = open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt','r',encoding='utf-8')
# for line in tqdm(f):
#     values = line.split(' ')
#     word = values[0]
#     coefs = np.asarray([float(val) for val in values[1:]])
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in tqdm(word_index.items()):
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [None]:
%%time
with strategy.scope():
    
    # A simple LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len
                     ))

    model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
model.summary()

In [None]:
# Train the LSTM for 3 epochs in batches of 32, evaluating on the validation split after each epoch.
model.fit(
    xtrain_pad,
    ytrain,
    validation_data=(xvalid_pad, yvalid),
    batch_size=32,
    epochs=3,
)

In [None]:
# Score the validation split with the LSTM and report ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction in [0, 1]; scale by 100 so the '%' in the message is truthful.
print("Auc: %.2f%%" % (100 * roc_auc(scores, yvalid)))

In [None]:
%%time
with strategy.scope():
    # GRU with glove embeddings and two dense layers
     model = Sequential()
     model.add(Embedding(len(word_index) + 1,
                     300,
                     
                     input_length=max_len,
                     ))
#      model.add(SpatialDropout1D(0.3))
     model.add(GRU(300))
     model.add(Dense(1, activation='sigmoid'))

     model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])   
    
model.summary()

In [None]:
# Train the GRU for 3 epochs in batches of 32, evaluating on the validation split after each epoch.
history = model.fit(
    xtrain_pad,
    ytrain,
    validation_data=(xvalid_pad, yvalid),
    batch_size=32,
    epochs=3,
)

In [None]:
# Score the validation split with the GRU and report ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction in [0, 1]; scale by 100 so the '%' in the message is truthful.
print("Auc: %.2f%%" % (100 * roc_auc(scores, yvalid)))

In [None]:
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
%time
with strategy.scope():
    # A simple bidirectional LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     
                     input_length=max_len
                     ))
    model.add(Bidirectional(LSTM(300)))

    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
    
model.summary()

In [None]:
# Train the bidirectional LSTM for 3 epochs in batches of 32, evaluating on the validation split after each epoch.
history = model.fit(
    xtrain_pad,
    ytrain,
    validation_data=(xvalid_pad, yvalid),
    batch_size=32,
    epochs=3,
)

In [None]:
# Score the validation split with the bidirectional LSTM and report ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction in [0, 1]; scale by 100 so the '%' in the message is truthful.
print("Auc: %.2f%%" % (100 * roc_auc(scores, yvalid)))