## Importing required libraries


In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Model
from pandas import DataFrame
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

## Loading training and testing dataset along with the sample submission dataset

In [None]:
# Read the test set, the training set and the sample submission from CSV
# files in the current working directory (same read order as before).
test, train, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'train.csv', 'sample_submission.csv')
)

## Combining the features of the training and test datasets (title, text and date)

In [None]:
def _combine_text_columns(frame):
    """Join the title, text and date columns of *frame* into one string per row.

    Missing values are replaced with '' before joining. With plain `+`
    concatenation a NaN in any single column turns the whole combined value
    into NaN, and that float would then reach the tokenizer instead of text.
    Each column is also cast to str so a non-string column (e.g. a date parsed
    as a number) cannot raise on concatenation.
    """
    title, text, date = (
        frame[column].fillna('').astype(str)
        for column in ('title', 'text', 'date')
    )
    return title + " " + text + " " + date


# One combined free-text feature per row, built the same way for both splits.
X_train = _combine_text_columns(train)
X_test = _combine_text_columns(test)

## Initializing a few variables and creating padded sequence for training dataset

In [None]:
# Hyper-parameters used by the tokenizer, the model and the training call.
max_words, max_len = 3000, 512    # tokenizer vocabulary cap / padded sequence length
embed_dim, lstm_out = 100, 256    # embedding width / LSTM units
batch_size = 64

# Fit the vocabulary on the training text, then encode that same text as
# integer sequences and bring every sequence to length max_len.
train_texts = X_train.values
token = Tokenizer(num_words=max_words, split=' ', lower=True)
token.fit_on_texts(train_texts)
sequences = token.texts_to_sequences(train_texts)
train_sequences_padded = pad_sequences(sequences, maxlen=max_len)

## Creating a model using an embedding layer and a Bidirectional LSTM followed by a couple of dense layers. I have used dropout and batch normalization between the dense layers. The optimizer used is Adamax and the loss function used is binary_crossentropy

In [None]:
# Binary classifier: embedding -> bidirectional LSTM -> two SELU dense
# stages (dropout after each, batch normalization between them) -> one
# sigmoid output unit.
model = Sequential([
    Embedding(max_words, embed_dim, input_length=max_len),
    Bidirectional(LSTM(lstm_out)),
    Dense(256),
    Activation('selu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(128),
    Activation('selu'),
    Dropout(0.5),
    Dense(1, name='out_layer'),
    Activation('sigmoid'),
])
model.compile(
    optimizer='adamax',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Printing the summary of the model

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

## Initializing early stopping and model checkpoint callback functions which will be used while training the model

In [None]:
# Stop training once validation accuracy has not improved for 3 consecutive
# epochs. The previous patience of 200 was far larger than the 10 epochs the
# training call in this notebook runs for, so early stopping could never
# trigger.
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=3)

# Write the model to 'best_model.h5' each time validation accuracy reaches a
# new maximum.
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

## Training the model for 10 epochs and using 30% of the training data as validation data

In [None]:
# Train on the padded training sequences against the 'is_fake' labels,
# holding out 30% of the rows for validation and running the checkpoint and
# early-stopping callbacks each epoch.
model.fit(
    x=train_sequences_padded,
    y=train['is_fake'],
    batch_size=batch_size,
    epochs=10,
    validation_split=0.3,
    callbacks=[mc, es],
)

## Converting test data into padded sequences

In [None]:
# Encode the test text with the tokenizer fitted on the training text, then
# bring every sequence to the same max_len used for the training inputs.
test_sequences = token.texts_to_sequences(X_test)
test_sequences_padded = pad_sequences(sequences=test_sequences, maxlen=max_len)

## Predicting results for test dataset

In [None]:
# Run the trained model on the padded test sequences; the final layer is a
# sigmoid, so each output is a score between 0 and 1.
res = model.predict(x=test_sequences_padded)

## Converting the probabilities into labels

In [None]:
# Threshold the scores already stored in `res` by the prediction cell at 0.5
# to get 0/1 labels. This reuses that result instead of running a second,
# identical inference pass over the whole test set. Re-running this cell is
# harmless: thresholding 0/1 labels at 0.5 leaves them unchanged.
res = (res > 0.5).astype("int")

## Creating a dataframe of test ids and predicted labels

In [None]:
# Put the ids from the sample submission next to the predicted labels.
# Both frames are joined column-wise, so rows are matched on their index.
sample_ids = sample[['id']]
pred_list = pd.DataFrame(data=res, columns=['is_fake'])
result = pd.concat(objs=[sample_ids, pred_list], axis='columns')

## Saving the results as a csv file

In [None]:
# Write the id / is_fake table to 'res2.csv' without the DataFrame index.
result.to_csv(path_or_buf='res2.csv', index=False)