# Problem Statement

1. We are given a dataset consisting of two CSV files: train_bodies.csv, which contains the news article bodies, and train_stances.csv, which contains the headlines (with their labelled stance) for each of these bodies, linked to them through the body id.

2. After training on these samples, we need to detect whether a given headline agrees with, disagrees with, discusses, or is unrelated to the article body identified by the body id.


## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import os
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical,plot_model

from keras.models import Input,Model,Sequential
from keras.layers import LSTM,Embedding,Dropout,Activation,Reshape,Dense,GRU,Add,Flatten,concatenate,Bidirectional

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical,plot_model
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint


# Dataset understanding
The train_bodies file contains the body id and the associated article body.
The train_stances file contains the headlines associated with a particular body id and their labelled stance.
One body in train_bodies can have multiple associated headlines in train_stances, each with its corresponding stance label.
1683: number of article bodies present.
49972: total number of headlines present for the 1683 different article bodies.

## Dataset Preparation

**train_bodies.csv** contains body id and article body for training  
**train_stances.csv** contains headlines corresponding to body id and associated labelled stance with it


In [None]:
DATASET_PATH = "../input/fake-news-challenge/"

# Read the two raw tables: article bodies, and the headline/stance rows that
# refer to those bodies through the "Body ID" column.
bodies_csv = os.path.join(DATASET_PATH, 'train_bodies.csv')
stances_csv = os.path.join(DATASET_PATH, 'train_stances.csv')

train_bodies = pd.read_csv(bodies_csv)
train_stance = pd.read_csv(stances_csv)

# Combining the CSV

I am preparing a final CSV in which each row corresponds to a unique entry,
i.e. each row corresponds to a unique combination of headline, body id and article body.

This simplifies the data preparation steps that follow.


In [None]:
# Attach each headline's article body to its row in train_stance and persist
# the merged table as data_combined.csv for the following cells.
from tqdm.notebook import tqdm  # no longer used in this cell; kept in case other cells rely on the name

# One vectorised lookup replaces the original nested loop, which compared every
# stance row against every body row. keep='last' mirrors that loop: if a
# "Body ID" appeared twice in train_bodies, the later row would have overwritten
# the earlier one. Stance rows without a matching body end up as NaN.
body_by_id = (
    train_bodies
    .drop_duplicates(subset='Body ID', keep='last')
    .set_index('Body ID')['articleBody']
)
train_stance['articleBody'] = train_stance['Body ID'].map(body_by_id)


train_stance.to_csv(os.path.join(os.getcwd(),'data_combined.csv'),index=False)

In [None]:
# Reload the merged headline/body table from the working directory
# (the original note credits "Fake News stanford.ipynb" as its generator).
combined_csv = os.path.join(os.getcwd(), 'data_combined.csv')
data = pd.read_csv(combined_csv)
data.head()

In [None]:
# Encode the four stance labels as integer class ids, then show how many rows
# each label has.
stance_to_id = {'agree':0,'disagree':1,'discuss':2,'unrelated':3}
data['stance_cat'] = data['Stance'].map(stance_to_id).astype(int)
data['Stance'].value_counts()

In [None]:
# Stack all headlines followed by all article bodies into one flat corpus:
# the first len(data) entries are headlines, the next len(data) are bodies.
corpus = np.r_[data['Headline'].values,data['articleBody'].values]
print(2 * len(data))  # expected corpus size, derived from the data instead of a hard-coded 49972*2
print(len(corpus))

# Collect the distinct space-separated tokens directly in a set, so the full
# (heavily duplicated) token list is never materialised.
unique_tokens = set()
for sentence in corpus:
    unique_tokens.update(sentence.split(' '))

vocabulary = list(unique_tokens)
vocab_length = len(vocabulary)
print("Vocabulary Length is {0}".format(vocab_length))


## Model Training Parameters

In [None]:
# Vocabulary cap handed to both Tokenizers as `num_words`.
max_features = 5000
# NOTE(review): not referenced in any cell shown here — confirm it is still needed.
MAX_NB_WORDS = 24000
# Width of each word vector; matches the 50-d GloVe file (glove.6B.50d.txt) loaded below.
EMBEDDING_DIM = 50
# NOTE(review): equals 16 + 48 (the headline + body padding lengths) but is not
# referenced in the cells shown — confirm intent.
MAX_SEQUENCE_LENGTH = 64

## Creating Embedding Matrix For Headline and Body

We create an embedding matrix for the headline and another for the body; each provides the initial weights of the embedding layer that is the first layer of the deep learning model.

In [None]:
GLOVE_DIR = "../input/glove50d/"
def setup_embedding_index(glove_path=None):
    """Load GloVe word vectors from a text file into a dictionary.

    Every non-empty line is parsed as ``<word> <v1> <v2> ...`` separated by
    whitespace; blank lines are skipped.

    Parameters
    ----------
    glove_path : str, optional
        File to read. Defaults to ``glove.6B.50d.txt`` inside ``GLOVE_DIR``.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array of its coefficients.
    """
    if glove_path is None:
        glove_path = os.path.join(GLOVE_DIR,"glove.6B.50d.txt")
    embedding_index = {}
    # The context manager closes the file even if a malformed line raises
    # while being converted to floats.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to index (values[0] would raise).
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embedding_index
# Parse the GloVe file once; the resulting word -> vector dict is looked up
# when filling both the headline and the body embedding matrices.
embeddings_index = setup_embedding_index()

## Padding headline and body 

We pad (or truncate) each headline to a length of 16, since headlines are short, and each body to a length of 48, which was observed to be the best-performing length for the body.

In [None]:
# Fit a tokenizer on the headlines only, then turn every headline into a
# fixed-length (16) sequence of word ids, padded at the end.
headline_texts = data.loc[:, 'Headline']

tokenizer_headline = Tokenizer(num_words=max_features, split=' ')
tokenizer_headline.fit_on_texts(headline_texts.values)

word_index_headline = tokenizer_headline.word_index
# One extra slot on top of the number of distinct words seen by the tokenizer.
vocab_headline_length = len(word_index_headline) + 1
NUM_WORDS_HEADLINE = vocab_headline_length

encoded_docs_headline = tokenizer_headline.texts_to_sequences(headline_texts)
padded_docs_headline = pad_sequences(encoded_docs_headline, maxlen=16, padding='post')

print(vocab_headline_length)

In [None]:
# Fit a separate tokenizer on the article bodies, then turn every body into a
# fixed-length (48) sequence of word ids, padded at the end.
body_texts = data.loc[:, 'articleBody']

tokenizer_body = Tokenizer(num_words=max_features, split=' ')
tokenizer_body.fit_on_texts(body_texts.values)

word_index_body = tokenizer_body.word_index
# One extra slot on top of the number of distinct words seen by the tokenizer.
vocab_body_length = len(word_index_body) + 1
NUM_WORDS_BODY = vocab_body_length

encoded_docs_body = tokenizer_body.texts_to_sequences(body_texts)
padded_docs_body = pad_sequences(encoded_docs_body, maxlen=48, padding='post')

print(vocab_body_length)
print(NUM_WORDS_BODY)

In [None]:

# One row per headline word id; words missing from GloVe keep an all-zero row.
embedding_matrix_headline = np.zeros((NUM_WORDS_HEADLINE, EMBEDDING_DIM))

for token, row in tokenizer_headline.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_headline[row] = glove_vector

# Sanity check: width of the matrix (should equal EMBEDDING_DIM).
dims = embedding_matrix_headline.shape[1]
print(dims)

In [None]:

# One row per body word id; words missing from GloVe keep an all-zero row.
embedding_matrix_body = np.zeros((NUM_WORDS_BODY, EMBEDDING_DIM))

for token, row in tokenizer_body.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_body[row] = glove_vector

# Sanity check: width of the matrix (should equal EMBEDDING_DIM).
dims = embedding_matrix_body.shape[1]
print(dims)

In [None]:
# Show the (rows, sequence length) of the padded headline and body arrays.
print(padded_docs_headline.shape, padded_docs_body.shape, sep='\n')

## Model Architecture

In [None]:
# Two-input stance classifier: headline and body word ids are embedded
# separately (GloVe-initialised, fine-tuned during training), joined into one
# sequence, and read by a single LSTM.

# Sequence lengths are taken from the padded arrays so the model always matches
# the padding applied earlier (16 headline tokens, 48 body tokens).
headline_len = padded_docs_headline.shape[1]
body_len = padded_docs_body.shape[1]

# `shape` is given as a tuple rather than a bare int.
input_headline = Input(shape=(headline_len,), name='input_headline')
embedding_layer_headline = Embedding(input_dim=vocab_headline_length, output_dim=EMBEDDING_DIM,
                                     weights=[embedding_matrix_headline],
                                     input_length=headline_len, trainable=True)(input_headline)

input_body = Input(shape=(body_len,), name='input_body')
embedding_layer_body = Embedding(input_dim=vocab_body_length, output_dim=EMBEDDING_DIM,
                                 weights=[embedding_matrix_body],
                                 input_length=body_len, trainable=True)(input_body)

# Concatenate along the time axis (axis=1): headline tokens followed by body
# tokens. The former standalone `lstm_body` layer was removed because its
# output was never connected to the model.
addition_layer = concatenate([embedding_layer_headline, embedding_layer_body], axis=1)

lstm = LSTM(units=64)(addition_layer)
drop = Dropout(0.25)(lstm)

# The four stances are mutually exclusive, so the output must be a probability
# distribution: softmax (not per-class sigmoid) is the activation that pairs
# with categorical_crossentropy.
output = Dense(4, activation='softmax')(drop)

model = Model(inputs=[input_headline, input_body], outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Render the layer graph to model_glove_lstm.png, with tensor shapes and layer names shown.
plot_model(model, to_file='model_glove_lstm.png', show_shapes=True, show_layer_names=True)

In [None]:
# Sequential 90/10 split: the first 90% of rows are used for training and the
# last 10% are held out. Headlines, bodies and labels all come from `data`, so
# one split index applies to all three.
split_at = int(len(padded_docs_headline) * 0.9)

padded_docs_headline_train = padded_docs_headline[:split_at, :]
padded_docs_headline_test = padded_docs_headline[split_at:, :]

padded_docs_body_train = padded_docs_body[:split_at, :]
padded_docs_body_test = padded_docs_body[split_at:, :]

# One-hot encode the integer stance ids.
labels = to_categorical(data.loc[:, 'stance_cat'])

labels_train = labels[:split_at, :]
labels_test = labels[split_at:, :]


## Creating Checkpoints 

For saving the model after each epoch in which the validation accuracy improves (only the best model so far is written)

In [None]:
# MODELS_DIR = os.path.join("/home/abhinav/fake_news_challenge/model/glove_lstm")
# Checkpoint files go to the working directory; the file name records the
# epoch number and the validation accuracy reached.
filepath = os.path.join(os.getcwd(), "{epoch:02d}-{val_accuracy:.2f}.hdf5")

# Write a checkpoint only when val_accuracy beats the best value seen so far.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Model Training

In [None]:
# Train for 40 epochs on the first 90% of rows, validating on the held-out 10%
# after every epoch; the checkpoint callback saves improving models.
train_inputs = [padded_docs_headline_train, padded_docs_body_train]
validation_inputs = [padded_docs_headline_test, padded_docs_body_test]

model_history = model.fit(
    train_inputs,
    labels_train,
    epochs=40,
    shuffle=True,
    verbose=1,
    validation_data=(validation_inputs, labels_test),
    callbacks=[checkpoint],
)

## Model Training History

In [None]:
# Plot training vs. validation loss (top) and accuracy (bottom) per epoch.
history = model_history.history
# Epoch numbers are derived from the recorded history, so the axis stays
# correct if the number of epochs changes (it was hard-coded to 40 before,
# and the curves were drawn against 0-based indices).
epochs_run = np.arange(1, len(history['loss']) + 1)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

ax1.plot(epochs_run, history['loss'], color='b', label="Training loss")
ax1.plot(epochs_run, history['val_loss'], color='r', label="validation loss")
ax1.set_xticks(epochs_run)
# y ticks are left to matplotlib: the previous fixed 0..0.9 ticks left the
# axis unlabelled whenever the loss exceeded 1.
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.set_title("Loss")
# Each axes needs its own legend; plt.legend() only decorated the last one.
ax1.legend(loc='best', shadow=True)

ax2.plot(epochs_run, history['accuracy'], color='b', label="Training accuracy")
ax2.plot(epochs_run, history['val_accuracy'], color='r',label="Validation accuracy")
ax2.set_xticks(epochs_run)
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Accuracy")
ax2.set_title("Accuracy")

legend = ax2.legend(loc='best', shadow=True)
plt.tight_layout()
plt.show()