# Problem Statement

We are given a dataset consisting of two CSV files: `train_bodies.csv`, which contains the news article bodies, and `train_stances.csv`, which contains the headlines and their stance labels, each linked to an article body through its Body ID.

After training on these samples, we need to predict whether a given headline agrees with, disagrees with, discusses, or is unrelated to the article body identified by its Body ID.

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import os
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical,plot_model

from keras.models import Input,Model,Sequential
from keras.layers import LSTM,Embedding,Dropout,Activation,Reshape,Dense,GRU,Add,Flatten,concatenate

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences


# Dataset Preparation

In [None]:
# Folder that holds the Fake News Challenge CSV files.
DATASET_PATH = "../input/fake-news-challenge/"

# Article bodies; the merge cell reads its 'Body ID' and 'articleBody' columns.
bodies_csv = os.path.join(DATASET_PATH, 'train_bodies.csv')
train_bodies = pd.read_csv(bodies_csv)

# Headline/stance rows; each row points at a body through 'Body ID'.
stances_csv = os.path.join(DATASET_PATH, 'train_stances.csv')
train_stance = pd.read_csv(stances_csv)

In [None]:
# Combine train_bodies.csv and train_stances.csv: attach the article text to
# every stance row by looking up its 'Body ID', then save the result as
# data_combined.csv.
from tqdm.notebook import tqdm  # import kept so the name stays available to other cells

# 'Body ID' -> 'articleBody' lookup table. keep='last' means that if a
# Body ID were to appear more than once, the last occurrence supplies the text.
body_lookup = (
    train_bodies
    .drop_duplicates(subset='Body ID', keep='last')
    .set_index('Body ID')['articleBody']
)

# One vectorised lookup for the whole column; stance rows whose Body ID has
# no matching body receive NaN.
train_stance['articleBody'] = train_stance['Body ID'].map(body_lookup)

train_stance.to_csv('data_combined.csv',index=False)

In [None]:
# Reload the combined headline/body table from disk; data_combined.csv is the
# file written by train_stance.to_csv in the combine cell of this notebook.
data = pd.read_csv('data_combined.csv')#generated from Fake News stanford.ipynb

In [None]:
# Preview the first rows of the combined table (shown as the cell output).
data.head()

In [None]:
# Integer id for each stance label. A label missing from this table would map
# to NaN and make the astype(int) cast below raise.
stance_to_id = {'agree':0,'disagree':1,'discuss':2,'unrelated':3}
stance_ids = data['Stance'].map(stance_to_id)
data['stance_cat'] = stance_ids.astype(int)

# Number of rows per stance label (shown as the cell output).
data['Stance'].value_counts()

In [None]:
# Stack every headline followed by every article body into one array of
# documents: the first len(data) entries are headlines, the rest are bodies.
corpus = np.r_[data['Headline'].values,data['articleBody'].values]
print(len(data)*2)  # expected corpus size, derived from the data rather than hard-coded
print(len(corpus))

# Distinct tokens obtained by splitting each document on single spaces.
# Collecting straight into a set avoids building a list with one entry per
# token occurrence before de-duplicating.
vocabulary = list({token for sentence in corpus for token in sentence.split(' ')})
vocab_length = len(vocabulary)
print("Vocabulary Length is {0}".format(vocab_length))


In [None]:
# Hyper-parameters for the encoding and model cells.
max_features = 5000  # not referenced by any other cell visible in this notebook
max_nb_words = 24000  # not referenced by any other cell visible in this notebook
EMBEDDING_DIM = 50  # embedding width; the Embedding layers in the model cell use 50
MAX_SEQUENCE_LENGTH_HEADLINE = 64  # length pad_sequences pads/truncates headlines to
MAX_SEQUENCE_LENGTH_BODY = 64  # length pad_sequences pads/truncates article bodies to

# BASELINE - ONE HOT ENCODING

In [None]:

# Headlines: hash every word to an integer id with one_hot (id space sized by
# vocab_length), then pad at the end to the fixed headline length.
headline_texts = data['Headline'].tolist()
encoded_docs_headline = [one_hot(text, vocab_length) for text in headline_texts]
padded_docs_headline = pad_sequences(
    encoded_docs_headline,
    maxlen=MAX_SEQUENCE_LENGTH_HEADLINE,
    padding='post',
)

# Article bodies: same encoding, padded to the fixed body length.
body_texts = data['articleBody'].tolist()
encoded_docs_body = [one_hot(text, vocab_length) for text in body_texts]
padded_docs_body = pad_sequences(
    encoded_docs_body,
    maxlen=MAX_SEQUENCE_LENGTH_BODY,
    padding='post',
)

# Stance ids turned into one-hot target rows for the classifier.
labels = to_categorical(data['stance_cat'])


In [None]:
def _split_80_20(rows):
    """Return (first 80% of rows, remaining rows) of a 2-D array, order kept."""
    cut = int(len(rows)*0.8)
    return rows[:cut,:], rows[cut:,:]


# Positional split with no shuffling: the leading 80% of rows train the model,
# the trailing rows are held out.
padded_docs_headline_train, padded_docs_headline_test = _split_80_20(padded_docs_headline)
padded_docs_body_train, padded_docs_body_test = _split_80_20(padded_docs_body)
labels_train, labels_test = _split_80_20(labels)


# Defining Model Architecture

In [None]:
# Two-input stance classifier: the headline and the body are each embedded,
# the two embeddings are joined, an LSTM reads the joined sequence, and a
# dense layer scores the four stance classes.
input_headline = Input(shape=[MAX_SEQUENCE_LENGTH_HEADLINE],name='input_headline')
embedding_headline = Embedding(vocab_length,EMBEDDING_DIM,input_length = MAX_SEQUENCE_LENGTH_HEADLINE)(input_headline)

input_body = Input(shape=[MAX_SEQUENCE_LENGTH_BODY],name='input_body')
embedding_body = Embedding(vocab_length,EMBEDDING_DIM,input_length = MAX_SEQUENCE_LENGTH_BODY)(input_body)

# concatenate joins on the last axis by default, so every timestep carries the
# headline and body embeddings side by side. Despite the variable name this is
# a concatenation, not an addition; it needs both sequence lengths to be equal.
addition_layer = concatenate([embedding_headline,embedding_body])
lstm = LSTM(units=64)(addition_layer)

# The four stances are mutually exclusive and the loss is
# categorical_crossentropy, so the output has to be a probability distribution
# over the classes: softmax rather than four independent sigmoids.
output = Dense(4,activation='softmax')(lstm)

model_combined = Model(inputs=[input_headline,input_body],outputs=output)

model_combined.compile(optimizer = 'adam',loss ='categorical_crossentropy',metrics = ['accuracy'])


# Model Architecture

In [None]:
# Print the layer-by-layer summary of the compiled model.
model_combined.summary()

In [None]:
# Draw the model graph into model_one_hot.png, including tensor shapes and
# layer names.
plot_model(
    model_combined,
    to_file='model_one_hot.png',
    show_shapes=True,
    show_layer_names=True,
)


# Model Training

In [None]:
# Inputs are passed in the same order as the model's inputs: headline, body.
train_inputs = [padded_docs_headline_train, padded_docs_body_train]
held_out = ([padded_docs_headline_test, padded_docs_body_test], labels_test)

# Train for 15 epochs, reporting loss/accuracy on the held-out rows each epoch.
model_combined.fit(
    train_inputs,
    labels_train,
    epochs=15,
    verbose=1,
    validation_data=held_out,
)

**Please upvote the notebook if you find it useful**