## Download dataset

In [14]:
import os
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from collections import OrderedDict
from keras.preprocessing.text import Tokenizer

import keras
from keras.models import Model 
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Concatenate
from keras.layers import LSTM

In [2]:
import os
import requests
import zipfile

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                   params={'id': toy_data_url_id},
                                   stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

# Uncomment if you neewd to dowload the dataset
#download_data('dataset')

# Preprocessing data


In [3]:
#create a dataframe for the training data 
train_df = pd.read_csv('./dataset/train_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})
train_df

#create a dataframe for the validation data 
val_df = pd.read_csv('./dataset/val_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})
val_df

#create a dataframe for the test data 
test_df = pd.read_csv('./dataset/test_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})
test_df


Unnamed: 0,Claim,Evidence,claimID,Label
0,Anxiety has been linked with physical symptoms.,"13\tFurthermore , anxiety has been linked with...",16387,SUPPORTS
1,Firefox is an application.,0\tMozilla Firefox -LRB- or simply Firefox -RR...,6,SUPPORTS
2,Keegan-Michael Key played President Barack Oba...,"6\tIn 2015 , Key appeared at the White House C...",16392,SUPPORTS
3,Google Search can find stock quotes.,"13\tThese include synonyms , weather forecasts...",16394,SUPPORTS
4,A Good Day to Die Hard was directed solely by ...,1\tThe film was directed by John Moore and wri...,98315,REFUTES
...,...,...,...,...
7184,Scandal is an American band.,0\tScandal is an American rock band from the 1...,16378,SUPPORTS
7185,Henry Cavill played Superman.,8\tCavill gained further prominence and intern...,143046,SUPPORTS
7186,The Africa Cup of Nations is a friendly global...,"0\tThe Africa Cup of Nations , officially CAN ...",16382,REFUTES
7187,Ron Dennis is the owner of a catering company ...,"0\tAbsolute Taste , is a London-based catering...",147455,SUPPORTS


### Tokenize and pad data

In [4]:
# The tokenizer will have an index 1 for OOV words. A lot of words in test and val will be 1.
tokenizer = Tokenizer(oov_token=1)

tokenizer.fit_on_texts(train_df["Claim"])
tokenizer.fit_on_texts(train_df["Evidence"])

In [5]:
MAX_SEQ_LEN = np.max([len(text.split()) for text in train_df["Evidence"]])

In [6]:
def textToTensor(tokenizer, max_len, text):
    seq = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences=seq, maxlen=max_len)
    return padded

In [7]:
claim_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["Claim"])
evidence_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["Evidence"])

claim_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["Claim"])
evidence_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["Evidence"])

claim_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["Claim"])
evidence_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["Evidence"])

In [8]:
claim_train.shape

(121740, 237)

In [11]:
VOCABULARY_LENGTH = len(tokenizer.word_index) + 1
VOCABULARY_LENGTH

35800

In [34]:
y_train = np.array([1 if x=='SUPPORTS' else 0 for x in train_df["Label"]])

In [35]:
type(y_train)

numpy.ndarray

In [36]:
def create_model(batch_size, max_tokens, vocab_length, embedding_dimension):    
    claims_input = Input(shape=(max_tokens, ), batch_size= batch_size)
    evidence_input = Input(shape=(max_tokens, ), batch_size= batch_size)
    
    claims_embedding = Embedding(vocab_length, embedding_dimension)(claims_input)
    evidence_embedding = Embedding(vocab_length, embedding_dimension)(evidence_input)
    
    claims_sentence_embedding = LSTM(embedding_dimension)(claims_embedding)
    evidence_sentence_embedding = LSTM(embedding_dimension)(evidence_embedding)
    
    merged = Concatenate(axis=1)([claims_sentence_embedding, evidence_sentence_embedding])

    out = (Dense(1, activation='sigmoid'))(merged)
    
    model = Model(inputs=[claims_input, evidence_input], outputs=[out])
    
    return model

In [37]:
model = create_model(100, MAX_SEQ_LEN, VOCABULARY_LENGTH, 50)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(100, 237)]         0           []                               
                                                                                                  
 input_8 (InputLayer)           [(100, 237)]         0           []                               
                                                                                                  
 embedding_6 (Embedding)        (100, 237, 50)       1790000     ['input_7[0][0]']                
                                                                                                  
 embedding_7 (Embedding)        (100, 237, 50)       1790000     ['input_8[0][0]']                
                                                                                            

In [None]:
model.compile(loss='mse',
              optimizer='Adam',
              metrics=['acc'])
model.fit(x=[claim_train, evidence_train], y=y_train, batch_size=100, epochs=10)

Epoch 1/10
  32/1218 [..............................] - ETA: 5:59 - loss: 0.1281 - acc: 0.8353