## Download dataset

In [16]:
import os
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from collections import OrderedDict
from keras.preprocessing.text import Tokenizer
from keras import backend as K
import tensorflow as tf

import keras
from keras.models import Model 
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Concatenate
from keras.layers import LSTM
from keras.layers import Add
from keras.layers import Average
from keras.layers import Reshape
from keras.layers import Flatten

In [3]:
import os
import requests
import zipfile

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                   params={'id': toy_data_url_id},
                                   stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

# Uncomment if you neewd to dowload the dataset
#download_data('dataset')

# Preprocessing data


In [4]:
#create a dataframe for the training data 
#train_df = pd.read_csv('./dataset/train_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})
#train_df

train_df = pd.read_csv('./dataset/train_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})
train_df["Evidence"] = train_df["Evidence"].str.split(pat = "\t")
train_df["evidenceID"]=train_df["Evidence"].str[0]
train_df["Evidence"]=train_df["Evidence"].str[1]
train_df["Label"] = train_df["Label"].replace({'SUPPORTS': 1, 'REFUTES': 0})

train_df

#create a dataframe for the validation data 
val_df = pd.read_csv('./dataset/val_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})
val_df

#create a dataframe for the test data 
test_df = pd.read_csv('./dataset/test_pairs.csv', skipinitialspace=True, usecols=["Claim","Evidence","ID","Label"]).rename(columns={"ID": "claimID"})


Unnamed: 0,Claim,Evidence,claimID,Label
0,Anxiety has been linked with physical symptoms.,"13\tFurthermore , anxiety has been linked with...",16387,SUPPORTS
1,Firefox is an application.,0\tMozilla Firefox -LRB- or simply Firefox -RR...,6,SUPPORTS
2,Keegan-Michael Key played President Barack Oba...,"6\tIn 2015 , Key appeared at the White House C...",16392,SUPPORTS
3,Google Search can find stock quotes.,"13\tThese include synonyms , weather forecasts...",16394,SUPPORTS
4,A Good Day to Die Hard was directed solely by ...,1\tThe film was directed by John Moore and wri...,98315,REFUTES
...,...,...,...,...
7184,Scandal is an American band.,0\tScandal is an American rock band from the 1...,16378,SUPPORTS
7185,Henry Cavill played Superman.,8\tCavill gained further prominence and intern...,143046,SUPPORTS
7186,The Africa Cup of Nations is a friendly global...,"0\tThe Africa Cup of Nations , officially CAN ...",16382,REFUTES
7187,Ron Dennis is the owner of a catering company ...,"0\tAbsolute Taste , is a London-based catering...",147455,SUPPORTS


In [None]:
y_train = np.array(train_df['Label'])
y_val = np.array(val_df['Label'])
y_test = np.array(test_df['Label'])

### Tokenize and pad data

In [5]:
# The tokenizer will have an index 1 for OOV words. A lot of words in test and val will be 1.
tokenizer = Tokenizer(oov_token=1)

tokenizer.fit_on_texts(train_df["Claim"])
tokenizer.fit_on_texts(train_df["Evidence"])

In [6]:
MAX_SEQ_LEN = np.max([len(text.split()) for text in train_df["Evidence"]])

In [7]:
def textToTensor(tokenizer, max_len, text):
    seq = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences=seq, maxlen=max_len)
    return padded

In [8]:
claim_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["Claim"])
evidence_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["Evidence"])

claim_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["Claim"])
evidence_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["Evidence"])

claim_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["Claim"])
evidence_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["Evidence"])

In [9]:
claim_train.shape

(121740, 237)

In [10]:
VOCABULARY_LENGTH = len(tokenizer.word_index) + 1
VOCABULARY_LENGTH

35800

In [22]:
y_train = np.array([1 if x=='SUPPORTS' else 0 for x in train_df["Label"]])
y_test = np.array([1 if x=='SUPPORTS' else 0 for x in test_df["Label"]])

In [12]:
type(y_train)

numpy.ndarray

In [17]:
def create_model(max_tokens, vocab_length, embedding_dimension, sentence_embedding_type=1):    
    claims_input = Input(shape=(max_tokens, ))
    evidence_input = Input(shape=(max_tokens, ))
    
    claims_embedding = Embedding(vocab_length, embedding_dimension, name='WordEmbedding_claims')(claims_input)
    evidence_embedding = Embedding(vocab_length, embedding_dimension, name='WordEmbedding_evidence')(evidence_input)
    if sentence_embedding_type==1:
        # Encode token sequences via a RNN and take the last state as the sentence embedding.
        claims_sentence_embedding = LSTM(embedding_dimension, return_sequences=False, name='SentenceEmbedding_claims')(claims_embedding)
        evidence_sentence_embedding = LSTM(embedding_dimension, return_sequences=False, name='SentenceEmbedding_evidence')(evidence_embedding)
    elif sentence_embedding_type==2:
        # Encode token sequences via a RNN and average all the output states.

        # Put return_sequences True to get output from all hidden states
        claims_sentence_embedding = LSTM(embedding_dimension, return_sequences=True, name='SentenceEmbedding_claims')(claims_embedding)
        evidence_sentence_embedding = LSTM(embedding_dimension, return_sequences=True, name='SentenceEmbedding_evidence')(evidence_embedding)
        
        # Take average of outputs
        claims_sentence_embedding = K.mean(claims_sentence_embedding, axis=1)
        evidence_sentence_embedding = K.mean(evidence_sentence_embedding, axis=1)

    elif sentence_embedding_type==3:
        # Encode token sequences via a simple MLP layer.
        num = embedding_dimension*max_tokens
        # Reshape 3d vector to 2d
        claims_sentence_embedding = Reshape((num,), input_shape=(max_tokens, embedding_dimension))(claims_embedding)
        evidence_sentence_embedding = Reshape((num,), input_shape=(max_tokens, embedding_dimension))(evidence_embedding)

        # Send into dense layer
        claims_sentence_embedding = Dense(num, name='SentenceEmbedding_claims')(claims_sentence_embedding)
        evidence_sentence_embedding = Dense(num, name='SentenceEmbedding_evidence')(evidence_sentence_embedding)
   
    elif sentence_embedding_type==4:
        # Compute the sentence embedding as the mean of its token embeddings (bag of vectors).
        claims_sentence_embedding = K.mean(claims_embedding, axis=1)
        evidence_sentence_embedding = K.mean(evidence_embedding, axis=1)
    else:
        raise Exception("Sentence embedding type must be an integer between 1 and 4")
    #Concatenation
    merged = Concatenate(axis=1)([claims_sentence_embedding, evidence_sentence_embedding])
    
    #Sum
    #merged = Add([claims_sentence_embedding, evidence_sentence_embedding])
    
    #Mean
    #merged = Average(axis=1)([claims_sentence_embedding, evidence_sentence_embedding])

    out = (Dense(1, activation='sigmoid'))(merged)

    model = Model(inputs=[claims_input, evidence_input], outputs=[out])
    
    return model

In [18]:
model = create_model(MAX_SEQ_LEN, VOCABULARY_LENGTH, 50)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 237)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 237)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 237, 50)      1790000     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 237, 50)      1790000     ['input_4[0][0]']                
                                                                                            

#### Defining recall, precision, f1

In [19]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))



In [20]:
model.compile(loss='mse',
              optimizer='Adam',
              metrics=['acc',f1_m, precision_m, recall_m])
history = model.fit(x=[claim_train, evidence_train], y=y_train, batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Prediction

In [None]:
def predict(model: keras.Model, x, predicting_info):
    predictions = model.predict(x, **predicting_info)
    return predictions

## Evaluation

In [23]:
#Not sure if we actually need to predict to evaluate the model. 
#Can send in the matrics into compile, and evaluate will do the rest. (recall_m etc.)
loss, accuracy, f1_score, precision, recall = model.evaluate([claim_test, evidence_test], y_test, verbose=0)
print(loss)
print(accuracy)

print(f1_score)

print(precision)

print(recall)



0.23449048399925232
0.7180414795875549
0.7391195297241211
0.6799384951591492
0.8249300718307495


### Multi-input classification evaluation

#### Defining plotting functions

In [None]:
def plot_accuracy(model_callback):
    plt.plot(model_callback.history['acc'])
    plt.plot(model_callback.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc="lower right")
    plt.show()
    
def plot_loss(model_callback):
    plt.plot(model_callback.history['loss'])
    plt.plot(model_callback.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc="lower right")
    plt.show()