## Download dataset

In [2]:
import os
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from collections import OrderedDict
from keras.preprocessing.text import Tokenizer

In [3]:
import os
import requests
import zipfile

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    Args:
        response: object exposing ``iter_content(chunk_size)`` that yields
            byte chunks (e.g. a streamed ``requests`` response).
        destination: path of the file to write; it is opened in binary
            write mode, so existing content is overwritten.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alive markers) carry no data and are skipped.
        out_file.writelines(
            piece for piece in response.iter_content(chunk_size) if piece
        )

def download_data(data_path):
    """Download the FEVER data splits archive into ``data_path`` and extract it.

    The archive is stored as ``<data_path>/fever_data.zip``. If that file
    already exists the function returns without downloading or extracting
    anything.

    Args:
        data_path: directory that receives the archive and its extracted
            content; it is created when missing.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if os.path.exists(toy_data_path):
        return

    print("Downloading FEVER data splits...")
    # Download under a temporary name so an interrupted transfer never leaves
    # a truncated 'fever_data.zip' that later calls would mistake for a
    # complete archive.
    partial_path = toy_data_path + '.part'
    with requests.Session() as current_session:
        response = current_session.get(toy_url,
                                       params={'id': toy_data_url_id},
                                       stream=True)
        # Fail early instead of saving an HTTP error page as the archive.
        response.raise_for_status()
        # The body is streamed, so it has to be read while the session is
        # still open (previously it was read after the session was closed).
        save_response_content(response, partial_path)
    os.replace(partial_path, toy_data_path)
    print("Download completed!")

    print("Extracting dataset...")
    with zipfile.ZipFile(toy_data_path) as loaded_zip:
        loaded_zip.extractall(data_path)
    print("Extraction completed!")

# Uncomment if you need to download the dataset
#download_data('dataset')

# Preprocessing data


In [4]:
def _load_pairs(csv_path):
    """Read one claim/evidence split from ``csv_path`` into a DataFrame.

    Only the ``Claim``, ``Evidence``, ``ID`` and ``Label`` columns are kept,
    and ``ID`` is renamed to ``claimID``.
    """
    return pd.read_csv(
        csv_path,
        skipinitialspace=True,
        usecols=["Claim", "Evidence", "ID", "Label"],
    ).rename(columns={"ID": "claimID"})


# One dataframe per split: training, validation and test data.
train_df = _load_pairs('./dataset/train_pairs.csv')
val_df = _load_pairs('./dataset/val_pairs.csv')
test_df = _load_pairs('./dataset/test_pairs.csv')

# Only the last expression of a cell is rendered, so just the test split is shown.
test_df


Unnamed: 0,Claim,Evidence,claimID,Label
0,Anxiety has been linked with physical symptoms.,"13\tFurthermore , anxiety has been linked with...",16387,SUPPORTS
1,Firefox is an application.,0\tMozilla Firefox -LRB- or simply Firefox -RR...,6,SUPPORTS
2,Keegan-Michael Key played President Barack Oba...,"6\tIn 2015 , Key appeared at the White House C...",16392,SUPPORTS
3,Google Search can find stock quotes.,"13\tThese include synonyms , weather forecasts...",16394,SUPPORTS
4,A Good Day to Die Hard was directed solely by ...,1\tThe film was directed by John Moore and wri...,98315,REFUTES
...,...,...,...,...
7184,Scandal is an American band.,0\tScandal is an American rock band from the 1...,16378,SUPPORTS
7185,Henry Cavill played Superman.,8\tCavill gained further prominence and intern...,143046,SUPPORTS
7186,The Africa Cup of Nations is a friendly global...,"0\tThe Africa Cup of Nations , officially CAN ...",16382,REFUTES
7187,Ron Dennis is the owner of a catering company ...,"0\tAbsolute Taste , is a London-based catering...",147455,SUPPORTS


# Tokenize data

In [6]:
# The vocabulary is fitted on the training split only, so the tokenizer will
# use index 1 for OOV words. A lot of words in test and val will be 1.
# NOTE(review): the OOV token is passed as the integer 1 rather than a string
# placeholder - confirm this is intentional.
tokenizer = Tokenizer(oov_token=1)

for text_column in ("Claim", "Evidence"):
    tokenizer.fit_on_texts(train_df[text_column])

In [9]:
def _to_index_sequences(frame):
    """Return the (claim, evidence) index sequences for one split's dataframe."""
    return tuple(
        tokenizer.texts_to_sequences(frame[text_column])
        for text_column in ("Claim", "Evidence")
    )


# Convert every split with the tokenizer fitted on the training data.
claim_train, evidence_train = _to_index_sequences(train_df)
claim_val, evidence_val = _to_index_sequences(val_df)
claim_test, evidence_test = _to_index_sequences(test_df)

### Padding

In [10]:
def pad_idx_seqs(idx_seqs, max_seq_len):
    """Bring every index sequence in ``idx_seqs`` to length ``max_seq_len``.

    Thin wrapper around Keras' ``pad_sequences``; all options other than
    ``maxlen`` are left at the Keras defaults.

    Args:
        idx_seqs: sequences of token indices.
        max_seq_len: target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Whatever ``pad_sequences`` returns for these arguments.
    """
    return pad_sequences(idx_seqs, maxlen=max_seq_len)

In [11]:
def find_max_length(seqs):
    """Return the length of the longest sequence in ``seqs``.

    Returns 0 when ``seqs`` is empty.
    """
    return max((len(seq) for seq in seqs), default=0)

In [25]:
# Length of the longest tokenized claim across the train, validation and test splits.
# NOTE(review): only claim sequences are measured here, yet the same limit is
# applied to the evidence sequences when padding - confirm that limiting
# evidence to the longest claim length is intended.
MAX_SEQ_LEN = max(
    find_max_length(claim_split)
    for claim_split in (claim_train, claim_val, claim_test)
)

In [26]:
# Pad the claim and evidence sequences of every split to the shared MAX_SEQ_LEN.
claim_padded_train, evidence_padded_train = (
    pad_idx_seqs(split_seqs, MAX_SEQ_LEN)
    for split_seqs in (claim_train, evidence_train)
)

claim_padded_val, evidence_padded_val = (
    pad_idx_seqs(split_seqs, MAX_SEQ_LEN)
    for split_seqs in (claim_val, evidence_val)
)

claim_padded_test, evidence_padded_test = (
    pad_idx_seqs(split_seqs, MAX_SEQ_LEN)
    for split_seqs in (claim_test, evidence_test)
)

In [27]:
# Sanity check: display the shape of the padded validation claims.
claim_padded_val.shape

(7165, 65)