## Download dataset

In [15]:
import os
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from collections import OrderedDict
from keras.preprocessing.text import Tokenizer

In [16]:
import os
import requests
import zipfile

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    Args:
        response: Object exposing ``iter_content(chunk_size)`` that yields
            byte chunks.
        destination: Path of the file to (over)write in binary mode.
    """
    CHUNK_SIZE = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops the empty keep-alive chunks before writing.
        out_file.writelines(filter(None, response.iter_content(CHUNK_SIZE)))

def download_data(data_path):
    """Download and unpack the FEVER data splits into ``data_path``.

    The archive is stored as ``fever_data.zip`` inside ``data_path``. When
    that file already exists, nothing is downloaded or extracted.

    Args:
        data_path: Directory receiving the archive and its extracted
            contents; it is created if missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if os.path.exists(toy_data_path):
        return

    print("Downloading FEVER data splits...")
    with requests.Session() as current_session:
        response = current_session.get(toy_url,
                                       params={'id': toy_data_url_id},
                                       stream=True)
        # Fail loudly instead of saving an HTTP error page as the archive.
        response.raise_for_status()
        try:
            # The body is streamed, so it must be consumed while the
            # session (and its connection) is still open.
            save_response_content(response, toy_data_path)
        except BaseException:
            # A truncated archive would make every later call skip the
            # download, so remove it before propagating the error.
            if os.path.exists(toy_data_path):
                os.remove(toy_data_path)
            raise
    print("Download completed!")

    print("Extracting dataset...")
    with zipfile.ZipFile(toy_data_path) as loaded_zip:
        loaded_zip.extractall(data_path)
    print("Extraction completed!")

# Uncomment if you need to download the dataset
#download_data('dataset')

# Tokenize data

In [19]:
# TODO: remove this placeholder DataFrame
# Single-row toy frame; the "claim" and "evidence" cells each hold a token list.
df = pd.DataFrame({
    "claim_id": [1],
    "claim": [["hello", "hi", "super"]],
    "evidence": [["tagg", "same"]],
})

In [20]:
# Build one shared vocabulary: fit on the claims first, then on the evidence.
tokenizer = Tokenizer()

for text_column in ("claim", "evidence"):
    tokenizer.fit_on_texts(df[text_column])


In [21]:
# Display the word -> integer index mapping held by the fitted tokenizer.
tokenizer.word_index

{'hello': 1, 'hi': 2, 'super': 3, 'tagg': 4, 'same': 5}

In [9]:
# Map the tokens of each column to their integer indices from the tokenizer.
claim_train, evidence_train = (
    tokenizer.texts_to_sequences(df[column]) for column in ("claim", "evidence")
)