# **Fact checking, Natural Language Inference (NLI)**

**Authors**: Giacomo Berselli, Marco Cucè, Riccardo De Matteo

In [None]:
# to print all output for a cell instead of only last one 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### 1. Libraries and Imports 

In [None]:
import os
import requests
import zipfile
import random
import string 

import torch

import numpy as np
import pandas as pd

import gensim
import gensim.downloader as gloader

import time 

from collections import OrderedDict, namedtuple

# Fix data seed to achieve reproducible results
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
print("Current work directory: {}".format(os.getcwd())) #print the current working directory 

data_folder = os.path.join(os.getcwd(),"data") # directory containing the notebook

if not os.path.exists(data_folder):   #create folder where all data will be stored 
    os.makedirs(data_folder)

### 2. Data handling

First thing first, we download the raw dataset, unzip it and store the csv document of each split in the dataset folder.

In [None]:
raw_dataset_path = os.path.join(data_folder,'raw_dataset')   #path of the raw dataset as downloaded 

def save_response_content(response, destination):
    """
        Stream the body of `response` into the file at `destination`.

        The body is read through `response.iter_content` in chunks of 32768 bytes
        and written in binary mode; empty chunks are skipped.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # empty chunks are keep-alive markers and carry no payload
        payload = (part for part in response.iter_content(chunk_size) if part)
        for part in payload:
            out_file.write(part)

def download_data(data_folder):
    """
        Download the FEVER splits archive and extract it into `raw_dataset_path`.

        Nothing happens when the zip archive is already present in `raw_dataset_path`.
        `data_folder` is not used by the body (all paths derive from the module-level
        `raw_dataset_path`); the parameter is kept so existing callers keep working.

        Raises:
            requests.HTTPError: if the server answers with an error status.
    """
    zip_dataset_path = os.path.join(raw_dataset_path,'fever_data.zip')
    partial_zip_path = zip_dataset_path + '.part'
    data_url_id ="1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    url ="https://docs.google.com/uc?export=download"

    os.makedirs(raw_dataset_path, exist_ok=True)

    if os.path.exists(zip_dataset_path):
        return

    print("Downloading FEVER data splits...")
    with requests.Session() as current_session:
        response = current_session.get(url, params={'id': data_url_id}, stream=True)
        # fail loudly instead of saving an HTML error page as if it were the archive
        response.raise_for_status()
        # the body is streamed, so it has to be consumed while the session is still open
        save_response_content(response, partial_zip_path)

    # only a completed download gets the final name, so an interrupted run is retried next time
    os.replace(partial_zip_path, zip_dataset_path)
    print("Download completed!")

    print("Extracting dataset...")
    with zipfile.ZipFile(zip_dataset_path) as loaded_zip:
        loaded_zip.extractall(raw_dataset_path)
    print("Extraction completed!")

download_data(data_folder)

Now that we have the csv files of the train, val and test splits, we encode all three as a unique pandas Dataframe to be able to better inspect it and manipulate it as a whole.
The Dataframe `df` is structured as follows: 
- `claim`: the fact to verify 
- `evidence`: one of the possibly multiple sentences in the dataset which supports or refutes the `claim`
- `id`: number associated to the fact to verify (different rows can have the same `id`)
- `label`: whether the evidence REFUTES or SUPPORTS the claim
- `split`: the split to which one claim belongs (train, val, test)


In [None]:
#encode the entire dataset in a pandas dataframe and add the split column
def encode_dataset():
    """
        Read the train/val/test csv files found in `raw_dataset_path` and stack them in one DataFrame.

        Each csv is read with its first column as index; a `split` column holding the
        name of the split is added, column names are lowercased and the rows get a
        fresh 0..n-1 index.
    """
    split_frames = []
    for split in ['train','val','test']:
        split_path = os.path.join(raw_dataset_path,f"{split}_pairs.csv")
        split_df = pd.read_csv(split_path,index_col=0)
        split_df['split'] = split
        split_frames.append(split_df)

    # DataFrame.append no longer exists in pandas >= 2.0; a single concat also avoids
    # re-copying the accumulated frame at every iteration
    df = pd.concat(split_frames, ignore_index=True)
    df.columns = df.columns.str.lower()

    return df

df = encode_dataset()

Let's inspect the newly created dataset 

In [None]:
df.head()
print('The splits present in the dataframe are:',df['split'].unique())
print('Unique labels in the dataset:',df['label'].unique())

From the above results we can see that the dataset has been structured correctly.\
Now we print some values to check the dimensions of the different splits and to retrieve useful information.

In [None]:
print('Dataframe shape:', df.shape)
print('Number of example in train:',len(df[df['split']=='train']))
print('Number of example in val:',len(df[df['split']=='val']))
print('Number of example in test:',len(df[df['split']=='test']))

The number of claims in the training split of the dataset is clearly much higher than that of val and test splits.

The dataset should probably undergo some preprocessing before it can be used to train our model. Even if this was already noticeable from the few examples of the dataframe printed above, let's now show one example of an evidence to make the work we will need to do more evident.

In [None]:
print(list(df.sample(1)['evidence']))

### 3. Text preprocessing 

TODO: find a way to show here what should be cleaned from the dataset.

Both claims and evidence contain a lot of unwanted text: punctuation, symbols, meta-characters, foreign words, tags, etc. For some reason claims are much cleaner than evidences. Nonetheless we will preprocess both of them, to end up with more manageable and digestible text, especially since the unwanted text does not contribute to the general meaning of each sentence, which is what we are interested in.
Our preprocessing pipeline will:
- drop everything before the first '\t' (every evidence seems to start with a number followed by '\t')
- delete all unnecessary spaces; only one space between each word will be left `TODO: DOES THIS INCLUDE \n \t \s ?` 
- remove all tab and newline characters (there are many '\t' in the dataset)  `TODO: PROBABLY NOT NEEDED`
- remove the rounded parenthesis (-LRB- and -RRB-)
- drop words inside square brackets (everything that falls between -LSB- and -RSB-)
- delete all words that contains non-english/non-numerical characters  (there are some greek letters for instance)
- remove 's `TODO: AND SIMILAR CASES, STILL TO BE DEFINED (OR MAYBE NOT)`
- drop everything after the last dot character (after that there are often some other words similar to tags which may be image descriptions or hyperlinks)
- remove punctuation
- set everything to lowercase
- convert string in list of words 

`TODO: CHANGE THE ORDER and ADD MORE STEPS (the current pipeline is only there to move forward; other options with nltk are possible, such as stopwords, stemming, etc.)`

In [None]:
import re 
# import unidecode
# import unicodedata2
# import nltk
# nltk.download('stopwords')
# from nltk.stem import WordNetLemmatizer, PorterStemmer
# from nltk.corpus import stopwords


# def lemmatize_and_remove_non_ascii(sentence:str):
#     """Remove unnecessary spaces, remove words with non ASCII characters and lemmatize"""
#     sentence = sentence.split() #remove all unnecessary spaces and return a list of words
#     lemmatizer = WordNetLemmatizer()
#     sentence = [lemmatizer.lemmatize(word) for word in sentence if word.isascii()] #if a word has all ASCII characters: lemmatize, else: remove
#     return sentence

# def stemm_and_remove_non_ascii(sentence: str):
#     sentence = sentence.split() #remove all unnecessary spaces and return a list of words
#     ps = PorterStemmer()
#     sentence = [ps.stem(word) for word in sentence if word.isascii()]#if a word has all ASCII characters: stemm, else: remove
#     return sentence

# def remove_accents(text):
#     """Replace accentuated characters by their non-accentuated counterparts"""
#     text = unicodedata2.normalize('NFKD', text)
#     return "".join([c for c in text if not unicodedata2.combining(c)])

def preprocess_pipeline(sentence:str):
    """Clean a raw claim/evidence string and return it lowercased, still as one string"""

    # keep only what follows the first tab; a string without tabs is kept whole
    _, tab, after_tab = sentence.partition('\t')
    text = after_tab if tab else sentence

    # cut at the last period, when there is one
    before_period, period, _ = text.rpartition('.')
    if period:
        text = before_period

    # strip the round-bracket placeholders
    for marker in ('-LRB-', '-RRB-'):
        text = text.replace(marker, '')

    # drop bracketed spans first, then any square-bracket placeholder left unmatched
    text = re.sub("-LSB.*?-RSB-","",text)
    for marker in ('-LSB-', '-RSB-'):
        text = text.replace(marker, '')

    # every punctuation character, as well as the 'ˈ' mark, becomes a space
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space).replace('ˈ',' ')

    return text.lower()

def preprocess_type1(sentence:str):
    """Run the standard preprocessing, then tokenise the result into a list of words"""

    # split() without arguments also collapses every run of whitespace
    return preprocess_pipeline(sentence).split()

# def preprocess_type2(sentence:str):
#     """Apply standard preprocessing, remove accents, remove words 
#     with non ASCII characters, lemmatize and return a list of words"""

#     sentence = preprocess_pipeline(sentence)

#     #replace accentuated characters by their non-accentuated counterparts
#     sentence = remove_accents(sentence)

#     #remove non-ascii words
#     sentence = lemmatize_and_remove_non_ascii(sentence)

#     return sentence

# def preprocess_type3(sentence:str):
#     """Apply standard preprocessing, transliterates UNICODE characters in ASCII, 
#     remove words with non ASCII characters, lemmatize and return a list of words"""

#     sentence = preprocess_pipeline(sentence)

#     #transliterates any UNICODE string into the closest possible representation in ASCII text
#     sentence = unidecode.unidecode(sentence)

#     #remove non-ascii words
#     sentence = lemmatize_and_remove_non_ascii(sentence)

#     return sentence

# def preprocess_type4(sentence: str):
#     """
#         Apply standard preprocessing, removes stop-words and non ascii's,  and lemmatizes.
#     """

#     sentence = preprocess_pipeline(sentence)
#     sentence = unidecode.unidecode(sentence)
#     lemmatized = lemmatize_and_remove_non_ascii(sentence)
#     stop_words = set(stopwords.words('english'))
#     filter_stop_words = [word for word in lemmatized if not word in stop_words]
#     return filter_stop_words

# def preprocess_type5(sentence: str):
#     """
#         Apply standard preprocessing, removes non ascii's and stemmes.
#     """
#     sentence = preprocess_pipeline(sentence)
#     sentence = unidecode.unidecode(sentence)
#     stemmed = stemm_and_remove_non_ascii(sentence)
#     return stemmed

# def preprocess_type6(sentence: str):
#     """
#         Apply standard preprocessing, removes stop-words and non ascii's,  and stemmes.
#     """
#     sentence = preprocess_pipeline(sentence)
#     sentence = unidecode.unidecode(sentence)
#     stemmed = stemm_and_remove_non_ascii(sentence)
#     stop_words = set(stopwords.words('english'))
#     filter_stop_words = [word for word in stemmed if not word in stop_words]
#     return filter_stop_words




To test our preprocessing pipeline we will apply it to an example in the dataset that we have identified to be a pretty tough one in terms of amount of cleanup necessary.

In [None]:
# Take the example stored at row 0 of the dataframe and show it before and after preprocessing.
# NOTE(review): this comment used to say "13th example ... about Greece", but the code reads row 0 — confirm which row was intended.
original_claim = df.loc[0,'claim']
original_evidence = df.loc[0,'evidence']

processed_claim = preprocess_type1(original_claim)
processed_evidence = preprocess_type1(original_evidence)

print('Original claim:',original_claim)
print('Processed claim:',processed_claim,'\n')
print('Original evidence:',original_evidence)
print('Processed evidence:',processed_evidence,'\n')


As we can see, the final results for both the claim and the evidence after the preprocessing are satisfactory. For this reason we are now going to apply the preprocessing function to the entire dataset encoded as a Dataframe.

In [None]:
df['claim'] = df['claim'].apply(preprocess_type1)
df['evidence'] = df['evidence'].apply(preprocess_type1)

df.head(10)

### 4. Vocabulary

Next, we have to build the dictionaries that will be used for the numerical tokenization of the dataset and for the generation of the embedding matrix.

The function `build_vocab` takes in input the list of unique words in the whole dataset and creates:
- `word2int`: dictionary which associates each word with an integer.
- `int2word`: dictionary which associates each integer with the relative word.

These two dictionaries constitute a bijective mapping between words and indexes in the dataset.

In [None]:
Vocab = namedtuple('Vocabulary',['word2int','int2word','unique_words'])

def build_vocab(unique_words : list[str]): 
    """
        Create the word -> index and index -> word mappings and wrap them,
        together with `unique_words` itself, in a Vocabulary tuple
    """
    # indexes start from 1: index 0 is left free for a special token
    numbered_words = list(enumerate(unique_words, start=1))

    word2int = OrderedDict((word, index) for index, word in numbered_words)
    int2word = OrderedDict(numbered_words)

    return Vocab(word2int,int2word,unique_words)

The function `build_vocab` takes as input the list of all the unique words in the dataset, so we now retrieve it from the dataset to be able to build the dictionaries. 

In [None]:
# explode() produces NaN for a sentence that became an empty list after preprocessing;
# NaN is not a word, so it is dropped before collecting the unique values
unique_words_claim = df['claim'].explode().dropna().unique().tolist()  
unique_words_evidence = df['evidence'].explode().dropna().unique().tolist()

print('the number of unique words belonging to claims is:', len(unique_words_claim))
print('the number of unique words belonging to evidences is:', len(unique_words_evidence))

# dict.fromkeys removes duplicates while keeping first-seen order. A plain set would be
# iterated in an order that changes between interpreter sessions, so the word indexes
# would no longer match an embedding matrix saved to disk by a previous run.
unique_words = list(dict.fromkeys(unique_words_evidence + unique_words_claim))
print('the number of unique words in the entire dataset is:', len(unique_words))


In [None]:
vocab = build_vocab(unique_words)

Now that we have the vocabulary which contains the mapping between word and index we can 'numberise' the dataset. In particular we will add to the Dataframe 3 columns:
- `idx_claim`: same as `claim` but with each word substituted by its index.
- `idx_evidence`: same as `evidence` but with each word substituted by its index.
- `idx_label`: label encoding as a unique integer (0 or 1 in this case)

In [None]:
def build_indexed_dataframe(df: pd.DataFrame):
    """
        Add the numberized columns to `df` (modified in place) and return it.

        `idx_claim` / `idx_evidence` hold, for every sentence, the list of indexes obtained
        through the module-level `vocab.word2int` (None for a word that is not in it);
        `label` is converted to category dtype and `idx_label` holds its integer codes.
    """
    to_index = vocab.word2int.get

    for words_col, indexes_col in (('claim', 'idx_claim'), ('evidence', 'idx_evidence')):
        df[indexes_col] = df[words_col].apply(lambda words: [to_index(word) for word in words])

    df['label'] = df['label'].astype('category')   #category dtype gives access to integer codes
    df['idx_label'] = df['label'].cat.codes        #one integer per distinct label

    return df 

def check_dataframe_numberization(df,vocab):

    """
       Map the numberized columns back to words with `vocab.int2word` and verify that
       the result equals the original `claim` and `evidence` columns.
       Prints a confirmation on success, raises an Exception otherwise.
    """
    to_word = vocab.int2word.get

    def decode(column):
        return df[column].apply(lambda indexes: [to_word(index) for index in indexes])

    decoded_claims = decode('idx_claim')
    decoded_evidences = decode('idx_evidence')

    claims_ok = df['claim'].equals(decoded_claims)
    if not (claims_ok and df['evidence'].equals(decoded_evidences)):
        raise Exception('There are problems with Dataset numberization')

    print('All right with dataset numberization')

df = build_indexed_dataframe(df)

check_dataframe_numberization(df,vocab)

Since the operation was successful, let's have a look at the numberized dataframe.

In [None]:
df.head()

### 5. Data Loaders 

In order to generate the mini-batches of each split to be passed to the network, we leveraged a `torchtext` utility, `BucketIterator`. It ensures that each mini-batch is composed of sequences of nearly the same length (depending on the chosen batch size), so that the minimum possible padding is added to each Tensor. To do so, we needed to create a Pytorch Dataset, since this is what the BucketIterator requires.\
The problem is now how to define the length of the input to the model (which is used to create buckets of similar-length sequences), since for this task we are dealing with multiple inputs (claim and evidence). 


In [None]:
claim_len = df.claim.apply(len)
evidence_len = df.evidence.apply(len)
print('average length of a claim sentence:',claim_len.mean())
print('average length of a evidence sentence:',evidence_len.mean())
print('max difference in length of claim sentences:',claim_len.max() - claim_len.min())
print('max difference in length of evidence sentences:',evidence_len.max() - evidence_len.min())

Based on the fact that the average sentence length of an evidence is much bigger than that of a claim, we decided to create buckets based on the length of the evidence and, only when that is equal, on the length of the claim. So the mini-batches will be constructed by grouping similar-size evidences and their corresponding claims.  

In [None]:
from torchtext.legacy.data import BucketIterator
from torch.utils.data import Dataset

class DataframeDataset(Dataset):
    """
        Dataset view over a numberized dataframe: every item is a dict holding the
        claim, the evidence, the label and the claim id found at the requested index
    """

    def __init__(self, dataframe: pd.DataFrame):
        # attribute name -> dataframe column it exposes
        sources = (('claims', 'idx_claim'),        #numberized claims
                   ('evidences', 'idx_evidence'),  #numberized evidences
                   ('labels', 'idx_label'),        #categorical label codes
                   ('claim_ids', 'id'))            #claim ids
        for attribute, column in sources:
            setattr(self, attribute, dataframe[column])

    def __len__(self):
        return len(self.claims.index)

    def __getitem__(self, idx):
        return dict(claim=self.claims[idx],
                    evidence=self.evidences[idx],
                    label=self.labels[idx],
                    claim_id=self.claim_ids[idx])

def create_dataloaders(b_s : int, dataframe: pd.DataFrame):     #b_s = batch_size
    """
        Build one BucketIterator per split (train, val, test), all with batch size `b_s`.

        Examples are keyed by evidence length first and claim length second.
        NOTE(review): the iterators are created with repeat=True — presumably they never
        stop on their own; confirm that the training loop accounts for it.
    """
    # one DataframeDataset per split, each built on a freshly re-indexed slice of the dataframe
    datasets = tuple(
        DataframeDataset(dataframe[dataframe['split'] == split_name].reset_index(drop=True))
        for split_name in ('train', 'val', 'test')
    )

    def bucket_key(example):
        return (len(example['evidence']), len(example['claim']))

    iterators = BucketIterator.splits(datasets,
                                      batch_sizes=(b_s,) * 3, sort_key=bucket_key,
                                      repeat=True, sort=False, shuffle=True, sort_within_batch=True)
    train_dataloader, val_dataloader, test_dataloader = iterators

    return train_dataloader,val_dataloader,test_dataloader 


### 6. Word embeddings

We can finally build an embedding matrix that will be used by the embedding layer of our model to store pre-trained word embeddings and retrieve them using indices. 
The function `build_embedding_matrix`, via the passed embedding model and the `word2int` dictionary, constructs a matrix that stores at each word-index the corresponding embedding vector found in GloVe. In particular we decided to use GloVe as embedding model with a vector dimension of 300. 
 
In order to handle OOV words:
- If a word in the dataset (identified by its unique integer) is present in GloVe model, we store its embedding vector in the embedding matrix.
- Otherwise we assign as embedding to the OOV word a random vector of size 300, sampled from a uniform distribution.

First thing first, we need to download the `GloVe model` from gensim.

In [None]:
emb_matrix_path = os.path.join(data_folder, "emb_matrix.npy")

def download_glove_emb(force_download = False):   
    """
        Return the 300-dimensional GloVe model loaded through the gensim downloader.

        When the file at `emb_matrix_path` already exists and `force_download` is False,
        nothing is loaded and None is returned.
    """
    matrix_already_saved = os.path.exists(emb_matrix_path)

    if matrix_already_saved and not force_download:
        print('embedding matrix already saved in data folder')
        return None

    print('downloading glove embeddings ')        
    embedding_dimension = 300

    return gloader.load("glove-wiki-gigaword-{}".format(embedding_dimension))

force_download = False      # to download glove model even if the emb_matrix has been already create. Mainly for testing purposes

glove_embeddings = download_glove_emb(force_download)

Now that we have the glove embeddings, we can check if there are some Out Of Vocabulary (OOV) words in our processed dataset.
\
A word is considered OOV if it is present in our dataset but not in the GloVe embeddings. 

In [None]:
def check_OOV_terms(embedding_model: "gensim.models.keyedvectors.KeyedVectors", vocab):
    """
        Given the embedding model and the vocabulary, determine the out-of-vocabulary words,
        i.e. the entries of `vocab.unique_words` the embedding model has no vector for.

        Returns the list of OOV words and the list of their indexes in `vocab.word2int`
        (both empty when `embedding_model` is None). Summary statistics are printed.
    """
    oov_words = []
    idx_oov_words = []

    if embedding_model is None:
        print('WARNING: empty model, remember to download GloVe first or set force_dowload to True')
        return oov_words, idx_oov_words

    for word in vocab.unique_words:
        try: 
            embedding_model[word]
        except (KeyError, TypeError):
            # same exceptions build_embedding_matrix treats as "no embedding for this key"
            oov_words.append(word) 
            idx_oov_words.append(vocab.word2int[word]) 

    total_words = len(vocab.unique_words)
    # guard against an empty vocabulary
    oov_percentage = (float(len(oov_words)) / total_words)*100 if total_words else 0.0

    print("Total number of unique words in dataset:",total_words)
    print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_words), oov_percentage))
    # random.sample raises when asked for more items than available
    print("Some OOV terms:",random.sample(oov_words,min(15, len(oov_words))))

    return oov_words, idx_oov_words

oov_words, idx_oov_words = check_OOV_terms(glove_embeddings,vocab)

The processed dataset contains a total number of 35096 unique words. By using the GloVe embeddings for our embedding matrix, we obtain 3745 OOV words, that is the 8.97% of all different words in our dataset.

Let's build the embedding matrix then.

In [None]:
def build_embedding_matrix(emb_model: gensim.models.keyedvectors.KeyedVectors,vocab) -> np.ndarray:
    """
        Build, save to `emb_matrix_path` and return the embedding matrix.

        The row at each word index holds the vector of that word in `emb_model` or, when
        the lookup fails, a random vector drawn uniformly from [-0.05, 0.05).
        Returns None (after printing a warning) when `emb_model` is None.
    """
    if emb_model is None:
        print('WARNING: empty model, remember to download GloVe first or set force_dowload to True')
        return None

    embedding_dimension = len(emb_model[0])    #length of a single embedding vector
    n_rows = len(vocab.word2int) + 1           #one extra row on top of one per word
    embedding_matrix = np.zeros((n_rows, embedding_dimension), dtype=np.float32)

    def vector_for(word):
        try:
            return emb_model[word]
        except (KeyError, TypeError):
            return np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

    for word, idx in vocab.word2int.items():
        embedding_matrix[idx] = vector_for(word)

    print('Saving embedding matrix')
    np.save(emb_matrix_path,embedding_matrix,allow_pickle=True)
    print("Embedding matrix shape: {}".format(embedding_matrix.shape))

    return embedding_matrix

embedding_matrix = build_embedding_matrix(glove_embeddings, vocab)

Let's have a look at the first few rows of the freshly created embedding matrix, to get a sense of it.

In [None]:
pd.DataFrame(embedding_matrix).head()

As we can see the very first row is full of zeros since that's a 'fake embedding' for the padding token which will never be used in practice.

To be completely sure that the embedding matrix has been built correctly, we check that the embedding vector associated with an index in the embedding matrix is the same as the one retrieved from glove by passing to it the word to which that index correspond. 

In [None]:
def check_id_corr(glove: "gensim.models.keyedvectors.KeyedVectors", vocab, matrix, dataframe):
    """
        Check that, for every index used in the numberized dataframe, the row of the
        embedding matrix equals the vector the embedding model returns for that word.

        Words absent from `glove.key_to_index` are only counted and the count is printed.
        Prints a warning and returns when `glove` is None.

        Raises:
            AssertionError: if a row of `matrix` differs from the model's vector.
    """
    if glove is None:
        print('WARNING: empty model, remember to download GloVe first or set force_dowload to True')
        return 

    # each distinct token needs to be compared only once, however often it occurs
    used_tokens = set()
    for column in ('idx_claim', 'idx_evidence'):
        for indexed_sentence in dataframe[column]:
            used_tokens.update(indexed_sentence)

    oov_words_ = set()
    for token in used_tokens:
        word = vocab.int2word[token]
        if word in glove.key_to_index:
            # explicit raise: an assert statement would be skipped under `python -O`
            if not np.array_equal(matrix[token], glove[word]):
                raise AssertionError('Embedding mismatch for word {!r} at index {}'.format(word, token))
        else:
            oov_words_.add(word)

    print('Double check OOV number:',len(oov_words_))

check_id_corr(glove_embeddings,vocab,embedding_matrix,df)

Since no error has been found, we can safely proceed with the next steps.

To avoid downloading the GloVe embeddings more than once, since the process is really slow, in the case that this is not the first run and the embedding matrix has been already created and saved we can load it from the data folder

In [None]:
def load_data():
    """
        Load the saved embedding matrix from `emb_matrix_path` and return it 
    """
    print('Loading embedding matrix')
    # the matrix is a plain numeric array, so pickle support is not needed; keeping it
    # disabled prevents a tampered file from executing code while being loaded
    emb_matrix = np.load(emb_matrix_path,allow_pickle=False)
    print('Loaded')

    return emb_matrix

if os.path.exists(emb_matrix_path) :
    embedding_matrix = load_data()
else:
    print('What you are looking for is not present in the folder')

### 7. Model designing 

TODO: write a description of the model here (or possibly leave it out).

In [None]:
#pytoch imports

import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

#scikit-learn imports 
from sklearn.metrics import f1_score


In [None]:
Architecture = namedtuple('Architecture',['sentence_emb_strat','merge_input','cosine_sim'])

In [None]:
class custom_model(nn.Module):
    """
        Class defining our model architecture.

        Claims and evidences are padded, embedded word by word through a pre-trained
        embedding layer and reduced to one vector per sentence with the strategy named by
        `architecture_param.sentence_emb_strat` ('mlp', 'rnn_last', 'rnn_avg' or 'bag_of_vectors').

        NOTE(review): the merge / classification part is still a stub, so `forward`
        currently returns None.
    """
    def __init__(self, emb_matrix: np.ndarray, pad_idx: int, max_tokens: int, architecture_param : "Architecture") :
        super().__init__()

        # pre-trained embeddings are kept frozen (default of build_emb_layer)
        self.embedding_layer, self.word_embedding_dim = self.build_emb_layer(emb_matrix,pad_idx)

        self.max_tokens = max_tokens
        self.pad_idx = pad_idx 

        self.param = architecture_param

        if self.param.sentence_emb_strat == 'mlp':
            # collapses the token dimension (fixed to max_tokens) to a single value per feature
            self.mlp = nn.Linear(max_tokens,1)

        elif self.param.sentence_emb_strat in ('rnn_last','rnn_avg'):
            self.rnn = nn.LSTM(self.word_embedding_dim, self.word_embedding_dim, batch_first = True) 

    def build_emb_layer(self, weights_matrix: np.ndarray, pad_idx : int, train: bool = False):
        """
            Create the embedding layer from the pre-trained `weights_matrix`.

            `train` tells whether the embeddings must be updated during training;
            by default they are frozen.
            Returns the layer and the size of a single embedding vector.
        """
        # clone() so the layer never shares memory with the caller's array
        matrix = torch.as_tensor(weights_matrix, dtype=torch.float32).clone()
        _ , embedding_dim = matrix.shape 
        # `freeze` is the opposite of "trainable"
        emb_layer = nn.Embedding.from_pretrained(matrix, freeze=not train, padding_idx = pad_idx)

        return emb_layer, embedding_dim

    def pad_batch(self,batch: list):
        """
            Input:  List of Tensors of variable lenght
            Output: Batch of tensors all padded to the same length 
        """
        sequences = list(batch)   #work on a copy: the caller's list must not be modified

        #if we are going to use an mlp as sentence embedding strategy, all the sentences should be padded to max_tokens length:
        #stretching the first sequence is enough, since pad_sequence aligns all the others to the longest one
        if self.param.sentence_emb_strat == 'mlp' and sequences:
            missing = self.max_tokens - sequences[0].shape[0]
            if missing > 0:
                sequences[0] = nn.functional.pad(sequences[0], (0, missing), value=self.pad_idx)

        padded_batch = rnn.pad_sequence(sequences,batch_first = True, padding_value = self.pad_idx)

        return padded_batch

    def words_embedding(self, word_idxs):
        """
            Input:  [batch_size, num_tokens]
            Output: [batch_size, num_tokens, embedding_dim]
        """
        return self.embedding_layer(word_idxs)

    def sentence_embedding(self, embeddings, sentence_lenghts):
        """
            Input:  [batch_size, num_tokens, embedding_dim], plus the true (unpadded) length of each sentence
            Output: [batch_size, embedding_dim]

            Raises an Exception when the configured strategy name is not recognised.
        """
        strat = self.param.sentence_emb_strat

        if strat == 'mlp':
            reshaped_embeddings = embeddings.permute(0,2,1)     #swap last two dimensions since Linear operates only on last dimension
            return self.mlp(reshaped_embeddings).squeeze(2)     #remove dimension of size 1 

        if strat in ('rnn_last','rnn_avg'):
            # pack_padded_sequence wants the lengths on the CPU
            cpu_lengths = torch.as_tensor(sentence_lenghts, dtype=torch.int64).cpu()
            packed_embeddings = pack_padded_sequence(embeddings, cpu_lengths, batch_first=True, enforce_sorted=False)
            packed_out, (last_h, _)  = self.rnn(packed_embeddings)

            if strat == 'rnn_last':
                return last_h[-1]  #hidden state of the (only) layer (TODO: if bidirectional or more than 1 layer this has to be handled)

            unpacked_out, out_lengths = pad_packed_sequence(packed_out,batch_first=True)
            # padded time steps are zero, so the sum only covers real tokens: divide by the real lengths
            return unpacked_out.sum(dim=1) / out_lengths.to(unpacked_out).unsqueeze(1)

        if strat == 'bag_of_vectors':
            lengths = torch.as_tensor(sentence_lenghts, dtype=embeddings.dtype, device=embeddings.device)
            # average over the real tokens; counting non-zero features instead would give a wrong
            # denominator whenever an embedding contains a zero, and 0/0 for an all-zero feature
            return embeddings.sum(dim=1) / lengths.clamp(min=1).unsqueeze(1)

        raise Exception('Incorrect name for sentence embedding strategy')

    def merge_sentence_emb(self, claims, evidences):
        """
            Merge claim and evidence sentence embeddings into the classifier input.
            TODO: not implemented yet, returns None.
        """
        return 

    def forward(self, claims, claim_lengths, evidences, evidence_lengths):
        """
            Run the (still incomplete) pipeline on a batch of claims and evidences.
            TODO: the classification step is missing, so None is returned.
        """
        #pad the sentences to have fixed size 
        padded_claims = self.pad_batch(claims)
        padded_evidences = self.pad_batch(evidences)

        #embed each word in a sentence with its embedding vector 
        word_emb_claims = self.words_embedding(padded_claims)          
        word_emb_evidences = self.words_embedding(padded_evidences)

        #compute sentence embedding
        sentence_emb_claims = self.sentence_embedding(word_emb_claims, claim_lengths)
        sentence_emb_evidences = self.sentence_embedding(word_emb_evidences, evidence_lengths)

        #merge multi-inputs 
        classification_input = self.merge_sentence_emb(sentence_emb_claims,sentence_emb_evidences)

        return 

        

        
        

In [None]:
if 'ciao' in ('cia','miao'):
    print('a')

In [None]:
max_tokens = max(df.claim.apply(len).max(),df.evidence.apply(len).max())