In [1]:
import os
import requests
import zipfile

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                   params={'id': toy_data_url_id},
                                   stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

download_data('dataset')

Downloading FEVER data splits...
Download completed!
Extracting dataset...
Extraction completed!


## Dataset pre-processing

### Dataset loading and inspection

In [1]:
import pandas as pd
import numpy as np

# wider pandas columns
pd.options.display.max_colwidth = 1000

train_df = pd.read_csv("dataset/train_pairs.csv")
val_df   = pd.read_csv("dataset/val_pairs.csv")
test_df  = pd.read_csv("dataset/test_pairs.csv")

In [2]:
# inspect dataset
train_df.head(-1)

Unnamed: 0.1,Unnamed: 0,Claim,Evidence,ID,Label
0,0,Chris Hemsworth appeared in A Perfect Getaway.,"2\tHemsworth has also appeared in the science fiction action film Star Trek -LRB- 2009 -RRB- , the thriller adventure A Perfect Getaway -LRB- 2009 -RRB- , the horror comedy The Cabin in the Woods -LRB- 2012 -RRB- , the dark-fantasy action film Snow White and the Huntsman -LRB- 2012 -RRB- , the war film Red Dawn -LRB- 2012 -RRB- , and the biographical sports drama film Rush -LRB- 2013 -RRB- .\tStar Trek\tStar Trek (film)\tA Perfect Getaway\tA Perfect Getaway\tThe Cabin in the Woods\tThe Cabin in the Woods\tSnow White and the Huntsman\tSnow White and the Huntsman\tRed Dawn\tRed Dawn (2012 film)\tRush\tRush (2013 film)",3,SUPPORTS
1,1,Roald Dahl is a writer.,"0\tRoald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈdɑːl -RSB- , -LSB- ˈɾuːɑl dɑl -RSB- ; 13 September 1916 -- 23 November 1990 -RRB- was a British novelist , short story writer , poet , screenwriter , and fighter pilot .\tfighter pilot\tfighter pilot",7,SUPPORTS
2,2,Roald Dahl is a governor.,"0\tRoald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈdɑːl -RSB- , -LSB- ˈɾuːɑl dɑl -RSB- ; 13 September 1916 -- 23 November 1990 -RRB- was a British novelist , short story writer , poet , screenwriter , and fighter pilot .\tfighter pilot\tfighter pilot",8,REFUTES
3,3,Ireland has relatively low-lying mountains.,"10\tThe island 's geography comprises relatively low-lying mountains surrounding a central plain , with several navigable rivers extending inland .\tisland\tisland\tgeography\tgeography\tseveral navigable rivers\tRivers of Ireland",9,SUPPORTS
4,4,Ireland does not have relatively low-lying mountains.,"10\tThe island 's geography comprises relatively low-lying mountains surrounding a central plain , with several navigable rivers extending inland .\tisland\tisland\tgeography\tgeography\tseveral navigable rivers\tRivers of Ireland",10,REFUTES
...,...,...,...,...,...
121734,121734,Anderson Silva is a former UFC heavyweight Champion.,"0\tAnderson da Silva -LRB- -LSB- ˈɐ̃deʁsõ ˈsiwvɐ -RSB- ; born April 14 , 1975 -RRB- is a Brazilian mixed martial artist and former UFC Middleweight Champion .\tMiddleweight\tMiddleweight (MMA)\tmixed martial artist\tmixed martial arts\tUFC Middleweight Champion\tUFC Middleweight Championship",229439,REFUTES
121735,121735,April was the month Anderson Silva was born.,"0\tAnderson da Silva -LRB- -LSB- ˈɐ̃deʁsõ ˈsiwvɐ -RSB- ; born April 14 , 1975 -RRB- is a Brazilian mixed martial artist and former UFC Middleweight Champion .\tMiddleweight\tMiddleweight (MMA)\tmixed martial artist\tmixed martial arts\tUFC Middleweight Champion\tUFC Middleweight Championship",229440,SUPPORTS
121736,121736,Anderson Silva is an American Brazilian mixed martial artist.,"0\tAnderson da Silva -LRB- -LSB- ˈɐ̃deʁsõ ˈsiwvɐ -RSB- ; born April 14 , 1975 -RRB- is a Brazilian mixed martial artist and former UFC Middleweight Champion .\tMiddleweight\tMiddleweight (MMA)\tmixed martial artist\tmixed martial arts\tUFC Middleweight Champion\tUFC Middleweight Championship",229443,REFUTES
121737,121737,Anderson Silva is incapable of being a Brazilian mixed martial artist.,"0\tAnderson da Silva -LRB- -LSB- ˈɐ̃deʁsõ ˈsiwvɐ -RSB- ; born April 14 , 1975 -RRB- is a Brazilian mixed martial artist and former UFC Middleweight Champion .\tMiddleweight\tMiddleweight (MMA)\tmixed martial artist\tmixed martial arts\tUFC Middleweight Champion\tUFC Middleweight Championship",229444,REFUTES


### Preprocessing

In [3]:
# examine evidences
print(train_df["Evidence"][0])
print(train_df["Evidence"][993])

2	Hemsworth has also appeared in the science fiction action film Star Trek -LRB- 2009 -RRB- , the thriller adventure A Perfect Getaway -LRB- 2009 -RRB- , the horror comedy The Cabin in the Woods -LRB- 2012 -RRB- , the dark-fantasy action film Snow White and the Huntsman -LRB- 2012 -RRB- , the war film Red Dawn -LRB- 2012 -RRB- , and the biographical sports drama film Rush -LRB- 2013 -RRB- .	Star Trek	Star Trek (film)	A Perfect Getaway	A Perfect Getaway	The Cabin in the Woods	The Cabin in the Woods	Snow White and the Huntsman	Snow White and the Huntsman	Red Dawn	Red Dawn (2012 film)	Rush	Rush (2013 film)
25	She has appeared in Time 100 most influential people in the world -LRB- 2010 and 2015 -RRB- , Forbes top-earning women in music -LRB- 2011 -- 2015 -RRB- , Forbes 100 most powerful women -LRB- 2015 -RRB- , and Forbes Celebrity 100 -LRB- 2016 -RRB- .	Time	Time (magazine)	100 most influential people in the world	Time 100	Forbes	Forbes	100 most powerful women	The World's 100 Most Powerfu

In [4]:
import re
from functools import reduce
import nltk
from nltk.corpus import stopwords
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return ' '.join([x for x in text.split() if x and x not in STOPWORDS])

def remove_leading_tabs(text) :
  # remove leading tabs
  pattern = r'[0-9]+?\t'
  return re.sub(pattern, '', text)

def remove_pronunciations(text) :
  # remove pronunciations
  pattern = r'-LSB-.*?-RSB-(\s;)*?'
  return re.sub(pattern, '', text)

def convert_round_brackets(text) :
  # convert -LRB- -RRB- to ( or )
  pattern = r'-LRB-'
  text = re.sub(pattern, '(', text)
  pattern = r'-RRB-'
  return re.sub(pattern, ')', text)

def fix_double_dashes(text) :
  # fix: double dashes (--)
  pattern = r'\-\-'
  return re.sub(pattern, '-', text)

def remove_trailing_words(text) :
  # remove trailing words (hyperlinks)
  pattern = r'.\t.*?$'
  return re.sub(pattern, '.', text)

def split_genitive(text) :
  # make sure all possesive 's are split from other words
  pattern = r"(\s.+?)'s"
  return re.sub(pattern, r"\1's", text)

def split_periods(text) :
  pattern = r'(\s.+?)\.'
  return re.sub(pattern, r'\1 .', text)

def fix_days(text) :
  # fix: 31st -> 31 st
  pattern = r'([0-9]{1,2})(st|nd|rd|th)'
  return re.sub(pattern, r'\1', text)

def separate_years(text) :
  # fix: separate years from other words
  pattern = r'(\s.+?)([0-9]{4})'
  return re.sub(pattern, r'\1 \2', text) 

def fix_comma_thousands(text) :
  # fix: comma thousands notations 
  pattern = r'([0-9]{1,3}),([0-9]{1,3})'
  text = re.sub(pattern, r'\1\2', text)
  pattern = r'([0-9]{1,3}),'
  return re.sub(pattern, r'\1', text)

def fix_weird_dash(text) :
  # fix: replace weird dash with normal one
  pattern = r'–'
  return re.sub(pattern, '-', text)

def fix_years_ranges(text) :
  # fix years ranges
  pattern = r'([0-9]{4})\-([0-9]{4})'
  text = re.sub(pattern, r'\1 - \2', text)
  pattern = r'([0-9]{2})([0-9]{2})\-([0-9]{2})'
  text = re.sub(pattern, r'\1\2 - \1\3', text)
  pattern = r'\'([0-9]{2})-\'([0-9]{2})'
  return re.sub(pattern, r'19\1 - 19\2', text)

def fix_number_ranges(text) :
  # fix: numbers ranges
  pattern = r'([0-9]+?[,\.][0-9]+?)+?-([0-9]+?[,\.][0-9]+?)+'
  return re.sub(pattern, r'\1 - \2', text)

def fix_double_tick(text) :
  # fix: double tick
  pattern = r'\`\`'
  return re.sub(pattern, '"', text)

def fix_date_merged(text) :
  # fix: year/day merged with other word
  pattern = r'([0-9]{1,4})([a-zA-Z]+?)'
  return re.sub(pattern, r'\1 \2', text)

def fix_double_ending_periods(text) :
  # fix: double ending periods in claims
  pattern = r'([a-zA-Z]{1,2}\.)\.$'               # except abbreviations (e.g jr. or c. k.)
  text = re.sub(pattern, '\1 .', text)
  pattern = r'\.\.$'
  return re.sub(pattern, '.', text)

def fix_slashes_words(text) :
  # fix: slashes separated from second word/number  e.g. "2006/ 2007"
  pattern = r'(\s.+?)\/\s(.+?\s)'
  text =  re.sub(pattern, r'\1 \/ \2', text)    
  # fix: separate strings between slashes
  pattern = r'(.+?)\/(.+?)'
  return re.sub(pattern, r'\1 \/ \2', text)

def fix_string_ending_dash(text) :
  # fix: fix strings ending with dash
  pattern = r'(.+?)\-\s'
  return re.sub(pattern, r'\1 - ', text)

def separate_non_words(text) :
  # fix: separate words like non-something
  pattern = r'([a-zA-Z]+?)\-([a-zA-Z]+?)'
  return re.sub(pattern, r'\1 - \2', text)
  
def fix_remove_round_brackets(text) :
  # remove between round brackets
  pattern = r'\([^\(\)]+?\)'
  return re.sub(pattern, ' ', text)

def remove_double_spaces(text) :
  # remove double spaces
  pattern = r'(\s)\s+?'
  return re.sub(pattern, r'\1', text)

#test
def replace_special_characters(text):
  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
  return REPLACE_BY_SPACE_RE.sub(' ', text)

def good_symbols(text) :
  GOOD_SYMBOLS_RE = re.compile('[^0-9a-zA-Z\s\+_]')
  return GOOD_SYMBOLS_RE.sub('', text)

def lower(text) :
  return text.lower()

PREPROCESSING_PIPELINE = [
                          remove_leading_tabs,
                          remove_trailing_words,
                          #remove_stopwords,
                          convert_round_brackets,
                          #fix_remove_round_brackets,
                          remove_pronunciations,
                          fix_double_dashes,
                          split_genitive,
                          split_periods,
                          fix_days,
                          separate_years,
                          fix_comma_thousands,
                          fix_weird_dash,
                          fix_years_ranges,
                          fix_number_ranges,
                          fix_double_tick,
                          fix_date_merged,
                          fix_double_ending_periods,
                          fix_slashes_words,
                          fix_string_ending_dash,
                          separate_non_words,
                          remove_double_spaces,
                          #replace_special_characters,
                          #good_symbols,
                          lower
]


def preprocess_text(text, filter_methods=None):
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!
    """
    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE
    return reduce(lambda txt, f: f(txt), filter_methods, text)

In [5]:
print("Preprocessing dataset...")

train_df["Claim"] = train_df["Claim"].apply(preprocess_text)
train_df["Evidence"] = train_df["Evidence"].apply(preprocess_text)
print("Training data done.")
val_df["Claim"] = val_df["Claim"].apply(preprocess_text)
val_df["Evidence"] = val_df["Evidence"].apply(preprocess_text)
print("Validation data done.")
test_df["Claim"] = test_df["Claim"].apply(preprocess_text)
test_df["Evidence"] = test_df["Evidence"].apply(preprocess_text)
print("Testing data done.")

print("Preprocessing complete!")

Preprocessing dataset...
Training data done.
Validation data done.
Testing data done.
Preprocessing complete!


In [6]:
train_df[train_df['Claim'].str.contains('k\.\.') == True]

Unnamed: 0.1,Unnamed: 0,Claim,Evidence,ID,Label


## Dataset conversion

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
import gensim.downloader as api
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.toktok import ToktokTokenizer

embedding_dimension = 50
glove = api.load(f"glove-wiki-gigaword-{embedding_dimension}")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
# tokenizer based on penn treebank
def tokenize_df(df) :
  #word_tokenizer = TreebankWordTokenizer()
  word_tokenizer = ToktokTokenizer()
  tmp = pd.DataFrame(columns=["Claim", "Evidence", "Label"])
  for col in ["Claim", "Evidence"]:
    #tmp[col] = df[col].str.lower()
    tmp[col] = df[col].apply(word_tokenizer.tokenize)
    #tmp[col] = np.array([word_tokenizer.tokenize(r) for r in tmp[col]])
  tmp["Label"] = df["Label"]
  return tmp

In [9]:
def get_embedding_matrix(embedding_model, tokenizer) :
  embedding_matrix = []
  emebedding_dimension = embedding_model.vectors[0].shape[0]
  for word in tokenizer.index_word.values() :
    if word in embedding_model:
      embedding_matrix.append(embedding_model[word])
    else:
      embedding_matrix.append(np.random.uniform(low=-1.0, high=1.0, size=(embedding_dimension,)))
  return embedding_matrix

#np.array(get_embedding_matrix(glove, w_tok))

In [10]:
tok_train_df = tokenize_df(train_df)
tok_train_df.head()

Unnamed: 0,Claim,Evidence,Label
0,"[chris, hemsworth, appeared, in, a, perfect, getaway, .]","[hemsworth, has, also, appeared, in, the, science, fiction, action, film, star, trek, (, 2009, ), ,, the, thriller, adventure, a, perfect, getaway, (, 2009, ), ,, the, horror, comedy, the, cabin, in, the, woods, (, 2012, ), ,, the, dark, -, fantasy, action, film, snow, white, and, the, huntsman, (, 2012, ), ,, the, war, film, red, dawn, (, 2012, ), ,, and, the, biographical, sports, drama, film, rush, (, 2013, ), .]",SUPPORTS
1,"[roald, dahl, is, a, writer, .]","[roald, dahl, (, ,, ;, 13, september, 1916, -, 23, november, 1990, ), was, a, british, novelist, ,, short, story, writer, ,, poet, ,, screenwriter, ,, and, fighter, pilot, .]",SUPPORTS
2,"[roald, dahl, is, a, governor, .]","[roald, dahl, (, ,, ;, 13, september, 1916, -, 23, november, 1990, ), was, a, british, novelist, ,, short, story, writer, ,, poet, ,, screenwriter, ,, and, fighter, pilot, .]",REFUTES
3,"[ireland, has, relatively, low, -, lying, mountains, .]","[the, island, ', s, geography, comprises, relatively, low, -, lying, mountains, surrounding, a, central, plain, ,, with, several, navigable, rivers, extending, inland, .]",SUPPORTS
4,"[ireland, does, not, have, relatively, low, -, lying, mountains, .]","[the, island, ', s, geography, comprises, relatively, low, -, lying, mountains, surrounding, a, central, plain, ,, with, several, navigable, rivers, extending, inland, .]",REFUTES


In [11]:
# build vocab
def build_vocab_from_df(df) :
  ret = []
  for col in ["Claim", "Evidence"] :
    for r in df[col] :
      ret += r
  ret = pd.unique(ret)    # much faster than np.unique
  return ret

vocab = list(glove.vocab.keys())               # glove vocab

vocab_v1 = np.array(build_vocab_from_df(tok_train_df), dtype=str)   # add unique terms from train_df
oov_v1 = vocab_v1[~np.in1d(vocab_v1, vocab)] # find OOV terms
vocab = np.concatenate((vocab, oov_v1))   #update vocab

vocab_v2 = np.array(build_vocab_from_df(tokenize_df(val_df)), dtype=str)   # add unique terms from val_df
oov_v2 = vocab_v2[~np.in1d(vocab_v2, vocab)]
vocab = np.concatenate((vocab, oov_v2))

vocab_v3 = np.array(build_vocab_from_df(tokenize_df(test_df)), dtype=str)   # add unique terms from val_df
oov_v3 = vocab_v3[~np.in1d(vocab_v3, vocab)]
vocab = np.concatenate((vocab, oov_v3))

#vocab = list(dict.fromkeys(vocab))
print(f"vocab len: {len(vocab)}")

vocab len: 403506


In [12]:
# build word -> int encoding 
word_to_idx = dict(zip(vocab, range(1, len(vocab)+1))) # start from 1 to reserve 0 to padding

In [13]:
def encode_sent(sent, word_to_idx=word_to_idx) :
  return [word_to_idx[w] for w in sent]

def encode_df(tok_df, word_to_idx) :
  enc_df = pd.DataFrame(columns=["Claim", "Evidence", "Label"])
  for col in ["Claim", "Evidence"] :
    enc_df[col] = tok_df[col].apply(lambda s: encode_sent(s, word_to_idx))
  enc_df["Label"] = tok_df["Label"].apply(lambda x: 1 if x=="SUPPORTS" else 0)
  return enc_df

In [14]:
# encode dataset
enc_train_df = encode_df(tok_train_df, word_to_idx)

In [15]:
enc_train_df.head()

Unnamed: 0,Claim,Evidence,Label
0,"[2103, 107954, 790, 7, 8, 2616, 20647, 3]","[107954, 32, 53, 790, 7, 1, 1122, 3955, 609, 320, 754, 9780, 24, 704, 25, 2, 1, 8966, 6041, 8, 2616, 20647, 24, 704, 25, 2, 1, 5989, 2842, 1, 7741, 7, 1, 2508, 24, 940, 25, 2, 1, 2238, 12, 5848, 609, 320, 2643, 299, 6, 1, 34011, 24, 940, 25, 2, 1, 137, 320, 640, 4650, 24, 940, 25, 2, 6, 1, 18899, 885, 2693, 320, 3993, 24, 1280, 25, 3]",1
1,"[53403, 21758, 15, 8, 1542, 3]","[53403, 21758, 24, 2, 90, 677, 442, 6907, 12, 1022, 488, 1456, 25, 16, 8, 298, 8998, 2, 637, 524, 1542, 2, 4820, 2, 12604, 2, 6, 3511, 2499, 3]",1
2,"[53403, 21758, 15, 8, 1005, 3]","[53403, 21758, 24, 2, 90, 677, 442, 6907, 12, 1022, 488, 1456, 25, 16, 8, 298, 8998, 2, 637, 524, 1542, 2, 4820, 2, 12604, 2, 6, 3511, 2499, 3]",0
3,"[1323, 32, 2224, 654, 12, 4740, 2755, 3]","[1, 584, 58, 1535, 4214, 7817, 2224, 654, 12, 4740, 2755, 2724, 8, 324, 5406, 2, 18, 202, 30455, 4058, 5787, 8331, 3]",1
4,"[1323, 261, 37, 34, 2224, 654, 12, 4740, 2755, 3]","[1, 584, 58, 1535, 4214, 7817, 2224, 654, 12, 4740, 2755, 2724, 8, 324, 5406, 2, 18, 202, 30455, 4058, 5787, 8331, 3]",0


In [16]:
def build_embedding_matrix(vocab=vocab, embedding_model=glove, embedding_dimension=50) :
  matrix = [np.zeros((embedding_dimension))] # first element reserved to padding and set to all zeros
  for w in vocab :
    if w in embedding_model.vocab :
      matrix.append(embedding_model[w])
    else:
      matrix.append(np.random.uniform(low=-1.0, high=1.0, size=embedding_dimension))
  return np.array(matrix)

In [17]:
embedding_matrix = build_embedding_matrix(vocab, glove, 50)
embedding_matrix.shape

(403507, 50)

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
max_length = pad_sequences(enc_train_df['Evidence'], padding='post').shape[1]

In [20]:
enc_val_df = encode_df(tokenize_df(val_df), word_to_idx)
enc_test_df = encode_df(tokenize_df(test_df), word_to_idx)

In [21]:
pad_train_claim = pad_sequences(enc_train_df['Claim'], maxlen=max_length, padding='post')
pad_train_evidence = pad_sequences(enc_train_df['Evidence'], maxlen=max_length, padding='post')

pad_val_claim = pad_sequences(enc_val_df['Claim'], maxlen=max_length, padding='post')
pad_val_evidence = pad_sequences(enc_val_df['Evidence'], maxlen=max_length, padding='post')

pad_test_claim = pad_sequences(enc_test_df['Claim'], maxlen=max_length, padding='post')
pad_test_evidence = pad_sequences(enc_test_df['Evidence'], maxlen=max_length, padding='post')

In [22]:
pad_train_claim.shape, pad_train_evidence.shape

((121740, 144), (121740, 144))

## Model definition

In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Embedding, Bidirectional, \
                                    LSTM, GRU, Lambda, GlobalMaxPooling1D, GlobalAveragePooling1D, \
                                    Concatenate, Add, Average, Dropout, Flatten, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping
#from tensorflow.regularizers import L2
from tensorflow.keras.optimizers import SGD, Adam

from sklearn.metrics.pairwise import cosine_similarity

# detect and init the TPU
#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.config.experimental_connect_to_cluster(tpu)
#tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
#tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

tf.random.set_seed(42)

In [24]:
def build_embedding_layer(embedding_matrix, name=None) :
    embedding_layer = Embedding(
        embedding_matrix.shape[0],    # vocab size 
        embedding_matrix.shape[1],    # embedding dimension
        #embeddings_initializer=tf.initializers.Constant(embedding_matrix),
        weights = [embedding_matrix],
        mask_zero = True,
        name = name,
        trainable = False
    )
    return embedding_layer

In [25]:
def build_sent_emb(units, multi_input='average', name=None) :
  sent_emb = Sequential(name=name)
  if multi_input == 'last_state' :
    sent_emb.add(Bidirectional(LSTM(units, return_sequences=False)))
  elif multi_input == 'average' :
    sent_emb.add(Bidirectional(LSTM(units, return_sequences=True)))
    sent_emb.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
  elif multi_input == 'max' :
    sent_emb.add(Bidirectional(LSTM(units, return_sequences=True)))
    sent_emb.add(Lambda(lambda x: tf.reduce_max(x, axis=1)))
  elif multi_input == 'mlp':
    sent_emb.add(Flatten())
    sent_emb.add(Dense(144, activation='relu'))
    sent_emb.add(Dense(50, activation='relu'))
  elif multi_input == 'bag_of_vectors' :
    sent_emb.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))

  return sent_emb

In [27]:
tf.keras.backend.clear_session()
from keras import backend as K

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

def build_model(embedding_matrix, max_length, multi_input='average', merge_mode='concat', cos_sim_feature=False) :

  claim = Input((max_length,), name='claim')
  evidence = Input((max_length,), name='evidence')

  embedding_layer = build_embedding_layer(embedding_matrix, "glove_embedding")  
    
  embedding_c = embedding_layer(claim)
  embedding_e = embedding_layer(evidence)

  sent_emb_c = build_sent_emb(max_length, multi_input=multi_input, name="sent_emb_claim") (embedding_c)
  sent_emb_e = build_sent_emb(max_length, multi_input=multi_input, name="sent_emb_evidence") (embedding_e)

  #output = build_sent_emb(64,concat)

  if merge_mode == 'concat' :
    output = Concatenate(name='refined_input')([sent_emb_c, sent_emb_e])    # option 1
  elif merge_mode == 'sum' :
    output = Add()([sent_emb_c, sent_emb_e])                                # option 2
  elif merge_mode == 'mean' :
    output = Average()([sent_emb_c, sent_emb_e])                            # option 3
  
  if cos_sim_feature :
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([sent_emb_c, sent_emb_e])
    output = Concatenate(name='cossim_refined_input')([output, distance])          # co sim
  output = Dense(max_length/2, activation='relu')(output)
  output = Dropout(0.5)(output)
  output = Dense(max_length/(2**2), activation='relu')(output)
  #output = Dropout(0.5)(output)
  output = Dense(max_length/(2**3), activation='relu')(output)
  #output = Dropout(0.5)(output)
  output = Dense(1, activation='softmax')(output)
  return Model([claim, evidence], output)

model = None
#with tpu_strategy.scope():
model = build_model(embedding_matrix, max_length, multi_input='max', merge_mode='mean', cos_sim_feature=True)

## Training

In [28]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
claim (InputLayer)              [(None, 144)]        0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, 144)]        0                                            
__________________________________________________________________________________________________
glove_embedding (Embedding)     (None, 144, 50)      20175350    claim[0][0]                      
                                                                 evidence[0][0]                   
__________________________________________________________________________________________________
sent_emb_claim (Sequential)     (None, 288)          224640      glove_embedding[0][0]        

In [29]:
model.fit(x=(pad_train_claim, pad_train_evidence), 
          y=enc_train_df['Label'],
          batch_size=64,
          epochs=1,
          #steps_per_epoch=5,
          validation_data=((pad_val_claim, pad_val_evidence), enc_val_df['Label']),
          callbacks = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
          #use_multiprocessing = True
          )

Train on 121740 samples, validate on 7165 samples


TypeError: can only concatenate list (not "EarlyStopping") to list

In [None]:
# evaluate the model
loss, accuracy, f1_score, precision, recall = model.evaluate((pad_test_claim, pad_test_evidence), enc_test_df["Label"], verbose=0)
print("Accuracy: {}".format(accuracy))
print("F1-score: {}".format(f1_score))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

In [44]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import nltk
nltk.download('punkt')

def get_true_labels(df_in):
  acc = pd.DataFrame(columns=["Claim", "#Supports", "#Refutes"])
  for _, row in df_in.iterrows():
    if row["Claim"] in acc["Claim"].tolist():
      index = acc["Claim"].tolist().index(row["Claim"])
      acc = acc.reset_index(drop=True)
      if row["Label"] == "SUPPORTS":
        acc['#Supports'][index] = int(acc["#Supports"][index]) + 1
      else:
        acc['#Refutes'][index] = int(acc["#Refutes"][index]) + 1
    else:
      if row["Label"] == "SUPPORTS":
        tmp = pd.DataFrame([[row["Claim"], 1, 0]], columns=["Claim", "#Supports", "#Refutes"])
      else:
        tmp = pd.DataFrame([[row["Claim"], 0, 1]], columns=["Claim", "#Supports", "#Refutes"])
      acc = acc.append(tmp)

  claims_labels_dict = {}
  for _, row in acc.iterrows():
    if row["#Supports"] > row["#Refutes"]:
      claims_labels_dict["".join(nltk.word_tokenize(row["Claim"]))] = True
    else:
      claims_labels_dict["".join(nltk.word_tokenize(row["Claim"]))] = False

  return claims_labels_dict

def get_pred_labels(model, test_df, pad_test_claim, pad_test_evidence):
    predictions = model.predict((pad_test_claim, pad_test_evidence))
    claims_labels_dict = {}
    claims_flat = []
    for claim in test_df['Claim']:
      claim = claim.replace(" ", "")
      claim = claim.replace("\"", "")
      claim = claim.replace("\'", "")
      if claim:
        claims_flat.append(''.join(claim))

    for index, claim in enumerate(claims_flat):
      if claim in claims_labels_dict:
        if predictions[index]:
          claims_labels_dict[claim] = (claims_labels_dict[claim][0],
                                        claims_labels_dict[claim][1] + 1)
        else:
          claims_labels_dict[claim] = (claims_labels_dict[claim][0] + 1,
                                        claims_labels_dict[claim][1])
      else:
        if predictions[index]:
          claims_labels_dict[claim] = (0, 1)
        else:
          claims_labels_dict[claim] = (1, 0)
                                       
    for claim, votes in claims_labels_dict.items():
      if votes[0] > votes[1]:
        claims_labels_dict[claim] = False
      else:
        claims_labels_dict[claim] = True
    return claims_labels_dict

def compute_metrics_majority_voting(true, pred):
  true_labels = []
  pred_labels = []
  for claim, label in true.items():
    if claim in pred:
      true_labels.append(label)
      claim = claim.replace("`", "")
      claim = claim.replace("\'", "")
      pred_labels.append(pred[claim])
  print("Accuracy:  ", accuracy_score(true_labels, pred_labels))
  print("F1 score:  ", f1_score(true_labels, pred_labels))
  print("Precision: ", precision_score(true_labels, pred_labels))
  print("Recall:    ", recall_score(true_labels, pred_labels))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
trues = get_true_labels(test_df)
preds = get_pred_labels(model, test_df, pad_test_claim, pad_test_evidence)

In [46]:
compute_metrics_majority_voting(trues, preds)

Accuracy:   0.5000846740050804
F1 score:   0.6667419282004967
Precision:  0.5000846740050804
Recall:     1.0


## Evaluation

## Debug

In [None]:
# Example 4
id1 = 0
#id1 = 78061
#id1 = 7612
v1 = sent_emb(pad_sequences(enc_train_df['Claim'], maxlen=max_length, padding='post')[id1].reshape(1, -1))
v2 = sent_emb(pad_sequences(enc_train_df['Evidence'], maxlen=max_length, padding='post')[id1].reshape(1, -1))
sim = cosine_similarity(v1, v2)
print(f'similarity between \"{train_df["Claim"][id1]}\" and \"{train_df["Evidence"][id1]}\":')
print(sim)

In [None]:
# Example 1
id1 = 7612
id2 = 78061
v1 = sent_emb(pad_sequences(enc_train_df['Claim'], padding='post')[id1].reshape(1, -1))
v2 = sent_emb(pad_sequences(enc_train_df['Claim'], padding='post')[id2].reshape(1, -1))
sim = cosine_similarity(v1, v2)
print(f'similarity between \"{train_df["Claim"][id1]}\" and \"{train_df["Claim"][id2]}\":')
print(sim)

In [None]:
# Example 2
id1 = 1
id2 = 2
v1 = sent_emb(pad_sequences(enc_train_df['Claim'], padding='post')[id1].reshape(1, -1))
v2 = sent_emb(pad_sequences(enc_train_df['Claim'], padding='post')[id2].reshape(1, -1))
sim = cosine_similarity(v1, v2)
print(f'similarity between \"{train_df["Claim"][id1]}\" and \"{train_df["Claim"][id2]}\":')
print(sim)
#embedding_layer(pad_sequences(enc_train_df['Claim'], padding='post')[0].reshape(1, -1)).shape

In [None]:
# Example 3
id1 = 0
id2 = 3
v1 = sent_emb(pad_sequences(enc_train_df['Claim'], padding='post')[id1].reshape(1, -1))
v2 = sent_emb(pad_sequences(enc_train_df['Claim'], padding='post')[id2].reshape(1, -1))
sim = cosine_similarity(v1, v2)
print(f'similarity between \"{train_df["Claim"][id1]}\" and \"{train_df["Claim"][id2]}\":')
print(sim)
#embedding_layer(pad_sequences(enc_train_df['Claim'], padding='post')[0].reshape(1, -1)).shape

In [None]:
glove.similarity('american', 'swedish')

In [None]:
len(enc_train_df["Evidence"][3])

In [None]:
# debug
train_df[train_df["Evidence"].str.contains("Swift")]