## FEVER: Fact Extraction and VERification

In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import os
import json

from tqdm import tqdm
from collections import Counter
from itertools import product
from sklearn.linear_model import LogisticRegression

import fever
import utils

### Functionalities of Oracle Class

In [2]:
# Locations of the document database and the precomputed sparse index.
# NOTE(review): neither constant is referenced by any other cell visible in this
# notebook (the Oracle is built with no arguments) — presumably `fever` uses its
# own defaults; confirm, then either pass these in or drop them.
DB_PATH = 'data/single/fever0.db'
MAT_PATH = 'data/index/tfidf-count-ngram=1-hash=16777216.npz'

In [3]:
oracle = fever.Oracle()

In [4]:
query = 'Tetris has sold millions of physical copies.'

In [6]:
oracle.closest_docs(query, k=4)

['Tetris',
 'Jolin_Tsai_discography',
 'List_of_best-selling_Game_Boy_video_games',
 'Eminem_discography']

In [11]:
oracle.doc_ids2texts(['Tetris'])

["Tetris -LRB- , pronounced -LSB- ˈtɛtrʲɪs -RSB- -RRB- is a tile-matching puzzle video game , originally designed and programmed by Russian game designer Alexey Pajitnov . It was released on June 6 , 1984 , while he was working for the Dorodnitsyn Computing Centre of the Academy of Science of the USSR in Moscow . He derived its name from the Greek numerical prefix tetra - -LRB- all of the game 's pieces contain four segments -RRB- and tennis , Pajitnov 's favorite sport .   Tetris was the first entertainment software to be exported from the USSR to the US , where it was published by Spectrum HoloByte for Commodore 64 and IBM PC . The Tetris game is a popular use of tetrominoes , the four-element special case of polyominoes . Polyominoes have been used in popular puzzles since at least 1907 , and the name was given by the mathematician Solomon W. Golomb in 1953 . However , even the enumeration of pentominoes is dated to antiquity .   The game -LRB- or one of its many variants -RRB- is a

In [12]:
oracle.get_sentence('Tetris', 0)

'Tetris -LRB- , pronounced -LSB- ˈtɛtrʲɪs -RSB- -RRB- is a tile-matching puzzle video game , originally designed and programmed by Russian game designer Alexey Pajitnov .'

In [14]:
oracle.choose_sents_from_doc_ids(query, oracle.closest_docs(query, k=4), k=3)

{('Jolin_Tsai_discography',
  9): 'Her next release under Sony , Magic -LRB- 2003 -RRB- , was heralded as her comeback album , which sold more than 1.5 million copies in Asia , with more than 360,000 copies sold in Taiwan alone , and the album made her the best-selling female singer of the year in Taiwan .',
 ('Jolin_Tsai_discography',
  11): 'The album has sold over 2 million copies in Asia , with 300,000 copies sold in Taiwan alone , and made her the best-selling female singer of the year in Taiwan .',
 ('Tetris',
  12): 'In January 2010 , it was announced that the Tetris franchise had sold more than 170 million copies , approximately 70 million physical copies and over 100 million copies for cell phones , making it the best selling paid-downloaded game of all time .'}

In [16]:
oracle.read(query)

{('Jolin_Tsai_discography',
  9): 'Her next release under Sony , Magic -LRB- 2003 -RRB- , was heralded as her comeback album , which sold more than 1.5 million copies in Asia , with more than 360,000 copies sold in Taiwan alone , and the album made her the best-selling female singer of the year in Taiwan .',
 ('Jolin_Tsai_discography',
  11): 'The album has sold over 2 million copies in Asia , with 300,000 copies sold in Taiwan alone , and made her the best-selling female singer of the year in Taiwan .',
 ('Tetris',
  12): 'In January 2010 , it was announced that the Tetris franchise had sold more than 170 million copies , approximately 70 million physical copies and over 100 million copies for cell phones , making it the best selling paid-downloaded game of all time .'}

### Dataset Structure

In [17]:
fever_iterator = iter(fever.TrainReader().read())

In [18]:
fever_ex = next(fever_iterator)

In [19]:
print(fever_ex)

Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
VERIFIABLE
SUPPORTS


In [20]:
fever_ex

"FEVER Example({'id': 75397, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'evidence': [[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]]})

In [21]:
fever_ex.get_evidence_ids()

[('Nikolaj_Coster-Waldau', 7), ('Fox_Broadcasting_Company', 0)]

In [22]:
# Gold label of every example yielded by the training reader, gathered into a
# Series so the class balance can be tallied with value_counts().
fever_labels = pd.Series(
    [ex.label for ex in fever.TrainReader().read()])

In [23]:
fever_labels.value_counts()

SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
dtype: int64

In [24]:
# Same label tally for the dev reader. This rebinds `fever_labels`, so the
# training labels are no longer reachable under that name after this cell runs.
fever_labels = pd.Series(
    [ex.label for ex in fever.DevReader().read()])

In [25]:
fever_labels.value_counts()

NOT ENOUGH INFO    3333
SUPPORTS           3333
REFUTES            3333
dtype: int64

### Test on Document Retrieval and Sentence Selection 

In [None]:
# Retrieval and sentence-selection accuracy with the TF-IDF index

In [21]:
# Document-retrieval accuracy for several retrieval depths.
# A new TrainReader(samp_percentage=0.05) is constructed on every iteration, so
# each depth is scored on a different sample (the denominators in the printed
# accuracies differ from run to run).
# NOTE(review): no seed is passed here — confirm whether the reader accepts one
# if the depths are meant to be compared on the same examples.
for num_docs in [1,3,5,10]:
    fever.doc_retrieval_accuracy(reader=fever.TrainReader(samp_percentage=0.05),
                                oracle=oracle,
                                num_docs=num_docs)

Reading from dataset: 7335examples [25:53,  4.72examples/s]                    


Num_docs = 1, accuracy 1283/5524 = 0.23225923244026067


Reading from dataset:  99%|█████████▊| 7261/7362 [24:51<00:20,  4.87examples/s]


Num_docs = 3, accuracy 2491/5469 = 0.4554763210824648


Reading from dataset: 100%|█████████▉| 7269/7289 [25:29<00:04,  4.75examples/s]


Num_docs = 5, accuracy 3131/5503 = 0.5689623841540977


Reading from dataset: 7236examples [25:27,  4.74examples/s]                    


Num_docs = 10, accuracy 3783/5486 = 0.6895734597156398


In [22]:
# Sentence-selection accuracy for several numbers of selected sentences.
# As with the document sweep, the sampled reader is rebuilt per iteration, so
# each setting is evaluated on its own sample of the training data.
for num_sents in [1,3,5,10]:
    fever.sentence_selection_accuracy(reader=fever.TrainReader(samp_percentage=0.05),
                                oracle=oracle,
                                num_sents=num_sents)

Reading from dataset: 7360examples [05:09, 23.81examples/s]                    


Num_sents = 1, accuracy 2810/5492 = 0.5116533139111434


Reading from dataset: 7382examples [05:07, 23.99examples/s]                    


Num_sents = 3, accuracy 3727/5561 = 0.6702032008631541


Reading from dataset: 7229examples [04:59, 24.16examples/s]                    


Num_sents = 5, accuracy 3941/5424 = 0.7265855457227138


Reading from dataset: 7358examples [05:04, 24.13examples/s]                    

Num_sents = 10, accuracy 4511/5514 = 0.8180993833877402





In [None]:
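# Retrieval and sentence-selection accuracy with the PMI index.
# NOTE(review): the cells below reuse the name `oracle` unchanged; the step that
# rebuilds it with PMI weighting is not shown in this notebook — confirm before re-running.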

In [21]:
# Document-retrieval accuracy sweep; the code is identical to the TF-IDF sweep
# and evaluates whatever object is currently bound to `oracle`.
# Each iteration builds its own 5% sampled reader, so depths are scored on
# different examples.
for num_docs in [1,3,5,10]:
    fever.doc_retrieval_accuracy(reader=fever.TrainReader(samp_percentage=0.05),
                                oracle=oracle,
                                num_docs=num_docs)

Reading from dataset: 100%|█████████▉| 7235/7241 [31:58<00:01,  3.77examples/s]


Num_docs = 1, accuracy 1195/5414 = 0.22072404876246768


Reading from dataset: 7346examples [28:28,  4.30examples/s]                    


Num_docs = 3, accuracy 2572/5564 = 0.46225736879942486


Reading from dataset:  98%|█████████▊| 7262/7373 [28:24<00:26,  4.26examples/s]


Num_docs = 5, accuracy 3128/5527 = 0.5659489777456125


Reading from dataset:  98%|█████████▊| 7221/7338 [27:09<00:26,  4.43examples/s]

Num_docs = 10, accuracy 3776/5482 = 0.6887997081357169





In [22]:
# Sentence-selection accuracy sweep; the code is identical to the TF-IDF sweep
# and evaluates whatever object is currently bound to `oracle`.
# Each iteration builds its own 5% sampled reader.
for num_sents in [1,3,5,10]:
    fever.sentence_selection_accuracy(reader=fever.TrainReader(samp_percentage=0.05),
                                oracle=oracle,
                                num_sents=num_sents)

Reading from dataset: 7384examples [05:10, 23.76examples/s]                    


Num_sents = 1, accuracy 2845/5512 = 0.5161465892597968


Reading from dataset:  97%|█████████▋| 7216/7446 [05:00<00:09, 23.99examples/s]


Num_sents = 3, accuracy 3605/5400 = 0.6675925925925926


Reading from dataset: 7232examples [04:59, 24.15examples/s]                    


Num_sents = 5, accuracy 4012/5391 = 0.7442033017992952


Reading from dataset:  99%|█████████▉| 7239/7285 [05:11<00:01, 23.23examples/s]

Num_sents = 10, accuracy 4436/5467 = 0.8114139381745016





### Sampling for NotEnoughInfo class

In [43]:
def sampling_for_NEI(oracle, num_docs=5, num_sents=5,
                     names=('training', 'dev', 'test'),
                     data_dir='data/fever-data'):
    """Write ``<split>_sampled.jsonl`` files whose evidence comes from the oracle.

    For every split in `names`, each non-blank line of
    ``<data_dir>/<split>.jsonl`` is parsed as JSON and written to
    ``<data_dir>/<split>_sampled.jsonl``. The ``evidence`` field is replaced
    by the sentences returned by ``oracle.read`` for every example of the
    ``dev`` and ``test`` splits, and for examples of any other split whose
    label is ``NOT ENOUGH INFO``. All remaining examples are copied unchanged.

    Parameters
    ----------
    oracle : object
        Must provide ``read(claim, num_docs=..., num_sents=...)`` returning a
        mapping keyed by ``(doc_id, sentence_index)`` pairs.
    num_docs : int
        Forwarded to ``oracle.read``.
    num_sents : int
        Forwarded to ``oracle.read``.
    names : iterable of str
        Split names to process, in order.
    data_dir : str
        Directory holding the input files; outputs are written next to them.

    Returns
    -------
    None
        The function only writes files and prints progress.

    """
    for name in names:
        print('Working on {} split'.format(name))
        original_path = os.path.join(data_dir, '{}.jsonl'.format(name))
        sampling_path = os.path.join(data_dir, '{}_sampled.jsonl'.format(name))

        # dev/test evidence is always replaced; other splits only for NEI claims.
        # Checking the split first also avoids touching "label" on dev/test lines.
        resample_all = name in ('dev', 'test')

        # Explicit UTF-8 so the non-ASCII claim text is read the same way on
        # every platform instead of depending on the locale default.
        with open(original_path, 'r', encoding='utf-8') as f, \
                open(sampling_path, 'w', encoding='utf-8') as f2:
            # readlines() keeps the known total so tqdm can show a percentage.
            for raw_line in tqdm(f.readlines()):
                if not raw_line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                line = json.loads(raw_line)

                if resample_all or line["label"] == "NOT ENOUGH INFO":
                    evidences = oracle.read(line['claim'], num_docs=num_docs, num_sents=num_sents).keys()
                    # The two leading zeros are placeholders for the id fields
                    # that precede (doc_id, sentence_index) in gold evidence.
                    line['evidence'] = [[[0, 0, ev[0], ev[1]] for ev in evidences]]

                f2.write(json.dumps(line) + "\n")

In [44]:
sampling_for_NEI(oracle)

  0%|          | 0/9999 [00:00<?, ?it/s]

Working on dev split


100%|██████████| 9999/9999 [1:18:30<00:00,  2.12it/s]
  0%|          | 0/9999 [00:00<?, ?it/s]

Working on test split


100%|██████████| 9999/9999 [1:14:08<00:00,  2.25it/s]


### RTE Training and Test

In [16]:
def word_overlap_phi(claim, evidence):
    """Basis for features for the words shared by the claim and its evidence.
    This tends to produce very sparse representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    collections.Counter
       Maps each word occurring in both claim and evidence to 1. Empty
       when there is no evidence or no shared word.

    """
    # Collect evidence tokens into a set so each membership test below is
    # constant-time instead of a scan over every evidence token.
    evidence_vocab = set()
    for sent in evidence:
        evidence_vocab.update(utils.process_sent(sent))
    # A set keeps each shared word once, so every count in the result is 1.
    overlap = {w1 for w1 in utils.process_text(claim) if w1 in evidence_vocab}
    return Counter(overlap)

In [17]:
def fit_maxent_classifier(X, y):
    """Fit a `sklearn.linear_model.LogisticRegression` on (X, y).

    Logistic regression is also known as a Maximum Entropy (MaxEnt)
    classifier, the name that fits the multiclass setting better.

    Parameters
    ----------
    X : 2d np.array
        Feature matrix with one example per row.
    y : list
        Labels aligned with the rows of `X`.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted estimator.

    """
    # `fit` returns the estimator itself, so construction and training chain.
    return LogisticRegression(fit_intercept=True).fit(X, y)

In [23]:
percentage = 0.1

In [38]:
# Featurize a `percentage` sample of the sampled training set with word-overlap
# features. NOTE(review): `dataset` is not read by any later cell visible in
# this notebook — presumably a sanity check of the pipeline; confirm.
dataset = fever.build_dataset(fever.SampledTrainReader(samp_percentage=percentage), 
                              word_overlap_phi, oracle)

Reading from dataset:  98%|█████████▊| 2885/2949 [00:33<00:00, 85.58examples/s]


In [24]:
# Baseline run: word-overlap features with a plain MaxEnt classifier, trained
# on a `percentage` sample of the sampled training set and assessed with the
# sampled dev reader. The return value is discarded; only the printed report
# is of interest here.
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(samp_percentage=percentage), 
    phi=word_overlap_phi,
    oracle=oracle,
    train_func=fit_maxent_classifier,
    assess_reader=fever.SampledDevReader(),
    random_state=42)

Reading from dataset:  98%|█████████▊| 14288/14586 [03:02<00:03, 78.15examples/s]
Reading from dataset: 1070examples [00:38, 28.13examples/s]                    


                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.337     0.176     0.232       357
        REFUTES      0.406     0.037     0.068       351
       SUPPORTS      0.342     0.804     0.480       362

    avg / total      0.361     0.343     0.262      1070



In [18]:
def word_cross_product_phi(claim, evidence):
    """Cross-product features over claim words and evidence words. This
    tends to produce pretty dense representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    collections.Counter
        Maps every (claim_word, evidence_word) pair to the number of times
        it occurs in the multi-set cross-product, so repeated words on
        either side raise the count.

    """
    # Flatten the tokens of all evidence sentences into one list, in order.
    evidence_tokens = [tok for sent in evidence for tok in utils.process_sent(sent)]
    claim_tokens = utils.process_text(claim)
    # product() already yields (w1, w2) tuples, so Counter can consume it directly.
    return Counter(product(claim_tokens, evidence_tokens))

In [26]:
# Same setup as the word-overlap baseline, but with cross-product features:
# plain MaxEnt trained on a `percentage` sample of the sampled training set
# and assessed with the sampled dev reader.
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(samp_percentage=percentage), 
    phi=word_cross_product_phi,
    oracle=oracle,
    train_func=fit_maxent_classifier,
    assess_reader=fever.SampledDevReader(),
    random_state=42)

Reading from dataset: 14595examples [03:05, 78.76examples/s]                     
Reading from dataset: 1036examples [00:40, 25.58examples/s]                  


                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.336     0.351     0.343       342
        REFUTES      0.465     0.244     0.320       328
       SUPPORTS      0.394     0.546     0.458       366

    avg / total      0.398     0.386     0.377      1036



In [19]:
def fit_maxent_with_crossvalidation(X, y):
    """A MaxEnt model of dataset with hyperparameter cross-validation.

    The search covers the regularization strength `C` and the penalty type
    ('l1' or 'l2') using 3-fold cross-validation, delegated to
    `fever.fit_classifier_with_crossvalidation`.

    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.

    y : list
        The list of labels for rows in `X`.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        A trained model instance, the best model found.

    """
    # The grid below tries both 'l1' and 'l2'. The 'lbfgs' solver (the default
    # in newer scikit-learn releases) rejects 'l1', so pin 'liblinear', which
    # accepts both penalties, to keep every grid point fittable.
    basemod = LogisticRegression(fit_intercept=True, solver='liblinear')
    cv = 3
    param_grid = {'C': [0.4, 0.6, 0.8, 1.0],
                  'penalty': ['l1', 'l2']}
    return fever.fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid)

In [20]:
# Word-overlap features with the cross-validated MaxEnt trainer.
# Unlike the other experiment cells, the train reader is built without
# `samp_percentage` and no `random_state` is passed to the experiment.
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(), 
    phi=word_overlap_phi,
    oracle=oracle,
    train_func=fit_maxent_with_crossvalidation,
    assess_reader=fever.SampledDevReader())

Reading from dataset: 100%|██████████| 145449/145449 [24:46<00:00, 97.85examples/s]
Reading from dataset: 100%|██████████| 9999/9999 [05:46<00:00, 28.87examples/s]


Best params {'C': 1.0, 'penalty': 'l2'}
Best score: 0.430
                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.362     0.326     0.343      3333
        REFUTES      0.426     0.012     0.023      3333
       SUPPORTS      0.337     0.698     0.455      3333

    avg / total      0.375     0.346     0.274      9999



In [21]:
percentage = 0.2

In [22]:
# Cross-product features with the cross-validated MaxEnt trainer, trained on a
# `percentage` sample of the sampled training set and assessed with the
# sampled dev reader.
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(samp_percentage=percentage), 
    phi=word_cross_product_phi,
    oracle=oracle,
    train_func=fit_maxent_with_crossvalidation,
    assess_reader=fever.SampledDevReader(),
    random_state=42)

Reading from dataset: 29153examples [05:16, 92.02examples/s]                     
Reading from dataset: 100%|██████████| 9999/9999 [06:15<00:00, 26.61examples/s]


Best params {'C': 1.0, 'penalty': 'l1'}
Best score: 0.612
                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.349     0.527     0.420      3333
        REFUTES      0.535     0.219     0.311      3333
       SUPPORTS      0.378     0.410     0.394      3333

    avg / total      0.421     0.385     0.375      9999

