In [1]:
import time
import pickle
import argparse
import torch

from torch.utils.data import DataLoader
from esim.data import NLIDataset
from esim.model import ESIM
from esim.utils import correct_predictions

In [2]:
test_data='/home/rongz/ESIM/data/preprocessed/quora/test_data.pkl'
checkpoint='/home/rongz/ESIM/data/checkpoints/quora/best.pth.tar'
batch_size=32

In [3]:
with open(test_data, "rb") as pkl:
    test_data = NLIDataset(pickle.load(pkl))

In [4]:
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [5]:
for batch in test_loader:
    print(batch)
    break

{'id': ['50018', '126924', '391187', '301889', '202497', '75122', '374364', '84209', '179700', '48882', '195953', '306946', '324676', '246367', '228987', '242869', '112473', '227323', '35973', '320170', '180350', '325484', '84643', '238134', '53437', '338378', '101440', '54980', '234913', '10882', '326051', '211707'], 'premise': tensor([[  2,   6,  43,  ...,   0,   0,   0],
        [  2,  76, 206,  ...,   0,   0,   0],
        [  2,  11,  20,  ...,   0,   0,   0],
        ...,
        [  2,  11, 162,  ...,   0,   0,   0],
        [  2,   6,   8,  ...,   0,   0,   0],
        [  2,   6,   8,  ...,   0,   0,   0]]), 'premise_length': tensor([12, 12,  9,  9, 11, 13, 18,  8, 11, 11, 16,  9, 14, 23,  9, 14, 10,  8,
        13, 14, 11, 25, 10, 10, 12, 11, 14, 15,  9, 11, 13, 17]), 'hypothesis': tensor([[   2,   11,   14,  ...,    0,    0,    0],
        [   2,   76,  206,  ...,    0,    0,    0],
        [   2,   11,   14,  ...,    0,    0,    0],
        ...,
        [   2, 6813,  782,  ...

In [6]:
def main(test_file, pretrained_file, batch_size=32):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        vocab_size: The number of words in the vocabulary of the model
            being tested.
        embedding_dim: The size of the embeddings in the model.
        hidden_size: The size of the hidden layers in the model. Must match
            the size used during training. Defaults to 300.
        num_classes: The number of classes in the output of the model. Must
            match the value used during training. Defaults to 3.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    checkpoint = torch.load(pretrained_file)

    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(checkpoint["model"])

    print(20 * "=",
          " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predict_df = test(model, test_loader)

    print("-> Average batch processing time: {:.4f}s, total test time:\
 {:.4f}s, accuracy: {:.4f}%".format(batch_time, total_time, (accuracy*100)))
    return predict_df

In [7]:
from sklearn import metrics

In [8]:
import pandas as pd

In [9]:
def test(model, dataloader):
    """
    Test the accuracy of a model on some labelled test dataset.

    Args:
        model: The torch module on which testing must be performed.
        dataloader: A DataLoader object to iterate over some dataset.

    Returns:
        batch_time: The average time to predict the classes of a batch.
        total_time: The total time to process the whole dataset.
        accuracy: The accuracy of the model on the input data.
    """
    # Switch the model to eval mode.
    model.eval()
    device = model.device

    time_start = time.time()
    batch_time = 0.0
    accuracy = 0.0
    
    all_ids=[]
    all_labels=[]
    all_out_classes=[]

    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            batch_start = time.time()

            # Move input and output data to the GPU if one is used.
            ids=batch["id"]
            premises = batch["premise"].to(device)
            premises_lengths = batch["premise_length"].to(device)
            hypotheses = batch["hypothesis"].to(device)
            hypotheses_lengths = batch["hypothesis_length"].to(device)
            labels = batch["label"]
            all_labels.extend(labels.tolist())
            labels=labels.to(device)

            _, probs = model(premises,
                             premises_lengths,
                             hypotheses,
                             hypotheses_lengths)
            _, out_classes = probs.max(dim=1)
            all_out_classes.extend(out_classes.tolist())

            accuracy += correct_predictions(probs, labels)
            batch_time += time.time() - batch_start
            all_ids.extend(ids)

    predict_df=pd.DataFrame({'id':all_ids,'predict':all_out_classes})
    
    batch_time /= len(dataloader)
    total_time = time.time() - time_start
    accuracy /= (len(dataloader.dataset))
    print(metrics.accuracy_score(all_labels,all_out_classes))
    print(metrics.precision_score(all_labels,all_out_classes))
    print(metrics.recall_score(all_labels,all_out_classes))
    print(metrics.f1_score(all_labels,all_out_classes))

    return batch_time, total_time, accuracy, predict_df

In [10]:
predict_df=main(test_data,checkpoint,batch_size)

	* Loading test data...


TypeError: expected str, bytes or os.PathLike object, not NLIDataset

In [None]:
origin_test_data='/home/rongz/ESIM/data/dataset/quora/test.tsv'
origin_test_df=pd.read_csv(origin_test_data,header=None,delimiter="\t")

In [None]:
origin_test_df[4]=predict_df['predict']

In [None]:
origin_test_df.head()

In [None]:
# change the colume order
origin_test_df=origin_test_df[[3,1,2,0,4]]

In [None]:
origin_test_df.head()

In [None]:
origin_predict_path='/home/rongz/ESIM/data/predict/quora/test.csv'

In [None]:
origin_test_df.to_csv(origin_predict_path, header=['id','sent1','sent2','label','predict'],index=False)

In [None]:
true_positive_df=origin_test_df[(origin_test_df[0]==1) & (origin_test_df[4]==1)]
true_negative_df=origin_test_df[(origin_test_df[0]==0) & (origin_test_df[4]==0)]
false_positive_df=origin_test_df[(origin_test_df[0]==0) & (origin_test_df[4]==1)]
false_negative_df=origin_test_df[(origin_test_df[0]==1) & (origin_test_df[4]==0)]

In [None]:
true_positive_path='/home/rongz/ESIM/data/predict/quora/test_tp.csv'
true_negative_path='/home/rongz/ESIM/data/predict/quora/test_tn.csv'
false_positive_path='/home/rongz/ESIM/data/predict/quora/test_fp.csv'
false_negative_path='/home/rongz/ESIM/data/predict/quora/test_fn.csv'

In [None]:
true_positive_df.to_csv(true_positive_path, header=['id','sent1','sent2','label','predict'],index=False)
true_negative_df.to_csv(true_negative_path, header=['id','sent1','sent2','label','predict'],index=False)
false_positive_df.to_csv(false_positive_path, header=['id','sent1','sent2','label','predict'],index=False)
false_negative_df.to_csv(false_negative_path, header=['id','sent1','sent2','label','predict'],index=False)