In [1]:
import json
import math
import pickle
import random
import time
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast
from torch.utils.data import DataLoader
from torch.utils.data import SequentialSampler, TensorDataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

2023-02-20 10:12:41.176355: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-20 10:12:41.372906: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-02-20 10:12:42.019409: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:/usr/local/cuda-11.6/lib64
2023-02-20 10:12:42.019482: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: 

In [2]:
from mnli_pre_training import EntailModel

In [3]:
import os
# Restrict this process to physical GPUs 2 and 3. With this mask in place,
# "cuda:0" inside the notebook refers to the first listed device (physical GPU 2).
# NOTE(review): this only takes effect if it runs before CUDA is first initialized.
os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"

In [4]:
# Value taxonomy, used below as: val_cats[label][level-1 value] -> iterable of description strings.
# Opened in a context manager so the file handle is closed deterministically.
with open("../data/value-categories.json") as fh:
    val_cats = json.load(fh)

# Dataset splits to process; each needs arguments-/labels-/level1-labels-<tag>.tsv under ../data.
# NOTE(review): a later cell also reads a "validation-zhihu" split from the pickled result and the
# recorded output of the build cell shows three splits -- add that tag here if it should be rebuilt.
tags = ["training", "validation"]

# split name -> {argument id -> example record}; filled by the build loop.
data_dict = {}

# Target share of negative options drawn from the "hard" pool (untagged values of a tagged label).
ratio_hard = 0.5

In [5]:
# Seed Python's RNG so the sampled negative ("adverse") options are the same on every run.
# The candidate pools below are also built in a deterministic order (no set iteration) because
# string hashing -- and therefore set order -- changes between kernel sessions.
random.seed(42)

for tag in tags:
    data_dict[tag] = {}
    arg_df = pd.read_csv("../data/arguments-"+tag+".tsv", sep="\t")
    label_df = pd.read_csv("../data/labels-"+tag+".tsv", sep="\t")
    level1_label_df = pd.read_csv("../data/level1-labels-"+tag+".tsv", sep="\t")
    # One row per argument carrying its text, its labels and its level-1 labels.
    merged_df = (
        arg_df
        .merge(label_df, how="inner", on="Argument ID")
        .merge(level1_label_df, how="inner", on="Argument ID")
        .reset_index(drop=True)
    )
    print(arg_df.shape, label_df.shape, level1_label_df.shape, merged_df.shape)

    labels = [i for i in label_df.columns if i != 'Argument ID']
    level_1 = [i for i in level1_label_df.columns if i != 'Argument ID']

    # option_map: argument id -> {tagged label -> {tagged level-1 value -> descriptions}}
    option_map = {}
    for ix, row in merged_df.iterrows():
        options = {}
        used = []
        for l in labels:
            tmp = {}
            if row[l] == 1:
                for l1 in val_cats[l].keys():
                    if row[l1] == 1:
                        tmp[l1] = val_cats[l][l1]
                        used.extend([l, l1])
                options[l] = tmp
        # Sanity check: every column tagged 1 must have been reached through the taxonomy.
        all_tagged = set([c for c in labels + level_1 if row[c] == 1])
        assert len(all_tagged.difference(set(used))) == 0
        option_map[row["Argument ID"]] = options

    for ix, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
        dct = {"id": row["Argument ID"], "stance": row["Stance"], "premise": row["Premise"],
               "conclusion": row["Conclusion"], "labels": list(option_map[row["Argument ID"]].keys())}
        stance = " against. " if dct["stance"] == "against" else " in favor of. "
        dct["sent"] = dct["premise"] + stance + dct["conclusion"]
        # Positive options: "<level-1 value> by <description>" for every tagged level-1 value.
        dct["opts"] = sorted(set(k2 + " by " + i
                                 for k, v in option_map[dct["id"]].items()
                                 for k2, v2 in v.items()
                                 for i in v2))

        # Hard negatives: untagged level-1 values that belong to a label the argument IS tagged with.
        na_options_hard = []
        for k, v in option_map[dct["id"]].items():
            l1_present = set(v.keys())
            l1_all = set(val_cats[k].keys())
            assert len(l1_all) >= len(l1_present)
            l1_not_present = l1_all.difference(l1_present)
            na_options_hard.extend([i + " by " + j for i in sorted(l1_not_present) for j in val_cats[k][i]])

        # Easy negatives: every option of every label the argument is NOT tagged with.
        tagged_labels = set(dct["labels"])
        na_options_easy = [k + " by " + j
                           for l in labels if l not in tagged_labels
                           for k, v in val_cats[l].items()
                           for j in v]
        random.shuffle(na_options_hard)
        random.shuffle(na_options_easy)

        # Draw as many negatives as positives; easy negatives fill whatever the hard pool cannot.
        hard_opts = na_options_hard[:int(len(dct["opts"])*ratio_hard)]
        easy_opts = na_options_easy[:(len(dct["opts"]) - len(hard_opts))]
        assert len(hard_opts) + len(easy_opts) == len(dct["opts"])
        dct["adverse_hard_opts"], dct["adverse_easy_opts"] = hard_opts, easy_opts
        data_dict[tag][row["Argument ID"]] = dct
        

(5393, 4) (5393, 21) (5393, 55) (5393, 78)


5393it [00:00, 5619.60it/s]


(1896, 4) (1896, 21) (1896, 55) (1896, 78)


1896it [00:00, 5696.55it/s]


(100, 4) (100, 21) (100, 55) (100, 78)


100it [00:00, 4874.60it/s]


In [6]:
# Persist the raw example records; the context manager flushes and closes the file.
with open("../data/data_dict_raw.pkl", "wb") as fh:
    pickle.dump(data_dict, fh)

### Pre-Training on multi_nli

In [25]:
# Load the "multi_nli" dataset; the cells below read its "train", "validation_matched"
# and "validation_mismatched" splits.
multi_nli = load_dataset("multi_nli")

Using custom data configuration default
Reusing dataset multi_nli (/home/csgrad/sougatas/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
def _as_binary_pairs(records):
    """Turn dataset records into [premise, hypothesis, target] lists.

    The target is 1 when the record's "label" equals 0 and 0 for every other label value.
    """
    pairs = []
    for rec in records:
        target = 1 if rec["label"] == 0 else 0
        pairs.append([rec["premise"], rec["hypothesis"], target])
    return pairs


train_examples = _as_binary_pairs(multi_nli["train"])
val1_examples = _as_binary_pairs(multi_nli["validation_matched"])
val2_examples = _as_binary_pairs(multi_nli["validation_mismatched"])
# Both validation sets are pooled into a single evaluation list.
val_examples = val1_examples + val2_examples
len(train_examples), len(val1_examples), len(val2_examples), len(val_examples)


(392702, 9815, 9832, 19647)

In [27]:
# Peek at one converted record: [premise, hypothesis, binary target].
train_examples[0]

['Conceptually cream skimming has two basic dimensions - product and geography.',
 'Product and geography are what make cream skimming work. ',
 0]

In [5]:
class EntailModel(nn.Module):
    """Encoder plus a single-logit linear head used to score one token-id sequence.

    The head width follows ``base_model.config.hidden_size`` so the same class works with
    encoders of any size; if the encoder exposes no such attribute the historical width of
    1024 is used, which keeps existing 1024-wide checkpoints loadable.

    NOTE(review): this notebook also imports an ``EntailModel`` from ``mnli_pre_training``;
    running this cell shadows that import -- confirm which definition a checkpoint expects.

    Args:
        base_model: module called as ``base_model(input_ids=..., attention_mask=...,
            return_dict=True)`` whose result provides ``"pooler_output"``.
        tokenizer: object providing ``pad_token_id``; positions equal to it are masked out.
    """

    def __init__(self, base_model, tokenizer):
        super().__init__()
        self.base_model = base_model
        self.tokenizer = tokenizer
        hidden_size = getattr(getattr(base_model, "config", None), "hidden_size", 1024)
        self.ff = nn.Linear(hidden_size, 1)

    def forward(self, input_ids):
        """Return the raw (pre-sigmoid) score for each row of ``input_ids``."""
        # Attend to every position that is not padding; the comparison result already
        # lives on the same device as input_ids.
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
        op = self.base_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        return self.ff(op["pooler_output"])

In [6]:
# Load tokenizer and encoder from the same checkpoint so the two cannot drift apart
# (the dataset-building cell further down also uses the "roberta-large" tokenizer).
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
roberta = RobertaModel.from_pretrained("roberta-large")

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def pad_sequence(pad, batch):
    """Right-pad every sequence in ``batch`` to the length of the longest one.

    Args:
        pad: filler element appended to the shorter sequences.
        batch: a sized collection of sequences (lists or tuples); it is not modified.

    Returns:
        A new list of lists, all of equal length. An empty ``batch`` yields ``[]``
        instead of raising.
    """
    # default=0 keeps max() from raising ValueError when the batch is empty.
    maxlen = max((len(seq) for seq in batch), default=0)
    return [list(seq) + [pad] * (maxlen - len(seq)) for seq in batch]

def format_examples(lst):
    """Tokenize text pairs with the module-level ``tokenizer`` and pad them to equal length.

    Each record is ``[text_a, text_b]`` or ``[text_a, text_b, ..., label]``. The two texts
    are tokenized separately and their id lists concatenated.
    NOTE(review): this is not the tokenizer's own pair encoding; presumably checkpoints
    trained on this layout depend on it, so it is left untouched.

    Returns:
        ``(padded_ids, labels)`` when at least one record carried a label, otherwise just
        ``padded_ids``.
    """
    encoded = []
    targets = []
    for record in tqdm(lst):
        first_ids = tokenizer(record[0]).input_ids
        second_ids = tokenizer(record[1]).input_ids
        encoded.append(first_ids + second_ids)
        # Anything beyond the two texts means the last field is the label.
        if len(record) > 2:
            targets.append(record[-1])
    padded = pad_sequence(tokenizer.pad_token_id, encoded)
    if targets:
        return (padded, targets)
    return padded

In [14]:
# Tokenize and pad the MNLI pairs; every record carries a label, so each call
# returns a (padded ids, labels) tuple.
train_x, train_y = format_examples(train_examples)
valid_x, valid_y = format_examples(val_examples)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 392702/392702 [02:13<00:00, 2943.79it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19647/19647 [00:06<00:00, 3004.36it/s]


In [15]:
# Bundle the formatted MNLI tensors-to-be and write them out; the context manager
# flushes and closes the file.
train_dict = {"train_x": train_x, "train_y": train_y, "valid_x": valid_x, "valid_y": valid_y}
with open("../data/train_dict_formatted.pkl", "wb") as fh:
    pickle.dump(train_dict, fh)

### Creating Entailment dataset

In [5]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
# Only unpickle files produced by this project: pickle can execute arbitrary code.
with open("../data/data_dict_raw.pkl", "rb") as fh:
    data_dict = pickle.load(fh)

# split name -> list of [sentence, option, target] with target 1 for tagged options
# and 0 for the sampled adverse (hard + easy) options.
exaple_pairs = {}

# NOTE(review): "validation-zhihu" must exist in the pickled dict, i.e. the build cell
# has to be run with that tag included, otherwise this raises KeyError.
for split in ["training", "validation", "validation-zhihu"]:
    exaple_pairs[split] = []
    for v in data_dict[split].values():
        positives = [[v["sent"], opt, 1] for opt in v["opts"]]
        negatives = [[v["sent"], opt, 0] for opt in v["adverse_hard_opts"] + v["adverse_easy_opts"]]
        exaple_pairs[split].extend(positives + negatives)

len(exaple_pairs["training"]), len(exaple_pairs["validation"]), len(exaple_pairs["validation-zhihu"])

(187058, 65900, 2254)

In [8]:
# Tokenize and pad the value-entailment pairs of each split; all records are labelled,
# so every call returns a (padded ids, labels) tuple.
train_x, train_y = format_examples(exaple_pairs["training"])
valid_x, valid_y = format_examples(exaple_pairs["validation"])
valid_zhihu_x, valid_zhihu_y = format_examples(exaple_pairs["validation-zhihu"])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 187058/187058 [01:11<00:00, 2602.82it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65900/65900 [00:25<00:00, 2608.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2254/2254 [00:00<00:00, 2458.92it/s]


In [10]:
# Bundle the formatted value-entailment data and write it out; the context manager
# flushes and closes the file.
train_dict = {"train_x": train_x, "train_y": train_y,
              "valid_x": valid_x, "valid_y": valid_y,
              "valid_zhihu_x": valid_zhihu_x, "valid_zhihu_y": valid_zhihu_y}
with open("../data/train_dict_formatted_value_dataset.pkl", "wb") as fh:
    pickle.dump(train_dict, fh)

### Predict on Test Set

In [11]:
model_name = "roberta_values_finetuning.pt"
# Always a torch.device, on GPU and CPU alike, so downstream code sees one type.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# The recorded output of this cell reports the "roberta-large" checkpoint and the scoring
# head defined in this notebook is 1024 wide, so load the large model (and its tokenizer).
# NOTE(review): confirm that roberta_values_finetuning.pt was trained on roberta-large.
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
roberta = RobertaModel.from_pretrained("roberta-large")
model = EntailModel(roberta, tokenizer).to(device)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# map_location remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU still loads on a CPU-only machine or under a different GPU numbering.
# Only load checkpoints from a trusted source: torch.load unpickles the file.
state_dict = torch.load(model_name, map_location=device)
model.load_state_dict(state_dict)
# Inference mode: disables dropout.
model.eval()
print("Model loaded!")

Model loaded!


In [21]:
# Split to predict on; also used below to name the prediction output files.
split = "test"
arg_df = pd.read_csv("../data/arguments-"+split+".tsv", sep="\t")
arg_df.shape

(1896, 4)

In [22]:
# Only unpickle files produced by this project: pickle can execute arbitrary code.
with open("../data/data_dict_raw.pkl", "rb") as fh:
    data_dict_raw = pickle.load(fh)
len(data_dict_raw)

2

In [23]:
# Every distinct option string seen in training or validation, positive or adverse.
# Sorted so the position of each option is identical in every kernel session: the per-argument
# score lists saved later are indexed by position in this list, and plain set order changes
# from run to run.
all_options = sorted({
    opt
    for v in {**data_dict_raw["training"], **data_dict_raw["validation"]}.values()
    for opt in v["opts"] + v["adverse_hard_opts"] + v["adverse_easy_opts"]
})
len(all_options)

218

In [24]:
batch_size = 32
# argument id -> list of sigmoid scores, one per entry of all_options (same order).
all_res = {}

# The option texts are the same for every argument, so tokenize them once instead of once
# per argument. The ids built below match what format_examples produces for
# [sentence, option] pairs: the two texts tokenized separately, concatenated, then padded.
option_ids = [tokenizer(o).input_ids for o in all_options]

with torch.no_grad():
    for ix, row in tqdm(arg_df.iterrows(), total=len(arg_df)):
        stance = " against. " if row["Stance"] == "against" else " in favor of. "
        sent = row["Premise"] + stance + row["Conclusion"]
        sent_ids = tokenizer(sent).input_ids
        test_x = torch.tensor(
            pad_sequence(tokenizer.pad_token_id, [sent_ids + ids for ids in option_ids])
        ).to(device)

        tmp_res = []
        # Score the options in chunks of batch_size rows.
        for st in range(0, test_x.shape[0], batch_size):
            scores = torch.sigmoid(model(input_ids=test_x[st: st + batch_size, :]))
            tmp_res.extend(scores.squeeze(-1).tolist())
        assert len(tmp_res) == len(all_options)
        all_res[row["Argument ID"]] = tmp_res

# Despite "logit" in the file name, the stored values are sigmoid outputs in [0, 1].
with open("../data/"+split+"_prediction_logit_dict_v1.pkl", "wb") as fh:
    pickle.dump(all_res, fh)

0it [00:00, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 218/218 [00:00<00:00, 2311.24it/s][A
1it [00:00,  4.46it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 218/218 [00:00<00:00, 2568.78it/s][A
2it [00:00,  4.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 218/218 [00:00<00:00, 3155.72it/s][A
3it [00:00,  4.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████

In [25]:
# Number of arguments that received predictions.
len(all_res)

1896

In [28]:
# Level-1 value name -> the label (top-level key of val_cats) it belongs to.
l1_l0_map = {k2: k for k, v in val_cats.items() for k2 in v.keys()}

# Label of each option, aligned index-by-index with all_options.
all_option_labels = []
for i in all_options:
    # Options were built as "<level-1 value> by <description>". Split on the exact " by "
    # separator: a bare "by" would cut inside any value name that contains those letters.
    l1 = i.split(" by ", 1)[0].strip()
    all_option_labels.append(l1_l0_map[l1])

In [29]:
# Result key -> minimum score an option needs for its label to be predicted.
thresholds = {"thresh_5": 0.5, "thresh_8": 0.8, "thresh_9": 0.9}

# argument id -> {result key -> labels having at least one option scored >= the cutoff}
res_lbl_dct = {}
for k, v in all_res.items():
    arr = np.asarray(v)
    res_lbl_dct[k] = {
        # Sorted so the stored label lists are in a stable order.
        name: sorted({all_option_labels[i] for i in np.where(arr >= cutoff)[0].tolist()})
        for name, cutoff in thresholds.items()
    }

with open("../data/"+split+"_prediction_label_dict_v1.pkl", "wb") as fh:
    pickle.dump(res_lbl_dct, fh)

In [30]:
# Number of arguments with thresholded label predictions.
len(res_lbl_dct)

1896

 ### Submission Formatting

In [4]:
# Split whose saved predictions are turned into a submission file below.
split = "test"

In [5]:
# Only unpickle files produced by this project: pickle can execute arbitrary code.
with open("../data/"+split+"_prediction_label_dict_v1.pkl", "rb") as fh:
    res_lbl_dct = pickle.load(fh)

In [6]:
# Only the header is needed (the id column followed by the label columns), so parse
# zero data rows instead of the whole file.
col_names = list(pd.read_csv("../data/labels-validation.tsv", sep="\t", nrows=0).columns)

In [11]:
# One output row per argument: its id followed by a 0/1 flag for every label column,
# set when the label is among the argument's "thresh_9" predictions.
label_columns = col_names[1:]
op_lst = []
for arg_id, v in res_lbl_dct.items():
    predicted = v["thresh_9"]
    flags = [1 if name in predicted else 0 for name in label_columns]
    op_lst.append([arg_id] + flags)
op_df = pd.DataFrame(op_lst, columns=col_names)
op_df.shape

(1896, 21)

In [13]:
# Write the submission as a tab-separated file in the notebook's directory, without the index.
op_df.to_csv("./"+split+"_entailment_run3.tsv", sep="\t", index=False)