# necessary libraries

In [1]:
#!pip install wandb -q
#!wandb login
!pip install pyyaml==5.4.1
!pip install datasets -q
!pip install transformers[sentencepiece] -q
!pip install sentence-transformers -q

from datasets import load_dataset, load_metric
from pprint import pprint
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import gc
import glob
import joblib
import json
import numpy as np
import os
import pandas as pd
import random
import re
import torch
import ast
import warnings
from google.colab import drive
drive.mount('/content/drive')
warnings.filterwarnings("ignore")
gc.enable()

# Heavy, optional third-party dependencies. The import is best-effort so the
# cell still runs when one of them (e.g. wandb, whose install line is
# commented out above) is missing, but the failure is reported instead of
# being swallowed: otherwise the first symptom is an unrelated NameError much
# later in the notebook.
try:
    from sentence_transformers import models, SentenceTransformer, losses, InputExample
    from transformers import AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig, AutoTokenizer, EarlyStoppingCallback
    import sentence_transformers
    import transformers
    transformers.logging.set_verbosity_error()
    import wandb
except Exception as import_error:
    # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate. print() is used because warnings are globally silenced above.
    print("Optional dependency import failed: {!r}".format(import_error))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyyaml==5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 5.0 MB/s 
[?25hInstalling collected packages: pyyaml
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed pyyaml-5.4.1
[K     |████████████████████████████████| 346 kB 5.1 MB/s 
[K     |████████████████████████████████| 140 kB 88.3 MB/s 
[K     |████████████████████████████████| 212 kB 97.6 MB/s 
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 76.8 MB/s 
[K     |████████████████████████████████| 86 kB 5.8 MB/s 
[K     |████████████████████████████████| 127 kB 77.9 MB/s 
[K     |████████████████████████████████| 94 kB 3.8 MB/s 
[K     |████████████████████████████

# building datasets

In [None]:
class DataOps:
    """Download, normalise and persist the stage-1 and stage-2 training corpora.

    Every source dataset is converted to a frame with the columns
    ``sentence1``, ``sentence2`` and ``label``. Stage-1 frames carry binary
    labels; stage-2 (NLI) frames use the 3-way mapping in ``label_map``.
    ``create_dataset`` is the entry point; the ``get_*`` methods are helpers.
    """

    def __init__(self):
        self.label_map = {
            0: "entailment",
            1: "neutral",
            2: "contradiction"
        }

        self.dataset_metadata = {"stage2_datasets": ["anli",
                                                      "mnli",
                                                      "FEVER",
                                                      "conjNLI",
                                                      "EQUATE"],

                                 "stage1_dataset": ["mrpc",
                                                    "qnli",
                                                    "rte",
                                                    "stsb",
                                                    "wnli",
                                                    "paws",
                                                    "swag",
                                                    "qqp",
                                                    "art"],

                                 "label_map": self.label_map}

        # Maps each source dataset's column name onto the canonical layout.
        self.mapper = {"premise": "sentence1",
                       "hypothesis": "sentence2",
                       "label": "label",
                       "sentence_A": "sentence1",
                       "sentence_B": "sentence2",
                       "sentence1": "sentence1",
                       "sentence2": "sentence2",
                       "question": "sentence1",
                       "sentence": "sentence2",
                       "question1": "sentence1",
                       "question2": "sentence2"}

        self.final_col_names = ["sentence1", "sentence2", "label"]
        self.stage1_train_df = pd.DataFrame(columns=self.final_col_names)
        self.stage2_train_df = pd.DataFrame(columns=self.final_col_names)

    def _collect_columns(self, dataset, features, split_names):
        """Concatenate ``features`` across ``split_names`` into a canonical frame.

        ``dataset`` only needs to support ``dataset[split][feature]`` returning
        an iterable of values. Source column names are translated through
        ``self.mapper``.
        """
        columns = {}

        for feature in features:
            values = []

            for split_name in split_names:
                values.extend(dataset[split_name][feature])

            columns[self.mapper[feature]] = values

        return pd.DataFrame(columns, columns=self.final_col_names)

    @staticmethod
    def _clean_frame(df):
        """Return ``df`` without incomplete or duplicate rows, ``label`` cast to int.

        Rows with missing values are dropped *before* the cast because
        ``astype(int)`` raises on NaN (e.g. a label string that the source
        specific mapping did not recognise).
        """
        cleaned = df.dropna(how="any", axis=0).copy()
        cleaned["label"] = cleaned["label"].astype(int)
        return cleaned.drop_duplicates().reset_index(drop=True)

    def get_anli_data(self):
        """helper for getting ANLI data (0-1-2)

        Appends the three training rounds to ``self.stage2_train_df`` and sets
        ``self.stage2_val_df`` (dev rounds) and ``self.stage2_test_df`` (test
        rounds, with an extra ``round`` column naming the source split).
        """
        dataset = load_dataset('anli')
        features = ["premise", "hypothesis", "label"]

        # train data
        train_df = self._collect_columns(dataset, features,
                                         ["train_r{}".format(i) for i in range(1, 4)])
        self.stage2_train_df = pd.concat([self.stage2_train_df, train_df],
                                         ignore_index=True)

        # val data
        self.stage2_val_df = self._collect_columns(dataset, features,
                                                   ["dev_r{}".format(i) for i in range(1, 4)])

        # test data: keep the round so results can be reported per round
        test_columns = {"sentence1": [], "round": [], "sentence2": [], "label": []}

        for split_name in ["test_r1", "test_r2", "test_r3"]:
            premises = dataset[split_name]["premise"]
            test_columns["sentence1"].extend(premises)
            test_columns["round"].extend([split_name] * len(premises))
            test_columns["sentence2"].extend(dataset[split_name]["hypothesis"])
            test_columns["label"].extend(dataset[split_name]["label"])

        self.stage2_test_df = pd.DataFrame(test_columns)

    def get_mnli_data(self):
        """helper for getting MNLI data (0-1-2)"""
        dataset = load_dataset('glue', 'mnli')
        return self._collect_columns(dataset,
                                     ["premise", "hypothesis", "label"],
                                     ["train", "validation_matched", "validation_mismatched"])

    def get_fever_data(self):
        """helper for getting FEVER-NLI dataset

        Reads ``/content/train_fitems.jsonl`` (one JSON object per line with
        ``context``, ``query`` and ``label`` keys) and maps the string labels
        onto the 3-way NLI ids.
        """
        with open('/content/train_fitems.jsonl', 'r') as json_file:
            # Skip blank lines so a trailing newline does not break json.loads.
            records = [json.loads(line) for line in json_file if line.strip()]

        fever = pd.DataFrame([(elem["context"], elem["query"], elem["label"]) for elem in records],
                             columns=self.final_col_names)
        fever["label"] = fever["label"].map({'NOT ENOUGH INFO': 1, 'REFUTES': 2, 'SUPPORTS': 0})

        return fever

    def get_glue_data(self, dataset_name, feature_list, split_types):
        """helper for getting GLUE data

        Args:
            dataset_name: GLUE configuration name passed to ``load_dataset``.
            feature_list: source column names (sentence 1, sentence 2, label).
            split_types: split names to concatenate.

        Returns:
            A frame in the canonical column layout. For ``rte`` the two label
            ids are swapped; for ``stsb`` the score is binarised (``<= 3`` -> 0,
            otherwise 1).
        """
        dataset = load_dataset('glue', dataset_name)
        data = self._collect_columns(dataset, feature_list, split_types)

        if dataset_name == "rte":
            data["label"] = [1 if elem == 0 else 0 for elem in data["label"]]

        elif dataset_name == "stsb":
            data["label"] = [0 if label <= 3 else 1 for label in data["label"]]

        return data

    def get_paws_data(self):
        """helper for loading PAWS data (0-1)"""
        dataset = load_dataset('paws', "labeled_final")
        return self._collect_columns(dataset,
                                     ['sentence1', 'sentence2', 'label'],
                                     ["train", "validation", "test"])

    def get_swag_data(self):
        """helper for loading swag data

        Pairs ``sent1`` with ``sent2`` plus the ending selected by the row's
        ``label``; every resulting pair gets label 1.
        """
        dataset = load_dataset('swag')

        sent1_l = []
        sent2_l = []

        for row in dataset["train"]:
            sent1_l.append(row["sent1"])
            sent2_l.append(row["sent2"] + " " + row["ending" + str(row["label"])])

        swag_df = pd.DataFrame({"sentence1": sent1_l, "sentence2": sent2_l})
        swag_df["label"] = 1

        return swag_df

    def get_art_data(self):
        """helper for loading ART data

        Joins the two observations as sentence 1 and uses the hypothesis
        selected by the row's ``label`` as sentence 2; every pair gets label 1.
        """
        dataset = load_dataset('art')

        sent1_l = []
        sent2_l = []

        for split_name in ["train", "validation"]:
            for row in dataset[split_name]:
                sent1_l.append(row["observation_1"] + " " + row["observation_2"])
                sent2_l.append(row["hypothesis_{}".format(row["label"])])

        art_df = pd.DataFrame({"sentence1": sent1_l, "sentence2": sent2_l})
        art_df["label"] = 1

        return art_df

    def get_conjNLI_data(self):
        """helper for getting conjNLI dataset"""
        merged_df = pd.concat([pd.read_csv("/content/adversarial_train_15k.tsv.txt", sep="\t").dropna(),
                               pd.read_csv("/content/conj_dev.tsv.txt", sep="\t").dropna()],
                              ignore_index=True)

        # NOTE(review): assumes both TSV files have exactly three columns in
        # (sentence1, sentence2, label) order - confirm against the files.
        merged_df.columns = ["sentence1", "sentence2", "label"]
        merged_df["label"] = merged_df["label"].map({'neutral': 1, 'contradiction': 2, 'entailment': 0})
        return merged_df

    def get_EQUATE_data(self):
        """helper for getting the EQUATE datasets

        Each JSONL file is loaded best-effort: a file that cannot be read or
        parsed is reported and skipped. If nothing could be loaded an empty
        frame with the canonical columns is returned.
        """
        paths = ["/content/AWPNLI.jsonl", "/content/NewsNLI.jsonl", "/content/RedditNLI.jsonl", "/content/RTE_Quant.jsonl", "/content/StressTest.jsonl"]
        frames = []

        for path in tqdm(paths):
            try:
                with open(path, 'r') as json_file:
                    records = [json.loads(json_str) for json_str in json_file]

                frames.append(pd.DataFrame(
                    [(elem["sentence1"], elem["sentence2"], elem["gold_label"]) for elem in records],
                    columns=self.final_col_names))

            except Exception as e:
                print(path)
                print(e)

        if frames:
            result_df = pd.concat(frames, ignore_index=True)
        else:
            # Naming the columns up front keeps the label mapping below valid
            # even when every file failed to load.
            result_df = pd.DataFrame(columns=self.final_col_names)

        result_df["label"] = result_df["label"].map({'neutral': 1, 'contradiction': 2, 'entailment': 0})
        return result_df

    def create_dataset(self, output_dir="/content/drive/MyDrive/deep_learning_project/data"):
        """main routine for this class.

        Builds the stage-1 and stage-2 training frames, cleans them, writes
        the train/val/test parquet files plus a metadata summary into
        ``output_dir`` and triggers a Colab download of the metadata file.

        Args:
            output_dir: directory receiving the parquet and metadata files.
        """
        # STAGE 1
        stage1_dict = {
            'mrpc': (['sentence1', 'sentence2', 'label'], ["train", "validation", "test"]),
            "qnli": (['question', 'sentence', 'label'], ["train", "validation"]),
            "rte": (['sentence1', 'sentence2', 'label'], ["train", "validation"]),
            'stsb': (['sentence1', 'sentence2', 'label'], ["train", "validation"]),
            'wnli': (['sentence1', 'sentence2', 'label'], ["train", "validation"]),
            "qqp": (['question1', 'question2', 'label'], ["train", "validation"]),
        }

        stage1_frames = [self.get_glue_data(name, features, splits)
                         for name, (features, splits) in stage1_dict.items()]

        # Call the loaders inside the loop so the progress bar tracks loading.
        for loader in tqdm([self.get_paws_data, self.get_swag_data, self.get_art_data]):
            stage1_frames.append(loader())

        self.stage1_train_df = pd.concat([self.stage1_train_df] + stage1_frames,
                                         ignore_index=True)

        print("Stage 1 done.")

        # STAGE 2: ANLI also provides the validation and test frames.
        self.get_anli_data()

        stage2_frames = []
        for loader in tqdm([self.get_mnli_data, self.get_fever_data,
                            self.get_EQUATE_data, self.get_conjNLI_data]):
            stage2_frames.append(loader())

        self.stage2_train_df = pd.concat([self.stage2_train_df] + stage2_frames,
                                         ignore_index=True)

        print("Stage 2 done.")

        # drop_duplicates()/dropna() return new frames, so the result has to be
        # assigned back for the cleaning to reach the saved files.
        self.stage1_train_df = self._clean_frame(self.stage1_train_df)
        self.stage2_train_df = self._clean_frame(self.stage2_train_df)

        # save results
        os.makedirs(output_dir, exist_ok=True)
        self.stage1_train_df.to_parquet(os.path.join(output_dir, "stage1_train.parquet"), index=False)
        self.stage2_train_df.to_parquet(os.path.join(output_dir, "stage2_train.parquet"), index=False)
        self.stage2_val_df.to_parquet(os.path.join(output_dir, "stage2_val.parquet"), index=False)
        self.stage2_test_df.to_parquet(os.path.join(output_dir, "stage2_test.parquet"), index=False)

        self.dataset_metadata["concatenated"] = {}

        for key, data in {
            "stage1_train": self.stage1_train_df,
            "stage2_train": self.stage2_train_df,
            "val": self.stage2_val_df,
            "test": self.stage2_test_df}.items():

            # Whitespace-token counts over both sentence columns.
            token_counts = [len(elem.split()) for elem in data["sentence1"]]
            token_counts += [len(elem.split()) for elem in data["sentence2"]]

            self.dataset_metadata["concatenated"][key] = {
                "data_len": len(data),
                "avg_text_len": np.mean(token_counts),
                "label_ratio": data["label"].value_counts(normalize=True)
            }

        pprint(self.dataset_metadata)

        metadata_path = os.path.join(output_dir, "final_metadata.joblib")
        joblib.dump(self.dataset_metadata, metadata_path)

        # Imported here because only `drive` is imported at the top of the file.
        from google.colab import files as colab_files
        colab_files.download(metadata_path)
        print("Done.")

    
# Build every dataset and write the parquet/metadata files (see create_dataset).
dataops = DataOps()
dataops.create_dataset()

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset paws (/root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)


  0%|          | 0/3 [00:00<?, ?it/s]

No config specified, defaulting to: swag/regular
Reusing dataset swag (/root/.cache/huggingface/datasets/swag/regular/0.0.0/9640de08cdba6a1469ed3834fcab4b8ad8e38caf5d1ba5e7436d8b1fd067ad4c)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset art (/root/.cache/huggingface/datasets/art/anli/0.1.0/e4b20acfcea873d587a87e817a63c02ce080bce28cd4c322dbd476fd07286b49)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset anli (/root/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b)


Stage 1 done.


  0%|          | 0/9 [00:00<?, ?it/s]

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Stage 2 done.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done.


# model training framework

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` so the value is visible to child
    processes.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU and every CUDA device).
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


class CustomDataset(Dataset):
    """Map-style dataset over a frame of tokenizer outputs and a label series.

    Each cell of ``encodings`` is expected to hold a nested sequence whose
    first element is the actual token list; that first element is what gets
    converted to a tensor.
    """

    def __init__(self, encodings, labels):
        self.required_cols = ['input_ids', 'token_type_ids', 'attention_mask']
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}

        for column in self.encodings.columns:
            cell = self.encodings.loc[idx, column]
            # Unwrap the leading singleton dimension before building the tensor.
            sample[column] = torch.tensor(cell[0])

        sample['labels'] = torch.tensor(self.labels.loc[idx])
        return sample


def encode(examples):
    """Tokenize the ``sentence1``/``sentence2`` pair with the module-level tokenizer.

    The pair is padded and truncated to ``MAX_LENGTH`` tokens and the result
    is requested as PyTorch tensors.
    """
    first_text = examples["sentence1"]
    second_text = examples["sentence2"]

    encoded = tokenizer(
        first_text,
        second_text,
        padding='max_length',  # always pad up to max_length
        truncation=True,  # cut anything longer than max_length
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )
    return encoded


def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a ``(logits, labels)`` evaluation pair.

    Predictions are the arg-max over the last axis of the logits.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy_metric = load_metric("accuracy")
    f1_metric = load_metric("f1")

    accuracy_result = accuracy_metric.compute(predictions=predictions, references=labels)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="macro")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


def delete_path(path):
    """Recursively remove the directory tree rooted at ``path``."""
    from shutil import rmtree

    rmtree(path)

    

class ModelTrainer:
    """Two-stage fine-tuning driver.

    Stage 1 trains a sentence-transformers bi-encoder on sentence pairs with a
    configurable loss; stage 2 fine-tunes a 3-way sequence classifier on NLI
    data with the Hugging Face ``Trainer``.

    Args:
        experiment_params: nested configuration dict. Must contain ``stage1``
            and ``stage2`` sub-dicts (each with at least ``model_path``) plus
            the top-level keys read in ``train_model``.
        run_name: label of this run.
    """

    # Supported values of params["stage1"]["loss"] -> sentence-transformers
    # loss class name (resolved lazily in get_loss).
    _STAGE1_LOSSES = {
        "contrastive": "ContrastiveLoss",
        "online_contrastive": "OnlineContrastiveLoss",
        "mnr": "MultipleNegativesRankingLoss",
        "cosine": "CosineSimilarityLoss",
    }

    def __init__(self,
                 experiment_params,
                 run_name):
        self.params = experiment_params
        self.run_name = run_name

        # Output directories for both stages (no-op when they already exist).
        for stage in ("stage1", "stage2"):
            os.makedirs(self.params[stage]["model_path"], exist_ok=True)

        self.label_map = {
            0: "entailment",
            1: "neutral",
            2: "contradiction"
        }

        self.seed = 2022
        set_seed(self.seed)

    def get_loss(self):
        """helper for selecting appropriate loss function

        Returns:
            The loss named by ``params["stage1"]["loss"]``, built on
            ``self.model``.

        Raises:
            ValueError: if the configured loss name is not supported.
        """
        loss_name = self.params["stage1"]["loss"]

        if loss_name not in self._STAGE1_LOSSES:
            raise ValueError(
                "Unknown stage1 loss {!r}; expected one of {}".format(
                    loss_name, sorted(self._STAGE1_LOSSES)))

        loss_class = getattr(losses, self._STAGE1_LOSSES[loss_name])
        return loss_class(self.model)

    def load_data(self, data_path, is_nli=False, round=None):
        """helper for loading data for training & testing

        Args:
            data_path: parquet file to read.
            is_nli: when True, tokenize with ``encode`` and return a
                ``CustomDataset``; otherwise return a loader of
                ``InputExample`` pairs for stage 1.
            round: optional value of the ``round`` column to keep (NLI only).

        Returns:
            A ``CustomDataset`` (NLI) or a data loader over ``InputExample``s.
        """
        if is_nli:
            df = pd.DataFrame(load_dataset('parquet', data_files=data_path, split='train').map(encode))

            if round is not None:
                df = df[df["round"] == round].reset_index(drop=True)

            return CustomDataset(df[['input_ids', 'token_type_ids', 'attention_mask']], df["label"])

        df = pd.read_parquet(data_path)
        batch_size = self.params["stage1"]["batch_size"]
        sentence_pairs = zip(df["sentence1"].to_numpy(), df["sentence2"].to_numpy())
        progress = tqdm(sentence_pairs, total=len(df), desc="Preparing data...")

        if self.params["stage1"]["loss"] != "mnr":
            # Iterating the columns directly avoids one df.loc lookup per cell.
            labels = df["label"].to_numpy()
            samples = [InputExample(texts=[sentence1, sentence2], label=label)
                       for (sentence1, sentence2), label in zip(progress, labels)]

            return DataLoader(samples, shuffle=True, batch_size=batch_size)

        # NOTE(review): for the "mnr" loss every pair is used without its
        # label - confirm that pairs labelled 0 are meant to be kept here.
        samples = [InputExample(texts=[sentence1, sentence2])
                   for sentence1, sentence2 in progress]

        return sentence_transformers.datasets.NoDuplicatesDataLoader(samples,
                                                                     batch_size=batch_size)

    def train_model(self,
                    mode="train",
                    do_stage1=False,
                    do_stage2=True,
                    use_wandb=False):
        """main routine for running training session

        Args:
            mode: unused; kept for backward compatibility.
            do_stage1: run the sentence-pair fine-tuning stage.
            do_stage2: run the NLI classification stage.
            use_wandb: open/close a wandb run and let the stage-2 ``Trainer``
                report to it.

        The experiment configuration is written to ``params["config_path"]``
        at the end; the ``stage1`` section is left out of that file when only
        stage 2 was run.
        """
        if use_wandb:
            wandb.init(project=self.params["project_name"],
                       name=self.params["run_name"],
                       entity="deep_learning_project_597_bogazici",
                       notes=self.params["run_notes"])

        # Part 1 - fine tuning

        if do_stage1:
            self.loader = self.load_data(self.params["stage1"]["data_path"])
            # NOTE(review): the encoder name is hard-coded here rather than
            # read from params["stage1"]["model_name"] - confirm intent.
            deberta = models.Transformer("microsoft/deberta-v3-base",
                                         max_seq_length=MAX_LENGTH)

            embedding_dim = deberta.get_word_embedding_dimension()
            if self.params["stage1"]["pooling_mode"] == "cls_token":
                pooler = models.Pooling(embedding_dim, pooling_mode_cls_token=True)
            else:
                pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

            self.model = SentenceTransformer(modules=[deberta, pooler])
            self.stage1_loss = self.get_loss()

            self.model.fit(
                train_objectives=[(self.loader, self.stage1_loss)],
                epochs=self.params["stage1"]["n_epochs"],
                # Warm up over the first 10% of all training steps.
                warmup_steps=int(len(self.loader) * self.params["stage1"]["n_epochs"] * 0.1),
                output_path=self.params["stage1"]["model_path"],
                show_progress_bar=True,
                use_amp=True,
                weight_decay=self.params["stage1"]["weight_decay"]
            )

            print("{} loss fine-tuning done.".format(self.params["stage1"]["loss"]))

            # Free GPU memory before the classifier is loaded.
            del self.model, self.stage1_loss, self.loader
            gc.collect()
            torch.cuda.empty_cache()

        # Part 2 - NLI fine tuning

        if do_stage2:
            # NLI Softmax Loss read data
            self.train_dataset = self.load_data(self.params["stage2"]["train_data_path"],
                                                is_nli=True)

            self.eval_dataset = self.load_data(self.params["stage2"]["val_data_path"],
                                               is_nli=True)

            # load model
            # NOTE(review): after stage 1 this loads params["stage1"]["model_name"],
            # not the stage-1 output directory (params["stage1"]["model_path"]) -
            # confirm which checkpoint stage 2 should start from.
            if do_stage1:
                model_path = self.params["stage1"]["model_name"]
            else:
                model_path = self.params["stage2"]["model_name"]

            config = AutoConfig.from_pretrained(
                model_path,
                num_labels=3
            )

            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
                config=config,
            )

            training_args = TrainingArguments(
                output_dir=self.params["stage2"]["model_path"],
                num_train_epochs=self.params["stage2"]["n_epochs"],
                per_device_train_batch_size=self.params["stage2"]["batch_size"],
                per_device_eval_batch_size=self.params["stage2"]["batch_size"],
                learning_rate=self.params["stage2"]["learning_rate"],
                warmup_ratio=self.params["stage2"]["warmup_ratio"],
                weight_decay=self.params["stage2"]["weight_decay"],
                logging_steps=self.params["stage2"]["eval_steps"],
                # Only report to wandb when a run was actually opened above.
                report_to="wandb" if use_wandb else "none",
                evaluation_strategy="steps",
                eval_steps=self.params["stage2"]["eval_steps"],
                run_name=self.params["run_name"],
                disable_tqdm=False,
                fp16=True,
                seed=self.seed,
                log_level='error',
                load_best_model_at_end=True,
                save_total_limit=self.params["stage2"]["save_total_limit"],
                save_strategy="steps",
                save_steps=self.params["stage2"]["eval_steps"],
            )

            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=self.train_dataset,
                eval_dataset=self.eval_dataset,
                compute_metrics=compute_metrics,
                # Callbacks are a Trainer argument; TrainingArguments rejects them.
                # NOTE(review): a threshold of 0.5 with patience 1 is very
                # strict - confirm these values.
                callbacks=[EarlyStoppingCallback(early_stopping_patience=1,
                                                 early_stopping_threshold=0.5)],
            )

            trainer.train()
            torch.cuda.empty_cache()
            print("NLI training done.")

        if use_wandb:
            wandb.finish()

        # save experiment configuration
        # The unused stage1 section is skipped while writing instead of being
        # deleted from self.params, so the caller's dict stays intact and
        # train_model can be called again.
        skip_stage1 = do_stage2 and not do_stage1

        with open(self.params["config_path"], 'w') as f:
            for key, value in self.params.items():
                if skip_stage1 and key == "stage1":
                    continue
                f.write('%s:%s\n' % (key, value))


# environment & wandb variables
BASE_PATH = "/content/drive/MyDrive/deep_learning_project/"
project_name = "softmax"
run_name = "softmax_exp4"
# NOTE(review): the notes say "3 epochs" while stage2 n_epochs below is 10 - confirm.
run_notes = "Softmax Loss, deberta-v3-large, 3 epochs"
# Empty because stage 1 is skipped in this run (do_stage1=False below).
stage1_loss = ""
MAX_LENGTH = 128
model_name = "microsoft/deberta-v3-large"
# Module-level tokenizer: encode() reads this global when tokenizing.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# '/content/drive/MyDrive/deep_learning_project/online_contrastive_exp2/online_contrastive_loss_model'


# experiment parameters
# Nested configuration consumed by ModelTrainer: top-level run settings plus
# one sub-dict per training stage.
params = {
    "project_name": project_name,
    "run_name": run_name,
    "run_notes": run_notes,
    "config_path":os.path.join(BASE_PATH, run_name, "experiment_config.txt"),
    "max_len": MAX_LENGTH,
    "dataset_metadata": "/content/drive/MyDrive/deep_learning_project/data/final_metadata.joblib",
    
  # Stage 1: sentence-pair fine-tuning settings.
  "stage1": {
      "batch_size": 32,
      "n_epochs": 5,
      "learning_rate": 2e-5,
      "loss": stage1_loss,
      "data_path": os.path.join(BASE_PATH, "data/stage1_train.parquet"),
      "model_path": os.path.join(BASE_PATH, run_name, "{}_loss_model".format(stage1_loss)),
      "maxlen": MAX_LENGTH,
      "pooling_mode": "mean_token",
      "weight_decay": 0.05,
      "model_name": model_name
  },

  # Stage 2: NLI classification settings.
  "stage2": {
      "batch_size": 64, #64
      "n_epochs": 10,
      "learning_rate": 2e-5,
      "train_data_path": os.path.join(BASE_PATH, "data/stage2_train.parquet"),
      "val_data_path": os.path.join(BASE_PATH, "data/stage2_val.parquet"),
      "test_data_path": os.path.join(BASE_PATH, "data/stage2_test.parquet"),
      "model_path": os.path.join(BASE_PATH, run_name, "nli_model"),
      "warmup_ratio": 0.1,
      "weight_decay": 0.05,
      # Used for logging, evaluation and checkpoint intervals alike.
      "eval_steps": 15750,
      "save_total_limit": 1,
      "model_name": model_name
  }
}

# current step: Gradual unfreezing layers for stage2 (using stage1 model directly.)
# next step: Gradual unfreezing for both models

# Creates the output directories and seeds the RNGs, then runs stage 2 only.
model_trainer = ModelTrainer(params, run_name)
model_trainer.train_model(do_stage1=False, do_stage2=True, use_wandb=True)

# model evaluation

In [5]:
from datasets import load_dataset, load_metric
from sentence_transformers import InputExample, models, SentenceTransformer, losses
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig, AutoTokenizer
import numpy as np
import pandas as pd
import torch
import transformers
transformers.logging.set_verbosity_error()

class CustomDataset(Dataset):
    """Indexable dataset pairing a frame of tokenizer outputs with labels.

    Every cell of ``encodings`` is expected to wrap the token list in an outer
    sequence; only its first element is turned into a tensor.
    """

    def __init__(self, encodings, labels):
        self.required_cols = ['input_ids', 'token_type_ids', 'attention_mask']
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        features = {}

        for name in self.encodings.columns:
            wrapped = self.encodings.loc[idx, name]
            # Strip the outer singleton dimension.
            features[name] = torch.tensor(wrapped[0])

        features['labels'] = torch.tensor(self.labels.loc[idx])
        return features


def encode(examples):
    """Tokenize the ``sentence1``/``sentence2`` pair with the module-level tokenizer.

    The pair is padded and truncated to 100 tokens and the result is requested
    as PyTorch tensors.
    """
    first_text = examples["sentence1"]
    second_text = examples["sentence2"]

    encoded = tokenizer(
        first_text,
        second_text,
        padding='max_length',  # always pad up to max_length
        truncation=True,  # cut anything longer than max_length
        max_length=100,
        return_tensors='pt',
    )
    return encoded


# Module-level tokenizer read by encode(); it has to exist before .map(encode) runs.
tokenizer = AutoTokenizer.from_pretrained(
    'microsoft/deberta-v3-base',
)

# NOTE(review): DataOps.create_dataset writes the test split to
# stage2_test.parquet - confirm nli_test.parquet is the intended file.
data_path = '/content/drive/MyDrive/deep_learning_project/data/nli_test.parquet'
# Loading a bare parquet file puts all rows under the "train" split.
test_dataset = load_dataset('parquet', data_files=data_path)
test_df = pd.DataFrame(test_dataset.map(encode)["train"])
# Rebinds test_dataset to the tensor-producing wrapper used for prediction.
test_dataset = CustomDataset(test_df[['input_ids', 'token_type_ids', 'attention_mask']], test_df["label"])
# Despite the name this is the full frame, not only the label column.
y_true = pd.read_parquet(data_path)

Using custom data configuration default-aba38aed9391bed1
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/default-aba38aed9391bed1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/parquet/default-aba38aed9391bed1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-1548699e7bf833f4.arrow


In [20]:
def evaluate_model(data_loader, model_path, tokenizer_name='microsoft/deberta-v3-base'):
    """helper for evaluating model

    Loads the 3-way classifier stored at ``model_path``, predicts on
    ``data_loader`` and prints accuracy and macro-F1 rounded to 3 decimals.

    Args:
        data_loader: dataset accepted by ``Trainer.predict``.
        model_path: checkpoint directory or model identifier.
        tokenizer_name: unused (the data is already tokenized); kept so
            existing calls keep working.
    """
    # Load trained model
    model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                               num_labels=3)

    # Define test trainer
    test_trainer = Trainer(model)

    # Make prediction
    raw_pred, label_ids, _ = test_trainer.predict(data_loader)

    # Preprocess raw predictions
    y_pred = np.argmax(raw_pred, axis=1)

    # Prefer the labels that came with the evaluated dataset so they always
    # line up with the predictions (e.g. when the dataset was filtered); fall
    # back to the parquet file only when the dataset carried no labels.
    if label_ids is not None:
        y_true = label_ids
    else:
        y_true = pd.read_parquet(data_path)["label"].values

    accuracy = round(accuracy_score(y_true, y_pred), 3)
    macro_f1 = round(f1_score(y_true, y_pred, average='macro'), 3)
    print("Accuracy score: {}".format(accuracy))
    print("F1 score: {}".format(macro_f1))


# Evaluate the checkpoint-60000 model of the online_contrastive_exp1 run on the test set built above.
evaluate_model(test_dataset, "/content/drive/MyDrive/deep_learning_project/online_contrastive_exp1/nli_model/checkpoint-60000")

loading configuration file /content/drive/MyDrive/deep_learning_project/online_contrastive_exp1/nli_model/checkpoint-60000/config.json
Model config DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/deep_learning_project/online_contrastive_exp1/nli_model/checkpoint-60000",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_

0.57375
0.5718619406402493
