In [12]:
import torch.nn as nn
from datasets import load_multitask_data
import bert
from config import PretrainedConfig
import torch
from datasets import SentencePairDataset
from tokenizer import BertTokenizer

In [52]:
# Get initialization parameters to validate the methods we create
class BertConfig(PretrainedConfig):
  model_type = "bert"


  def __init__(
    self,
    num_labels_sst ,
    num_labels_para ,
    num_labels_sts ,
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    gradient_checkpointing=False,
    position_embedding_type="absolute",
    use_cache=True,
    
    
    
    name_or_path = "checkpoint",
    **kwargs
  ):
    super().__init__(pad_token_id=pad_token_id, **kwargs)

    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range
    self.layer_norm_eps = layer_norm_eps
    self.gradient_checkpointing = gradient_checkpointing
    self.position_embedding_type = position_embedding_type
    self.use_cache = use_cache
    self.name_or_path = name_or_path
    # added by hasan for multitask
    self.num_labels_sst = num_labels_sst
    self.num_labels_para = num_labels_para
    self.num_labels_sts = num_labels_sts
    

In [14]:
import time, random, numpy as np, argparse, sys, re, os
from types import SimpleNamespace

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from bert import BertModel
from optimizer import AdamW
from tqdm import tqdm

from datasets import SentenceClassificationDataset, SentencePairDataset, \
    load_multitask_data, load_multitask_test_data

from evaluation import model_eval_sst, test_model_multitask


TQDM_DISABLE=True

# fix the random seed
def seed_everything(seed=11711):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


BERT_HIDDEN_SIZE = 768
N_SENTIMENT_CLASSES = 5


class MultitaskBERT(nn.Module):
    '''
    This module should use BERT for 3 tasks:

    - Sentiment classification (predict_sentiment)
    - Paraphrase detection (predict_paraphrase)
    - Semantic Textual Similarity (predict_similarity)
    '''
    def __init__(self, config):
        super(MultitaskBERT, self).__init__()
        # You will want to add layers here to perform the downstream tasks.
        # Pretrain mode does not require updating bert paramters.
        self.num_labels_sst = config.num_labels_sst
        self.num_labels_para = config.num_labels_para
        self.num_labels_sts = config.num_labels_sts


        self.bert = BertModel.from_pretrained('bert-base-uncased')#, local_files_only=config.local_files_only)
        for param in self.bert.parameters():
            if config.option == 'pretrain':
                param.requires_grad = False
            elif config.option == 'finetune':
                param.requires_grad = True
        ### TODO
        ## sst - sentiment, para, sts - semantic text similarity    
        self.drop = torch.nn.Dropout(p=0.3)
        self.sst_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.num_labels_sst)
        self.para_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.num_labels_para)
        self.sts_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.num_labels_sts)



    def forward(self, input_ids, attention_mask):
        'Takes a batch of sentences and produces embeddings for them.'
        # The final BERT embedding is the hidden state of [CLS] token (the first token)
        # Here, you can start by just returning the embeddings straight from BERT.
        # When thinking of improvements, you can later try modifying this
        # (e.g., by adding other layers).
        bert_out = self.bert(input_ids, attention_mask) 
        dropped = self.drop(bert_out['pooler_output'])
        out = self.linear(dropped)
        sentence_embeddings = out.last_hidden_state[:, 0, :]
        return sentence_embeddings


    def predict_sentiment(self, input_ids, attention_mask):
        '''Given a batch of sentences, outputs logits for classifying sentiment.
        There are 5 sentiment classes:
        (0 - negative, 1- somewhat negative, 2- neutral, 3- somewhat positive, 4- positive)
        Thus, your output should contain 5 logits for each sentence.
        '''
        ### TODO
        


    def predict_paraphrase(self,
                           input_ids_1, attention_mask_1,
                           input_ids_2, attention_mask_2):
        '''Given a batch of pairs of sentences, outputs a single logit for predicting whether they are paraphrases.
        Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
        during evaluation, and handled as a logit by the appropriate loss function.
        '''
        ### TODO
        raise NotImplementedError


    def predict_similarity(self,
                           input_ids_1, attention_mask_1,
                           input_ids_2, attention_mask_2):
        '''Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
        Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
        during evaluation, and handled as a logit by the appropriate loss function.
        '''
        ### TODO
        raise NotImplementedError




def save_model(model, optimizer, args, config, filepath):
    save_info = {
        'model': model.state_dict(),
        'optim': optimizer.state_dict(),
        'args': args,
        'model_config': config,
        'system_rng': random.getstate(),
        'numpy_rng': np.random.get_state(),
        'torch_rng': torch.random.get_rng_state(),
    }

    torch.save(save_info, filepath)
    print(f"save the model to {filepath}")


## Currently only trains on sst dataset
def train_multitask(args):
    device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
    # Load data
    # Create the data and its corresponding datasets and dataloader
    sst_train_data, num_labels,para_train_data, sts_train_data = load_multitask_data(args.sst_train,args.para_train,args.sts_train, split ='train')
    sst_dev_data, num_labels,para_dev_data, sts_dev_data = load_multitask_data(args.sst_dev,args.para_dev,args.sts_dev, split ='train')

    sst_train_data = SentenceClassificationDataset(sst_train_data, args)
    sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)

    sst_train_dataloader = DataLoader(sst_train_data, shuffle=True, batch_size=args.batch_size,
                                      collate_fn=sst_train_data.collate_fn)
    sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
                                    collate_fn=sst_dev_data.collate_fn)

    # Init model
    config = {'hidden_dropout_prob': args.hidden_dropout_prob,
              'num_labels': num_labels,
              'hidden_size': 768,
              'data_dir': '.',
              'option': args.option,
              'local_files_only': args.local_files_only}

    config = SimpleNamespace(**config)

    model = MultitaskBERT(config)
    model = model.to(device)

    lr = args.lr
    optimizer = AdamW(model.parameters(), lr=lr)
    best_dev_acc = 0

    # Run for the specified number of epochs
    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        num_batches = 0
        for batch in tqdm(sst_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
            b_ids, b_mask, b_labels = (batch['token_ids'],
                                       batch['attention_mask'], batch['labels'])

            b_ids = b_ids.to(device)
            b_mask = b_mask.to(device)
            b_labels = b_labels.to(device)

            optimizer.zero_grad()
            logits = model.predict_sentiment(b_ids, b_mask)
            loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            num_batches += 1

        train_loss = train_loss / (num_batches)

        train_acc, train_f1, *_ = model_eval_sst(sst_train_dataloader, model, device)
        dev_acc, dev_f1, *_ = model_eval_sst(sst_dev_dataloader, model, device)

        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            save_model(model, optimizer, args, config, args.filepath)

        print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")



def test_model(args):
    with torch.no_grad():
        device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
        saved = torch.load(args.filepath)
        config = saved['model_config']

        model = MultitaskBERT(config)
        model.load_state_dict(saved['model'])
        model = model.to(device)
        print(f"Loaded model to test from {args.filepath}")

        test_model_multitask(args, model, device)


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sst_train", type=str, default="data/ids-sst-train.csv")
    parser.add_argument("--sst_dev", type=str, default="data/ids-sst-dev.csv")
    parser.add_argument("--sst_test", type=str, default="data/ids-sst-test-student.csv")

    parser.add_argument("--para_train", type=str, default="data/quora-train.csv")
    parser.add_argument("--para_dev", type=str, default="data/quora-dev.csv")
    parser.add_argument("--para_test", type=str, default="data/quora-test-student.csv")

    parser.add_argument("--sts_train", type=str, default="data/sts-train.csv")
    parser.add_argument("--sts_dev", type=str, default="data/sts-dev.csv")
    parser.add_argument("--sts_test", type=str, default="data/sts-test-student.csv")

    parser.add_argument("--seed", type=int, default=11711)
    parser.add_argument("--epochs", type=int, default=2)
    parser.add_argument("--option", type=str,
                        help='pretrain: the BERT parameters are frozen; finetune: BERT parameters are updated',
                        choices=('pretrain', 'finetune'), default="pretrain")
    parser.add_argument("--use_gpu", action='store_true')

    parser.add_argument("--sst_dev_out", type=str, default="predictions/sst-dev-output.csv")
    parser.add_argument("--sst_test_out", type=str, default="predictions/sst-test-output.csv")

    parser.add_argument("--para_dev_out", type=str, default="predictions/para-dev-output.csv")
    parser.add_argument("--para_test_out", type=str, default="predictions/para-test-output.csv")

    parser.add_argument("--sts_dev_out", type=str, default="predictions/sts-dev-output.csv")
    parser.add_argument("--sts_test_out", type=str, default="predictions/sts-test-output.csv")

    # hyper parameters
    parser.add_argument("--batch_size", help='sst: 64 can fit a 12GB GPU', type=int, default=64)
    parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
    parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
                        default=1e-3)
    parser.add_argument("--local_files_only", action='store_true')

    args = parser.parse_args()
    return args


usage: ipykernel_launcher.py [-h] [--sst_train SST_TRAIN] [--sst_dev SST_DEV]
                             [--sst_test SST_TEST] [--para_train PARA_TRAIN]
                             [--para_dev PARA_DEV] [--para_test PARA_TEST]
                             [--sts_train STS_TRAIN] [--sts_dev STS_DEV]
                             [--sts_test STS_TEST] [--seed SEED]
                             [--epochs EPOCHS] [--option {pretrain,finetune}]
                             [--use_gpu] [--sst_dev_out SST_DEV_OUT]
                             [--sst_test_out SST_TEST_OUT]
                             [--para_dev_out PARA_DEV_OUT]
                             [--para_test_out PARA_TEST_OUT]
                             [--sts_dev_out STS_DEV_OUT]
                             [--sts_test_out STS_TEST_OUT]
                             [--batch_size BATCH_SIZE]
                             [--hidden_dropout_prob HIDDEN_DROPOUT_PROB]
                             [--lr LR] [--local_files_only]
ip

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [49]:
# -----------------------------------------------------------------------
# fresh start 

class MultitaskBERT(nn.Module):
    '''
    This module should use BERT for 3 tasks:

    - Sentiment classification (predict_sentiment)
    - Paraphrase detection (predict_paraphrase)
    - Semantic Textual Similarity (predict_similarity)
    '''
    def __init__(self, config):
        super(MultitaskBERT, self).__init__()
        # You will want to add layers here to perform the downstream tasks.
        # Pretrain mode does not require updating bert paramters.
        #self.num_labels = config.num_labels

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.num_labels_sst = config.num_labels_sst
        self.num_labels_para = config.num_labels_para
        self.num_labels_sts = config.num_labels_sts


        
        for param in self.bert.parameters():
            if config.option == 'pretrain':
                param.requires_grad = False
            elif config.option == 'finetune':
                param.requires_grad = True
        ### TODO
        ## sst - sentiment, para, sts - semantic text similarity    
        self.drop = torch.nn.Dropout(p=0.3)
        self.sst_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.num_labels_sst)
        self.para_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.num_labels_para)
        self.sts_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.num_labels_sts)



    def forward(self, input_ids, attention_mask):
        'Takes a batch of sentences and produces embeddings for them.'
        # The final BERT embedding is the hidden state of [CLS] token (the first token)
        # Here, you can start by just returning the embeddings straight from BERT.
        # When thinking of improvements, you can later try modifying this
        # (e.g., by adding other layers).
        bert_out = self.bert(input_ids, attention_mask) 
        dropped = self.drop(bert_out['pooler_output'])
        out = self.linear(dropped)
        sentence_embeddings = out.last_hidden_state[:, 0, :]
        return sentence_embeddings

    def predict_sentiment(self, input_ids, attention_mask):
        '''Given a batch of sentences, outputs logits for classifying sentiment.
        There are 5 sentiment classes:
        (0 - negative, 1- somewhat negative, 2- neutral, 3- somewhat positive, 4- positive)
        Thus, your output should contain 5 logits for each sentence.
        '''
   
       # Get the BERT embeddings
        bert_out = self.bert(input_ids, attention_mask)
        pooled_output = bert_out['pooler_output']
        dropped = self.drop(pooled_output)

        # Predict sentiment
        sentiment_logits = self.sst_classifier(dropped)

        return sentiment_logits
    
    def predict_paraphrase(self,
                           input_ids_1, attention_mask_1,
                           input_ids_2, attention_mask_2):
        '''Given a batch of pairs of sentences, outputs a single logit for predicting whether they are paraphrases.
        Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
        during evaluation, and handled as a logit by the appropriate loss function.
        '''
        ### TODO
            # Get the BERT embeddings for both sets of input sentences
        bert_out_1 = self.bert(input_ids_1, attention_mask_1)
        bert_out_2 = self.bert(input_ids_2, attention_mask_2)
        
        pooled_output_1 = bert_out_1['pooler_output']
        pooled_output_2 = bert_out_2['pooler_output']
        
        # Combine the pooled embeddings of both sentences using concatenation
        combined_pooled = torch.cat((pooled_output_1, pooled_output_2), dim=-1)
        dropped = self.drop(combined_pooled)
        
        # Predict paraphrase
        paraphrase_logits = self.para_classifier(dropped)
        
        return paraphrase_logits
     


    def predict_similarity(self,
                           input_ids_1, attention_mask_1,
                           input_ids_2, attention_mask_2):
        '''Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
        Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
        during evaluation, and handled as a logit by the appropriate loss function.
        '''
         # Get the BERT embeddings for both sets of input sentences
        bert_out_1 = self.bert(input_ids_1, attention_mask_1)
        bert_out_2 = self.bert(input_ids_2, attention_mask_2)
        
        pooled_output_1 = bert_out_1['pooler_output']
        pooled_output_2 = bert_out_2['pooler_output']
        
        # Combine the pooled embeddings of both sentences using concatenation
        combined_pooled = torch.cat((pooled_output_1, pooled_output_2), dim=-1)
        dropped = self.drop(combined_pooled)
        
        # Predict similarity
        similarity_logits = self.sts_classifier(dropped)
        
        return similarity_logits


In [50]:
from bert import BertModel
# bert_mod = bert.BertModel(config)
config = BertConfig(5,1,1)
config.option = 'pretrain'
model = MultitaskBERT(config)




In [38]:

def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--sst_train", type=str, default="data/ids-sst-train.csv")
    parser.add_argument("--sst_dev", type=str, default="data/ids-sst-dev.csv")
    parser.add_argument("--sst_test", type=str, default="data/ids-sst-test-student.csv")

    parser.add_argument("--para_train", type=str, default="data/quora-train.csv")
    parser.add_argument("--para_dev", type=str, default="data/quora-dev.csv")
    parser.add_argument("--para_test", type=str, default="data/quora-test-student.csv")

    parser.add_argument("--sts_train", type=str, default="data/sts-train.csv")
    parser.add_argument("--sts_dev", type=str, default="data/sts-dev.csv")
    parser.add_argument("--sts_test", type=str, default="data/sts-test-student.csv")

    parser.add_argument("--seed", type=int, default=11711)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--option", type=str,
                        help='pretrain: the BERT parameters are frozen; finetune: BERT parameters are updated',
                        choices=('pretrain', 'finetune'), default="pretrain")
    parser.add_argument("--use_gpu", action='store_true')

    parser.add_argument("--sst_dev_out", type=str, default="predictions/sst-dev-output.csv")
    parser.add_argument("--sst_test_out", type=str, default="predictions/sst-test-output.csv")

    parser.add_argument("--para_dev_out", type=str, default="predictions/para-dev-output.csv")
    parser.add_argument("--para_test_out", type=str, default="predictions/para-test-output.csv")

    parser.add_argument("--sts_dev_out", type=str, default="predictions/sts-dev-output.csv")
    parser.add_argument("--sts_test_out", type=str, default="predictions/sts-test-output.csv")

    # hyper parameters
    parser.add_argument("--batch_size", help='sst: 64 can fit a 12GB GPU', type=int, default=64)
    parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
    parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
                        default=1e-3)
    parser.add_argument("--local_files_only", action='store_true')
    
    try:
        get_ipython().__class__.__name__
    # No error means we're running on ipython
        args = parser.parse_args(args = []) # Reset args
    except NameError:
    # NameError means that we're running on terminal
        print('Running on terminal')
        args = parser.parse_args()
    return args

In [43]:

# from multitask_classifier import get_args, MultitaskBERT
args = get_args()
# print(args)
args.filepath = f'{args.option}-{args.epochs}-{args.lr}-multitask.pt' # save path
seed_everything(args.seed)  # fix the seed for reproducibility

In [44]:
# load the data set 


 
device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
# Load data
# Create the data and its corresponding datasets and dataloader
sst_train_data, num_labels,para_train_data, sts_train_data = load_multitask_data(args.sst_train,args.para_train,args.sts_train, split ='train')
sst_dev_data, num_labels,para_dev_data, sts_dev_data = load_multitask_data(args.sst_dev,args.para_dev,args.sts_dev, split ='train')

#Sentiment analysis
sst_train_data = SentenceClassificationDataset(sst_train_data, args)
sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)

sst_train_dataloader = DataLoader(sst_train_data, shuffle=True, batch_size=args.batch_size,
                                    collate_fn=sst_train_data.collate_fn)
sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
                                collate_fn=sst_dev_data.collate_fn)

#Paraphrasing
paraphrase_train_data = SentencePairDataset(para_train_data, args, isRegression =False)
paraphrase_dev_data = SentencePairDataset(para_dev_data, args, isRegression =False)

paraphrase_train_dataloader = DataLoader(paraphrase_train_data, shuffle=True, batch_size=args.batch_size,
                                collate_fn=paraphrase_train_data.collate_fn)
paraphrase_dev_dataloader = DataLoader(paraphrase_dev_data, shuffle=True, batch_size=args.batch_size,
                                collate_fn=paraphrase_dev_data.collate_fn)

#sts
sts_train_data = SentencePairDataset(sts_train_data, args, isRegression =True)
sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression =True)

sts_train_dataloader = DataLoader(sts_train_data, shuffle=True, batch_size=256,
                                collate_fn=sts_train_data.collate_fn)
sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=True, batch_size=256,
                            collate_fn=sts_dev_data.collate_fn)



Loaded 8544 train examples from data/ids-sst-train.csv
Loaded 141498 train examples from data/quora-train.csv
Loaded 6040 train examples from data/sts-train.csv
Loaded 1101 train examples from data/ids-sst-dev.csv
Loaded 20212 train examples from data/quora-dev.csv
Loaded 863 train examples from data/sts-dev.csv


In [54]:
# initialize the model parameters 
# config = {'hidden_dropout_prob': args.hidden_dropout_prob,
#             'num_labels': num_labels,
#             'hidden_size': 768,
#             'data_dir': '.',
#             'option': args.option,
#             'local_files_only': args.local_files_only}

# config = SimpleNamespace(**config)
config = BertConfig(5,1,1)
config.option = 'pretrain'

model = MultitaskBERT(config)
model = model.to(device)

lr = args.lr
optimizer = AdamW(model.parameters(), lr=lr)
best_dev_acc = 0


In [61]:
# train the model with the dataset
# Run for the specified number of epochs
for epoch in range(args.epochs ):
    model.train()
    train_loss = 0
    num_batches = 0
    for batch in tqdm(sst_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
        b_ids, b_mask, b_labels = (batch['token_ids'],
                                    batch['attention_mask'], batch['labels'])

        b_ids = b_ids.to(device)
        b_mask = b_mask.to(device)
        b_labels = b_labels.to(device)

        optimizer.zero_grad()
        logits = model.predict_sentiment(b_ids, b_mask)
        loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        num_batches += 1

    train_loss = train_loss / (num_batches)

    train_acc, train_f1, *_ = model_eval_sst(sst_train_dataloader, model, device)
    dev_acc, dev_f1, *_ = model_eval_sst(sst_dev_dataloader, model, device)


    ## Adding multitask evaluation
    #train
    (train_paraphrase_accuracy, train_para_y_pred, train_para_sent_ids,
      train_sentiment_accuracy,train_sst_y_pred, train_sst_sent_ids,
        train_sts_corr, train_sts_y_pred, train_sts_sent_ids) = model_eval_multitask(sst_train_dataloader,
                                                                  paraphrase_train_dataloader,sts_train_dataloader,model, model.device  )
    
    #dev
    (dev_paraphrase_accuracy, dev_para_y_pred, dev_para_sent_ids,
      dev_sentiment_accuracy,dev_sst_y_pred, dev_sst_sent_ids,
        dev_sts_corr, dev_sts_y_pred, dev__sent_ids) = model_eval_multitask(sst_dev_dataloader,
                                                                  paraphrase_dev_dataloader,sts_dev_dataloader,model, model.device  )
    
    #We have to weight or average the three sores to save the best model.
    # In the diven code only sst is used
    if dev_acc > best_dev_acc: 
      best_dev_acc = dev_acc
      save_model(model, optimizer, args, config, args.filepath)

    print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")



KeyboardInterrupt: 

In [None]:
# evaluate the model with evaluation dataset  

In [None]:
# test the model the test set