In [1]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [2]:
import sys
# Make the project checkout importable (the `src.*` and `config.*` imports below rely on it).
module_path = '/home/x1112436/git/sent-semantic-repo'
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import os 
import numpy as np

In [4]:
from src.utils import set_seed
from src.trainer import SimcseTrainer
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [5]:
import logging

In [6]:
from skt.vault_utils import get_secrets
proxies = get_secrets('proxies')

In [7]:
# Export the proxy settings fetched from the vault so that HTTP(S) clients in this kernel pick them up.
for scheme in ('http', 'https'):
    os.environ[f'{scheme}_proxy'] = proxies[scheme]

In [8]:
# Name of the compute backend: 'cuda' when a GPU is visible to torch, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load args for the experiment settings

In [10]:
args = nli_parser_model_args()

In [54]:
args.model_name = 'klue/roberta-large'

In [55]:
# All artefacts of this FAQ experiment live under one root, namespaced by model name.
# NOTE: a model name containing '/' (e.g. 'klue/roberta-large') yields a nested directory.
FAQ_RESULT_ROOT = '/home/x1112436/result/faq'
args.output_dir = f'{FAQ_RESULT_ROOT}/modelfile/{args.model_name}'
args.log_dir = f'{FAQ_RESULT_ROOT}/log/{args.model_name}'
args.experiments_path = f'{FAQ_RESULT_ROOT}/experiment/{args.model_name}/experiment.csv'

In [56]:
# Hardware: first CUDA device when available, otherwise CPU, plus the visible GPU count.
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()

# Data handling.
args.is_preprocessed = True
args.data_type = 'triple'
args.model_max_len = 50  # passed as max_length when the datasets are built

# Objectives: one loss name for training, another for validation.
args.loss = 'MultipleNegativesRankingLoss'
args.margin = 0.5
args.val_loss_nm = 'TripletLoss'

# Schedule.
args.num_train_epochs = 20
# This cell used to assign valid_first = False and then override it with True;
# only the effective value is kept.
args.valid_first = True
args.patience_limit = 5

In [57]:
print(args.log_dir)

/home/x1112436/result/faq/log/klue/roberta-large


In [13]:
args.pretrained_model = '/home/x1112436/model_file/sent_roberta'

In [14]:
# Build the model registered as 'sent_roberta' from the local checkpoint,
# forwarding every experiment setting as a keyword argument.
model_cls = MODEL_MAPPING_DICT['sent_roberta']
model = model_cls.from_pretrained(args.pretrained_model, **vars(args))

# The tokenizer is loaded from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# add token to vocab if needed

In [16]:
# added_tokens = [
#  'NUGU',
#  'wavve',
#  'All케어플러스3',
#  '컬러링',
#  'T플랜',
#  'baro',
#  '5G',
#  'SMS',
#  'T파이',
#  '티파이'
# ]

In [17]:
# add tokens to tokenizer and resize embeddings

In [18]:
#new_tokens = set(added_tokens) - set(tokenizer.vocab.keys())

In [19]:
#tokenizer.add_tokens(list(new_tokens), special_tokens=False)

In [20]:
#model.resize_token_embeddings(len(tokenizer))

In [21]:
# train_dataset = []
# with open('../data/train.csv', 'r') as f:
#     for data in f:
#         line = data.strip().split('\t')
#         train_dataset.append(line)

# Preprocess

In [22]:
preprocessor = PreprocessorFactory(data_type=args.data_type)

In [23]:
# Tokenize the headerless training file; save_path=None is passed through unchanged.
faq_data = preprocessor.preprocess(
    data_path='../data/train.csv',
    tokenizer=tokenizer,
    save_path=None,
    header=False,
)

In [24]:
faq_data[0]

NLIInput(sentence_a='티 링 서비스', sentence_b='T Ring', sentence_c='T All케어플러스3', a_input_ids=[0, 1819, 1035, 3838, 2], a_attention_mask=[1, 1, 1, 1, 1], b_input_ids=[0, 56, 54, 4586, 2], b_attention_mask=[1, 1, 1, 1, 1], c_input_ids=[0, 56, 17924, 18713, 11127, 2195, 2], c_attention_mask=[1, 1, 1, 1, 1, 1, 1], label=1180)

In [25]:
#num_classes = len(preprocessor.__idx2positive__)

In [26]:
# Look up the dataset class registered as 'Unsup_simcse' and build the training set
# from the preprocessed features in one step.
train_dataset = DATASET_MAPPING_DICT['Unsup_simcse'](
    args=args,
    features=faq_data,
    max_length=args.model_max_len,
    tokenizer=tokenizer,
)

In [27]:
from src.dataset import RandomClassSampler, BalancedClassSampler, RandomClassBatchSampler

In [28]:
len(train_dataset)

357388

# validation set

In [29]:
# Read the raw validation rows: one list of tab-separated fields per line of the file.
# The encoding is pinned so the Korean text decodes the same way regardless of the
# kernel's locale (assumes the file is UTF-8 — TODO confirm).
# NOTE: `val_dataset` is rebound to the dataset object in a later cell; this list is
# only a raw preview of the file.
with open('../data/val.csv', 'r', encoding='utf-8') as f:
    val_dataset = [row.strip().split('\t') for row in f]

In [30]:
#val_dataset[1:50]

In [31]:
preprocessor.reset_class_var()

In [32]:
# Tokenize the headerless validation file with the same preprocessor and tokenizer as the training data.
faq_val_data = preprocessor.preprocess(
    data_path='../data/val.csv',
    tokenizer=tokenizer,
    save_path=None,
    header=False,
)

In [33]:
faq_val_data[0]

NLIInput(sentence_a='티 링 부가서비스', sentence_b='T Ring', sentence_c='T All케어플러스3', a_input_ids=[0, 1819, 1035, 8073, 28930, 2], a_attention_mask=[1, 1, 1, 1, 1, 1], b_input_ids=[0, 56, 54, 4586, 2], b_attention_mask=[1, 1, 1, 1, 1], c_input_ids=[0, 56, 17924, 18713, 11127, 2195, 2], c_attention_mask=[1, 1, 1, 1, 1, 1, 1], label=1157)

In [34]:
# Build the validation set with the same dataset class and length limit as the training set.
val_dataset = DATASET_MAPPING_DICT['Unsup_simcse'](
    args=args,
    features=faq_val_data,
    max_length=args.model_max_len,
    tokenizer=tokenizer,
)

In [35]:
val_dataset[0]

{'a_input_ids': tensor([    0,  1819,  1035,  8073, 28930,     2]),
 'a_attention_mask': tensor([1, 1, 1, 1, 1, 1]),
 'b_input_ids': tensor([   0,   56,   54, 4586,    2]),
 'b_attention_mask': tensor([1, 1, 1, 1, 1]),
 'c_input_ids': tensor([    0,    56, 17924, 18713, 11127,  2195,     2]),
 'c_attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
 'label': 1157}

In [36]:
#test_val_loader =val_dataset.loader(shuffle=False, batch_size=10)

In [37]:
#next(iter(test_val_loader))

# Dataloader test

In [38]:
# Both samplers share one batch size: the configured training batch size plus 64.
batch_size = args.train_batch_size + 64

# NOTE(review): preprocessor.reset_class_var() and the validation preprocessing run in
# earlier cells; confirm that __positive2idx__ still describes the training labels here.
positive_to_idx = preprocessor.__positive2idx__
balanced_sampler = BalancedClassSampler(train_dataset, positive_to_idx, batch_size)
batch_sampler = RandomClassBatchSampler(
    balanced_sampler,
    batch_size=batch_size,
    drop_last=False,
)

In [39]:
dataloader = train_dataset.loader(batch_sampler = batch_sampler)

In [40]:
len(dataloader)

1862

In [41]:
#next(iter(dataloader))

# Trainer

In [51]:
trainer = SimcseTrainer(args=args, logger=logging)

In [52]:
# Hand the model, tokenizer, training data and batch sampler to the trainer.
trainer.model_setting(
    model_type=args.model_type,
    train_dataset=train_dataset,
    model=model,
    tokenizer=tokenizer,
    batch_sampler=batch_sampler,
)

loss: MultipleNegativesRankingLoss


In [53]:
# Launch training; the validation set is passed along and evaluated after each epoch.
trainer.train(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model_type=args.model_type,
)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 2.035730988163276, epoch: 0


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

best valid metic: 0.22212980202627633


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 1.0233370154736026, epoch: 1


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.09968127292703907


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.5208712638904585, epoch: 2


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.07140459324988835


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.3640772386045264, epoch: 3


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.054419755777519827


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.2664978393872312, epoch: 4


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.04308287764653423


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.20441791652433025, epoch: 5


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.032110536730198085


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.1585703858063125, epoch: 6


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.02514042953503722


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.1286646660272307, epoch: 7


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.0204000240718981


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.1089316283736453, epoch: 8


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.017065119428806147


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.09631766520390575, epoch: 9


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.015457492826928227


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.08995313939632185, epoch: 10


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.013889469409510845


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.08124925036258346, epoch: 11


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.012036744966435264


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.07356875214200692, epoch: 12


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.010368171540629294


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.06860627242082717, epoch: 13


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.009197884975638206


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.06334943382807026, epoch: 14


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.008187288490073528


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.06022611199259358, epoch: 15


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.007722469537084008


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.059191632841607465, epoch: 16


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.007319211069151829


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.05760027043221381, epoch: 17


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.007025530764158763


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.05594709873324473, epoch: 18


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.006739934988458254


  0%|          | 0/1862 [00:00<?, ?it/s]

best_train_loss: 0.05473255221025656, epoch: 19


Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]

  self.hparams = self.hparams.append(self.writer, ignore_index=True)


best valid metic: 0.0064877562446163895


# Test whether validation works before training

In [40]:
# Run one validation pass on the validation dataset (labelled as epoch 0) and keep both
# returned values, as a check that validation works.
val_result, val_losses = trainer.validate_faq(val_dataset, epoch=0)

Evaluating:   0%|          | 0/106 [00:00<?, ?it/s]