In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW
import random
import matplotlib.pyplot as plt
from collections import OrderedDict
from scipy.spatial.distance import cosine
from sim_utils import load_examples, Inputexample, CustomTextDataset, freeze_layers, train, test
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, AutoModel, AutoTokenizer

# Point both lowercase proxy environment variables at the same HTTP proxy.
# Presumably needed so that the model/tokenizer downloads later in the
# notebook can reach the network from this host — confirm before reusing
# the notebook elsewhere, since the address is site-specific.
os.environ.update(
    http_proxy='http://192.41.170.23:3128',
    https_proxy='http://192.41.170.23:3128',
)

In [2]:
# Shell escape: print the NVIDIA GPU status table (per-GPU memory usage and
# utilisation) so a free device can be picked before training starts.
!nvidia-smi

Sun May  1 15:38:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:84:00.0 Off |                  N/A |
| 24%   27C    P8    11W / 250W |      3MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:85:00.0 Off |                  N/A |
| 22%   28C    P8     1W / 250W |   2304MiB / 11264MiB |      0%      Default |
|       

In [3]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

N = 10  # not referenced by the visible cells — TODO confirm whether still needed

# Scratch containers. The experiment cell re-binds all of these for every
# (shot, dataset) combination; they are initialised here so the names exist.
data = []
labels = []

train_samples = []
train_labels = []

valid_samples = []
valid_labels = []

test_samples = []
test_labels = []

# Hyper-parameters.
embed_dim = 768   # not referenced by the visible cells
batch_size = 16   # used for both the datasets and the DataLoaders
lr = 2e-3         # optimizer learning rate; you can adjust
temp = 0.3        # only appears in the experiment name in this notebook; you can adjust
lamda = 0.01      # not referenced by the visible cells; you can adjust
skip_time = 0     # number of times y_i is not equal to y_j in the supervised contrastive loss equation

# Experiment grid: every shot setting x dataset x pretrained model is run.
data_names = ['CLINC150', 'BANKING77', 'HWU64']
model_names = ['roberta-base', 'bert-base-uncased']
shot_names = ['train_5', 'train_10']

# Result lines written to the results file; the first entry is the header.
lines = ['test acc']

# Device selection. GPU index 3 is the preferred card, but hard-coding it makes
# the notebook crash on hosts with fewer than four GPUs, so fall back to the
# first GPU when index 3 does not exist, and to the CPU when CUDA is absent.
_PREFERRED_GPU = 3
if torch.cuda.is_available():
    _gpu_index = _PREFERRED_GPU if torch.cuda.device_count() > _PREFERRED_GPU else 0
    device = torch.device(f'cuda:{_gpu_index}')
else:
    device = torch.device('cpu')

In [4]:
# Shell escape: print the NVIDIA GPU status table again (same check as the
# earlier cell), here after the configuration cell has run.
!nvidia-smi

Sun May  1 15:38:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:84:00.0 Off |                  N/A |
| 24%   27C    P8    11W / 250W |      3MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:85:00.0 Off |                  N/A |
| 22%   28C    P8     1W / 250W |   2304MiB / 11264MiB |      0%      Default |
|       

In [5]:
def _build_split(examples):
    """Turn a list of loaded examples into a dataset and a DataLoader.

    Args:
        examples: sequence of objects exposing ``.text`` and ``.label``
            (as returned by ``load_examples``).

    Returns:
        A 4-tuple ``(dataset, loader, texts, split_labels)`` where ``texts``
        and ``split_labels`` are parallel lists extracted from ``examples``.
    """
    texts = [example.text for example in examples]
    split_labels = [example.label for example in examples]
    dataset = CustomTextDataset(split_labels, texts, batch_size=batch_size, repeated_label=False)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataset, loader, texts, split_labels


def _write_results(result_lines, path='result_baseline.txt'):
    """Overwrite ``path`` with one result line per row."""
    with open(path, 'w') as f:
        for line in result_lines:
            f.write(line)
            f.write('\n')


for shot_name in shot_names:
    for data_name in data_names:

        path_shot = f'../../../{data_name}/{shot_name}/'

        valid_path = f'../../../{data_name}/valid/'
        test_path = f'../../../{data_name}/test/'

        # load data
        train_samples = load_examples(path_shot)
        valid_samples = load_examples(valid_path)
        test_samples = load_examples(test_path)

        print("===== small train set ====")
        train_data, train_loader, _, train_split_labels = _build_split(train_samples)

        print("===== validation set ====")
        valid_data, valid_loader, _, valid_split_labels = _build_split(valid_samples)

        print("===== test set ====")
        # `data` / `labels` stay bound to the test split after this point, as
        # they were before, in case later cells rely on them.
        test_data, test_loader, data, labels = _build_split(test_samples)

        # Build the label space from ALL splits rather than from whichever
        # split happened to be processed last (previously: the test split
        # only). When every split shares the same label set this yields
        # exactly the same mapping; otherwise no class is silently dropped.
        unique_label = np.unique(np.array(train_split_labels + valid_split_labels + labels))

        # number of unique classes in the dataset
        num_class = len(unique_label)

        # map text label -> class index (np.unique returns sorted labels)
        label_maps = {unique_label[i]: i for i in range(len(unique_label))}

        print("label_maps :",label_maps)
        print("num_class:",num_class)

        for model_name in model_names:

            exp_name = f'{model_name}_lr={lr}_t={temp}_{data_name}_{shot_name}'
            direct_name = f"{model_name}"

            print("direct_name :",direct_name)
            tokenizer = AutoTokenizer.from_pretrained(direct_name)
            config = AutoConfig.from_pretrained(direct_name)
            config.num_labels = num_class
            simcse = AutoModelForSequenceClassification.from_pretrained(direct_name,config=config)

            # Presumably freezes the 12 encoder layers so that only the
            # classification head is trained (the training log lists only
            # classifier.* parameters) — see sim_utils.freeze_layers.
            simcse = freeze_layers(simcse,freeze_layers_count=12)
            optimizer= AdamW(simcse.parameters(), lr=lr)
            simcse = simcse.to(device)

            train_log, valid_log = train(exp_name,simcse,device,label_maps,optimizer,train_loader,valid_loader,train_data,valid_data,tokenizer,epochs=30)

            # Reload the checkpoint for this experiment into a fresh model.
            # train() is expected to have saved it under this path — verify
            # against sim_utils.train.
            PATH = f'../../../fewshot_models/{exp_name}.pth'
            best_model = AutoModelForSequenceClassification.from_pretrained(direct_name,config=config)
            # map_location keeps loading independent of the device the
            # checkpoint tensors were saved from.
            best_model.load_state_dict(torch.load(PATH, map_location=device))
            best_model = best_model.to(device)

            test_acc = test(best_model,device,label_maps,test_loader,len(test_data),tokenizer)

            test_acc = 100 * test_acc
            res = f'shot:{shot_name}_data_name:{data_name}_model:{model_name}_test_acc:{str(test_acc)}'
            lines.append(res)

            # Persist results after every experiment so a crash late in the
            # grid does not lose the accuracies already computed.
            _write_results(lines)


# Final write: same file content as before, and still produced when the
# experiment grid is empty.
_write_results(lines)

===== small train set ====
Train on Cross Entropy loss
len of dataset : 750
===== validation set ====
Train on Cross Entropy loss
len of dataset : 3000
===== test set ====
Train on Cross Entropy loss
len of dataset : 4500
label_maps : {'accept_reservations': 0, 'account_blocked': 1, 'alarm': 2, 'application_status': 3, 'apr': 4, 'are_you_a_bot': 5, 'balance': 6, 'bill_balance': 7, 'bill_due': 8, 'book_flight': 9, 'book_hotel': 10, 'calculator': 11, 'calendar': 12, 'calendar_update': 13, 'calories': 14, 'cancel': 15, 'cancel_reservation': 16, 'car_rental': 17, 'card_declined': 18, 'carry_on': 19, 'change_accent': 20, 'change_ai_name': 21, 'change_language': 22, 'change_speed': 23, 'change_user_name': 24, 'change_volume': 25, 'confirm_reservation': 26, 'cook_time': 27, 'credit_limit': 28, 'credit_limit_change': 29, 'credit_score': 30, 'current_location': 31, 'damaged_card': 32, 'date': 33, 'definition': 34, 'direct_deposit': 35, 'directions': 36, 'distance': 37, 'do_you_have_pets': 38, '

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias
 Training Loss: 0.3287872486114502, 		 Training acc: 0.006666666828095913
train correct :  tensor(5, device='cuda:3')
train total : 750
 Validation Loss: 0.32139922221501666, 		 Validation acc: 0.006666666828095913
valid correct :  tensor(20, device='cuda:3')
valid total : 3000
Validation Loss Decreased(inf--->0.321399) 	 Saving The Model
 Training Loss: 0.3218516165415446, 		 Training acc: 0.009333333000540733
train correct :  tensor(7, device='cuda:3')
train total : 750
 Validation Loss: 0.31745521195729576, 		 Validation acc: 0.006666666828095913
valid correct :  tensor(20, device='cuda:3')
valid total : 3000
Validation Loss Decreased(0.321399--->0.317455) 	 Saving The Model
 Training Loss: 0.316168202718099, 		 Training acc: 0.01599999889731407
train correct :  tensor(12, device='cuda:3')
train total : 750
 Validation Loss: 0.30723544692993165, 		 Validation acc: 0.014333332888782024
v

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

correct : 2577
total : 4500
direct_name : bert-base-uncased


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

classifier.weight
classifier.bias
 Training Loss: 0.3766019325256348, 		 Training acc: 0.0026666666381061077
train correct :  tensor(2, device='cuda:3')
train total : 750
 Validation Loss: 0.3321163462003072, 		 Validation acc: 0.009999999776482582
valid correct :  tensor(30, device='cuda:3')
valid total : 3000
Validation Loss Decreased(inf--->0.332116) 	 Saving The Model
 Training Loss: 0.3363286164601644, 		 Training acc: 0.003999999724328518
train correct :  tensor(3, device='cuda:3')
train total : 750
 Validation Loss: 0.32896368932724, 		 Validation acc: 0.01966666616499424
valid correct :  tensor(59, device='cuda:3')
valid total : 3000
Validation Loss Decreased(0.332116--->0.328964) 	 Saving The Model
 Training Loss: 0.3266065273284912, 		 Training acc: 0.006666666828095913
train correct :  tensor(5, device='cuda:3')
train total : 750
 Validation Loss: 0.3188712166150411, 		 Validation acc: 0.01899999938905239
valid correct :  tensor(57, device='cuda:3')
valid total : 3000
Valida

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

correct : 736
total : 4500
===== small train set ====
Train on Cross Entropy loss
len of dataset : 385
===== validation set ====
Train on Cross Entropy loss
len of dataset : 1540
===== test set ====
Train on Cross Entropy loss
len of dataset : 3080
label_maps : {'Refund_not_showing_up': 0, 'activate_my_card': 1, 'age_limit': 2, 'apple_pay_or_google_pay': 3, 'atm_support': 4, 'automatic_top_up': 5, 'balance_not_updated_after_bank_transfer': 6, 'balance_not_updated_after_cheque_or_cash_deposit': 7, 'beneficiary_not_allowed': 8, 'cancel_transfer': 9, 'card_about_to_expire': 10, 'card_acceptance': 11, 'card_arrival': 12, 'card_delivery_estimate': 13, 'card_linking': 14, 'card_not_working': 15, 'card_payment_fee_charged': 16, 'card_payment_not_recognised': 17, 'card_payment_wrong_exchange_rate': 18, 'card_swallowed': 19, 'cash_withdrawal_charge': 20, 'cash_withdrawal_not_recognised': 21, 'change_pin': 22, 'compromised_card': 23, 'contactless_not_working': 24, 'country_support': 25, 'decline

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias
 Training Loss: 0.30141581374329407, 		 Training acc: 0.002597402548417449
train correct :  tensor(1, device='cuda:3')
train total : 385
 Validation Loss: 0.27811273197074987, 		 Validation acc: 0.013636363670229912
valid correct :  tensor(21, device='cuda:3')
valid total : 1540
Validation Loss Decreased(inf--->0.278113) 	 Saving The Model
 Training Loss: 0.2903635359429694, 		 Training acc: 0.0181818176060915
train correct :  tensor(7, device='cuda:3')
train total : 385
 Validation Loss: 0.2725876628578483, 		 Validation acc: 0.02532467432320118
valid correct :  tensor(39, device='cuda:3')
valid total : 1540
Validation Loss Decreased(0.278113--->0.272588) 	 Saving The Model
 Training Loss: 0.2877753431146795, 		 Training acc: 0.0181818176060915
train correct :  tensor(7, device='cuda:3')
train total : 385
 Validation Loss: 0.26908428065188517, 		 Validation acc: 0.02922077849507332
valid 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

correct : 1104
total : 3080
direct_name : bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

classifier.weight
classifier.bias
 Training Loss: 0.33958137685602363, 		 Training acc: 0.010389610193669796
train correct :  tensor(4, device='cuda:3')
train total : 385
 Validation Loss: 0.29072538103376117, 		 Validation acc: 0.016883116215467453
valid correct :  tensor(26, device='cuda:3')
valid total : 1540
Validation Loss Decreased(inf--->0.290725) 	 Saving The Model
 Training Loss: 0.2982158951945119, 		 Training acc: 0.015584414824843407
train correct :  tensor(6, device='cuda:3')
train total : 385
 Validation Loss: 0.27844812188829693, 		 Validation acc: 0.016233766451478004
valid correct :  tensor(25, device='cuda:3')
valid total : 1540
Validation Loss Decreased(0.290725--->0.278448) 	 Saving The Model
 Training Loss: 0.288932753847791, 		 Training acc: 0.023376623168587685
train correct :  tensor(9, device='cuda:3')
train total : 385
 Validation Loss: 0.27444963981578874, 		 Validation acc: 0.027272727340459824
valid correct :  tensor(42, device='cuda:3')
valid total : 1540


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

correct : 333
total : 3080
===== small train set ====
Train on Cross Entropy loss
len of dataset : 320
===== validation set ====
Train on Cross Entropy loss
len of dataset : 1076
===== test set ====
Train on Cross Entropy loss
len of dataset : 1076
label_maps : {'alarm_query': 0, 'alarm_remove': 1, 'alarm_set': 2, 'audio_volume_down': 3, 'audio_volume_mute': 4, 'audio_volume_up': 5, 'calendar_query': 6, 'calendar_remove': 7, 'calendar_set': 8, 'cooking_recipe': 9, 'datetime_convert': 10, 'datetime_query': 11, 'email_addcontact': 12, 'email_query': 13, 'email_querycontact': 14, 'email_sendemail': 15, 'general_affirm': 16, 'general_commandstop': 17, 'general_confirm': 18, 'general_dontcare': 19, 'general_explain': 20, 'general_joke': 21, 'general_negate': 22, 'general_praise': 23, 'general_quirky': 24, 'general_repeat': 25, 'iot_cleaning': 26, 'iot_coffee': 27, 'iot_hue_lightchange': 28, 'iot_hue_lightdim': 29, 'iot_hue_lightoff': 30, 'iot_hue_lighton': 31, 'iot_hue_lightup': 32, 'iot_we

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias
 Training Loss: 0.27455212324857714, 		 Training acc: 0.012500000186264515
train correct :  tensor(4, device='cuda:3')
train total : 320
 Validation Loss: 0.26227923485426213, 		 Validation acc: 0.03345724940299988
valid correct :  tensor(36, device='cuda:3')
valid total : 1076
Validation Loss Decreased(inf--->0.262279) 	 Saving The Model
 Training Loss: 0.2644203782081604, 		 Training acc: 0.01875000074505806
train correct :  tensor(6, device='cuda:3')
train total : 320
 Validation Loss: 0.25081397897691976, 		 Validation acc: 0.05297397822141647
valid correct :  tensor(57, device='cuda:3')
valid total : 1076
Validation Loss Decreased(0.262279--->0.250814) 	 Saving The Model
 Training Loss: 0.25099684968590735, 		 Training acc: 0.04374999925494194
train correct :  tensor(14, device='cuda:3')
train total : 320
 Validation Loss: 0.24073231862823316, 		 Validation acc: 0.056691449135541916
v

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

correct : 605
total : 1076
direct_name : bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

classifier.weight
classifier.bias
 Training Loss: 0.30476497262716296, 		 Training acc: 0.0062500000931322575
train correct :  tensor(2, device='cuda:3')
train total : 320
 Validation Loss: 0.26737168065677347, 		 Validation acc: 0.02044609561562538
valid correct :  tensor(22, device='cuda:3')
valid total : 1076
Validation Loss Decreased(inf--->0.267372) 	 Saving The Model
 Training Loss: 0.27183626890182494, 		 Training acc: 0.02812500111758709
train correct :  tensor(9, device='cuda:3')
train total : 320
 Validation Loss: 0.2602196065466643, 		 Validation acc: 0.06133829057216644
valid correct :  tensor(66, device='cuda:3')
valid total : 1076
Validation Loss Decreased(0.267372--->0.260220) 	 Saving The Model
 Training Loss: 0.2605926342308521, 		 Training acc: 0.02187499962747097
train correct :  tensor(7, device='cuda:3')
train total : 320
 Validation Loss: 0.2518755842318765, 		 Validation acc: 0.0706319659948349
valid correct :  tensor(76, device='cuda:3')
valid total : 1076
Valid

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

correct : 272
total : 1076
===== small train set ====
Train on Cross Entropy loss
len of dataset : 1500
===== validation set ====
Train on Cross Entropy loss
len of dataset : 3000
===== test set ====
Train on Cross Entropy loss
len of dataset : 4500
label_maps : {'accept_reservations': 0, 'account_blocked': 1, 'alarm': 2, 'application_status': 3, 'apr': 4, 'are_you_a_bot': 5, 'balance': 6, 'bill_balance': 7, 'bill_due': 8, 'book_flight': 9, 'book_hotel': 10, 'calculator': 11, 'calendar': 12, 'calendar_update': 13, 'calories': 14, 'cancel': 15, 'cancel_reservation': 16, 'car_rental': 17, 'card_declined': 18, 'carry_on': 19, 'change_accent': 20, 'change_ai_name': 21, 'change_language': 22, 'change_speed': 23, 'change_user_name': 24, 'change_volume': 25, 'confirm_reservation': 26, 'cook_time': 27, 'credit_limit': 28, 'credit_limit_change': 29, 'credit_score': 30, 'current_location': 31, 'damaged_card': 32, 'date': 33, 'definition': 34, 'direct_deposit': 35, 'directions': 36, 'distance': 3

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias
 Training Loss: 0.32674006843566894, 		 Training acc: 0.006666666828095913
train correct :  tensor(10, device='cuda:3')
train total : 1500
 Validation Loss: 0.312853280544281, 		 Validation acc: 0.02199999988079071
valid correct :  tensor(66, device='cuda:3')
valid total : 3000
Validation Loss Decreased(inf--->0.312853) 	 Saving The Model
 Training Loss: 0.30889245732625326, 		 Training acc: 0.01599999889731407
train correct :  tensor(24, device='cuda:3')
train total : 1500
 Validation Loss: 0.2944926199913025, 		 Validation acc: 0.04100000113248825
valid correct :  tensor(123, device='cuda:3')
valid total : 3000
Validation Loss Decreased(0.312853--->0.294493) 	 Saving The Model
 Training Loss: 0.28536674547195434, 		 Training acc: 0.058666665107011795
train correct :  tensor(88, device='cuda:3')
train total : 1500
 Validation Loss: 0.26656060155232747, 		 Validation acc: 0.096999995410442

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

correct : 3284
total : 4500
direct_name : bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

classifier.weight
classifier.bias
 Training Loss: 0.35309058062235515, 		 Training acc: 0.009999999776482582
train correct :  tensor(15, device='cuda:3')
train total : 1500
 Validation Loss: 0.32853799978892007, 		 Validation acc: 0.008999999612569809
valid correct :  tensor(27, device='cuda:3')
valid total : 3000
Validation Loss Decreased(inf--->0.328538) 	 Saving The Model
 Training Loss: 0.32704574267069497, 		 Training acc: 0.0139999995008111
train correct :  tensor(21, device='cuda:3')
train total : 1500
 Validation Loss: 0.3260789279937744, 		 Validation acc: 0.026000000536441803
valid correct :  tensor(78, device='cuda:3')
valid total : 3000
Validation Loss Decreased(0.328538--->0.326079) 	 Saving The Model
 Training Loss: 0.3160190992355347, 		 Training acc: 0.03266666457056999
train correct :  tensor(49, device='cuda:3')
train total : 1500
 Validation Loss: 0.30744319868087766, 		 Validation acc: 0.035999998450279236
valid correct :  tensor(108, device='cuda:3')
valid total : 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

correct : 1340
total : 4500
===== small train set ====
Train on Cross Entropy loss
len of dataset : 770
===== validation set ====
Train on Cross Entropy loss
len of dataset : 1540
===== test set ====
Train on Cross Entropy loss
len of dataset : 3080
label_maps : {'Refund_not_showing_up': 0, 'activate_my_card': 1, 'age_limit': 2, 'apple_pay_or_google_pay': 3, 'atm_support': 4, 'automatic_top_up': 5, 'balance_not_updated_after_bank_transfer': 6, 'balance_not_updated_after_cheque_or_cash_deposit': 7, 'beneficiary_not_allowed': 8, 'cancel_transfer': 9, 'card_about_to_expire': 10, 'card_acceptance': 11, 'card_arrival': 12, 'card_delivery_estimate': 13, 'card_linking': 14, 'card_not_working': 15, 'card_payment_fee_charged': 16, 'card_payment_not_recognised': 17, 'card_payment_wrong_exchange_rate': 18, 'card_swallowed': 19, 'cash_withdrawal_charge': 20, 'cash_withdrawal_not_recognised': 21, 'change_pin': 22, 'compromised_card': 23, 'contactless_not_working': 24, 'country_support': 25, 'declin

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias
 Training Loss: 0.2891888630854619, 		 Training acc: 0.00909090880304575
train correct :  tensor(7, device='cuda:3')
train total : 770
 Validation Loss: 0.2741974697484599, 		 Validation acc: 0.023376623168587685
valid correct :  tensor(36, device='cuda:3')
valid total : 1540
Validation Loss Decreased(inf--->0.274197) 	 Saving The Model
 Training Loss: 0.2812679916233211, 		 Training acc: 0.019480518996715546
train correct :  tensor(15, device='cuda:3')
train total : 770
 Validation Loss: 0.26704729702565577, 		 Validation acc: 0.027922077104449272
valid correct :  tensor(43, device='cuda:3')
valid total : 1540
Validation Loss Decreased(0.274197--->0.267047) 	 Saving The Model
 Training Loss: 0.2733375010552344, 		 Training acc: 0.04415584355592728
train correct :  tensor(34, device='cuda:3')
train total : 770
 Validation Loss: 0.2615738633391145, 		 Validation acc: 0.03246753290295601
val

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

correct : 1652
total : 3080
direct_name : bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

classifier.weight
classifier.bias
 Training Loss: 0.3156304390399487, 		 Training acc: 0.014285714365541935
train correct :  tensor(11, device='cuda:3')
train total : 770
 Validation Loss: 0.28351000315183167, 		 Validation acc: 0.022727271541953087
valid correct :  tensor(35, device='cuda:3')
valid total : 1540
Validation Loss Decreased(inf--->0.283510) 	 Saving The Model
 Training Loss: 0.28713512358727394, 		 Training acc: 0.019480518996715546
train correct :  tensor(15, device='cuda:3')
train total : 770
 Validation Loss: 0.28332246867093175, 		 Validation acc: 0.022727271541953087
valid correct :  tensor(35, device='cuda:3')
valid total : 1540
Validation Loss Decreased(0.283510--->0.283322) 	 Saving The Model
 Training Loss: 0.2865429085570496, 		 Training acc: 0.025974025949835777
train correct :  tensor(20, device='cuda:3')
train total : 770
 Validation Loss: 0.28457866287850714, 		 Validation acc: 0.026623375713825226
valid correct :  tensor(41, device='cuda:3')
valid total : 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

correct : 488
total : 3080
===== small train set ====
Train on Cross Entropy loss
len of dataset : 640
===== validation set ====
Train on Cross Entropy loss
len of dataset : 1076
===== test set ====
Train on Cross Entropy loss
len of dataset : 1076
label_maps : {'alarm_query': 0, 'alarm_remove': 1, 'alarm_set': 2, 'audio_volume_down': 3, 'audio_volume_mute': 4, 'audio_volume_up': 5, 'calendar_query': 6, 'calendar_remove': 7, 'calendar_set': 8, 'cooking_recipe': 9, 'datetime_convert': 10, 'datetime_query': 11, 'email_addcontact': 12, 'email_query': 13, 'email_querycontact': 14, 'email_sendemail': 15, 'general_affirm': 16, 'general_commandstop': 17, 'general_confirm': 18, 'general_dontcare': 19, 'general_explain': 20, 'general_joke': 21, 'general_negate': 22, 'general_praise': 23, 'general_quirky': 24, 'general_repeat': 25, 'iot_cleaning': 26, 'iot_coffee': 27, 'iot_hue_lightchange': 28, 'iot_hue_lightdim': 29, 'iot_hue_lightoff': 30, 'iot_hue_lighton': 31, 'iot_hue_lightup': 32, 'iot_we

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias
 Training Loss: 0.26967798806726934, 		 Training acc: 0.02031250111758709
train correct :  tensor(13, device='cuda:3')
train total : 640
 Validation Loss: 0.25265850564361064, 		 Validation acc: 0.03531598299741745
valid correct :  tensor(38, device='cuda:3')
valid total : 1076
Validation Loss Decreased(inf--->0.252659) 	 Saving The Model
 Training Loss: 0.25276908800005915, 		 Training acc: 0.046875
train correct :  tensor(30, device='cuda:3')
train total : 640
 Validation Loss: 0.23820567707146853, 		 Validation acc: 0.06133829057216644
valid correct :  tensor(66, device='cuda:3')
valid total : 1076
Validation Loss Decreased(0.252659--->0.238206) 	 Saving The Model
 Training Loss: 0.23905664421617984, 		 Training acc: 0.07656250149011612
train correct :  tensor(49, device='cuda:3')
train total : 640
 Validation Loss: 0.22927315040148766, 		 Validation acc: 0.06133829057216644
valid corre

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

correct : 685
total : 1076
direct_name : bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

classifier.weight
classifier.bias
 Training Loss: 0.2955379605293274, 		 Training acc: 0.01718750037252903
train correct :  tensor(11, device='cuda:3')
train total : 640
 Validation Loss: 0.2700623643885758, 		 Validation acc: 0.026022303849458694
valid correct :  tensor(28, device='cuda:3')
valid total : 1076
Validation Loss Decreased(inf--->0.270062) 	 Saving The Model
 Training Loss: 0.2707455702126026, 		 Training acc: 0.02812500111758709
train correct :  tensor(18, device='cuda:3')
train total : 640
 Validation Loss: 0.26352239740825495, 		 Validation acc: 0.0511152409017086
valid correct :  tensor(55, device='cuda:3')
valid total : 1076
Validation Loss Decreased(0.270062--->0.263522) 	 Saving The Model
 Training Loss: 0.26060586869716645, 		 Training acc: 0.05312500149011612
train correct :  tensor(34, device='cuda:3')
train total : 640
 Validation Loss: 0.25279310711254416, 		 Validation acc: 0.06133829057216644
valid correct :  tensor(66, device='cuda:3')
valid total : 1076
Val

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

correct : 296
total : 1076
