In [None]:
"""Inspired by https://www.kaggle.com/code/giovanni11/finetuning-bertweet-classification-score-85"""

In [24]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-53c1b306-6ea9-75ca-22c4-aba8d7fc00c7)


In [2]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
#import dependencies
import csv
import os
import torch
from transformers import pipeline
import gc
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.special import softmax
from simpletransformers.classification import (ClassificationModel, ClassificationArgs)
import sklearn
from sklearn.model_selection import train_test_split

In [4]:
training_df = train[["text", "target"]]
training_df.columns = ["text", "labels"]
shuffled_training = training_df.sample(frac=1).reset_index(drop=True)

In [5]:
train_df, test_df = train_test_split(training_df, test_size=0.15, random_state=42, stratify=training_df["labels"])
eval_df, test_df = train_test_split(test_df, test_size=0.50, random_state=42, stratify=test_df["labels"])

In [6]:
testing_df = train[["text", "target"]]
testing_df.columns = ["text", "labels"]
shuffled_testing = testing_df.sample(frac=1).reset_index(drop=True)

In [7]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [6]:
#basic model
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 60
model_args.eval_batch_size = 60

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= False)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

'\nlearning_rate\tfloat\t4e-5\noptimizer\tstr\t“AdamW”\tShould be one of (AdamW, Adafactor)\ntrain_batch_size\tint\t8\tThe training batch size.\nuse_early_stopping\tbool\tFalse\tUse early stopping to stop training when early_stopping_metric doesn’t improve (based on early_stopping_patience, and early_stopping_delta)\nweight_decay\tint\t0\tAdds L2 penalty.\n'

In [7]:
model.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

(254, 0.39347208831019287)

In [8]:
result, model_outputs, wrong_predictions = model.eval_model(shuffled_training, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/127 [00:00<?, ?it/s]

In [9]:
result

{'mcc': 0.799600195931172,
 'tp': 2747,
 'tn': 4117,
 'fp': 225,
 'fn': 524,
 'auroc': 0.9421985579906669,
 'auprc': 0.9450930168674038,
 'acc': 0.9016156574280836,
 'f1': 0.8800256287041487,
 'eval_loss': 0.2747164681319177}

In [11]:
test_result, test_model_outputs, test_wrong_predictions = model.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/127 [00:00<?, ?it/s]

In [12]:
test_result

{'mcc': 0.799600195931172,
 'tp': 2747,
 'tn': 4117,
 'fp': 225,
 'fn': 524,
 'auroc': 0.9421985579906669,
 'auprc': 0.9450930168674038,
 'acc': 0.9016156574280836,
 'f1': 0.8800256287041487,
 'eval_loss': 0.2747073825888746}

In [8]:
# Create a ClassificationModel 2
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True 
model_args.reprocess_input_data = True
model_args.train_batch_size = 60
model_args.eval_batch_size = 60

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1
model_args.learning_rate =	4e-4

model_2 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= False)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [9]:
model_2.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)


  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

(254, 0.6887619557577794)

In [12]:
test_result_2, test_model_outputs_2, test_wrong_predictions_2 = model_2.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_2

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/127 [00:00<?, ?it/s]

{'mcc': 0.0,
 'tp': 0,
 'tn': 4342,
 'fp': 0,
 'fn': 3271,
 'auroc': 0.4296687062344985,
 'auprc': 0.3782241491706396,
 'acc': 0.5703402075397347,
 'f1': 0.0,
 'eval_loss': 0.683843997519786}

In [35]:
See: "https://github.com/huggingface/transformers/issues/7789"

# Create a ClassificationModel 3
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True #this enables the built-in Bertweet custom tokenizer

model_args.reprocess_input_data = True
#odel_args.evaluate_during_training = True
#model_args.evaluate_during_training_verbose = True
model_args.train_batch_size = 60
model_args.eval_batch_size = 60
model_args.optimizer = 'Adafactor'

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

#model_args.learning_rate =	4e-4

model_args.adafactor_relative_step = False
model_args.adafactor_warmup_init = False


model_3 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [36]:
model_3.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

(254, 0.6145476859620237)

In [37]:
test_result_3, test_model_outputs_3, test_wrong_predictions_3 = model_3.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_3

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/127 [00:00<?, ?it/s]

{'mcc': 0.6195446497387139,
 'tp': 2332,
 'tn': 3868,
 'fp': 474,
 'fn': 939,
 'auroc': 0.8701838849873567,
 'auprc': 0.863376591658471,
 'acc': 0.8143964271640615,
 'f1': 0.7674839558992923,
 'eval_loss': 0.534264358244543}

In [39]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True

model_args.reprocess_input_data = True
model_args.train_batch_size = 8
model_args.eval_batch_size = 8

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_4 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [40]:
model_4.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/952 [00:00<?, ?it/s]

(1904, 0.41121018723929065)

In [41]:
test_result_4, test_model_outputs_4, test_wrong_predictions_4 = model_4.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_4
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/952 [00:00<?, ?it/s]

{'mcc': 0.8169042353947938,
 'tp': 2760,
 'tn': 4166,
 'fp': 176,
 'fn': 511,
 'auroc': 0.9547375629476179,
 'auprc': 0.9541853330841379,
 'acc': 0.9097596216997241,
 'f1': 0.8893185113581441,
 'eval_loss': 0.2677887557935314}

In [42]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True

model_args.reprocess_input_data = True
model_args.train_batch_size = 4
model_args.eval_batch_size = 4

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_5 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [43]:
model_5.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/1904 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1904 [00:00<?, ?it/s]

(3808, 0.49949040457981975)

In [44]:
test_result_5, test_model_outputs_5, test_wrong_predictions_5 = model_5.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_5
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1904 [00:00<?, ?it/s]

{'mcc': 0.8061373841500632,
 'tp': 2746,
 'tn': 4141,
 'fp': 201,
 'fn': 525,
 'auroc': 0.9405060959613121,
 'auprc': 0.9430092325307962,
 'acc': 0.9046368054643373,
 'f1': 0.8832422000643293,
 'eval_loss': 0.3318588137626648}

In [54]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True

model_args.reprocess_input_data = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_6 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [55]:
model_6.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/635 [00:00<?, ?it/s]

(1270, 0.4029964753081949)

In [56]:
test_result_6, test_model_outputs_6, test_wrong_predictions_6 = model_6.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_6
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.8172174716279571,
 'tp': 2759,
 'tn': 4168,
 'fp': 174,
 'fn': 512,
 'auroc': 0.9509372595964622,
 'auprc': 0.9517334953510835,
 'acc': 0.90989097596217,
 'f1': 0.8894261766602192,
 'eval_loss': 0.26344379886515495}

In [None]:
#learning rate

In [None]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [7]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 60
model_args.eval_batch_size = 60
model_args.learning_rate = 1e-5

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_7 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [8]:
model_7.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

(254, 0.4464520308680422)

In [11]:
test_result_7, test_model_outputs_7, test_wrong_predictions_7 = model_7.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_7


  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/127 [00:00<?, ?it/s]

{'mcc': 0.7213057112613389,
 'tp': 2638,
 'tn': 3939,
 'fp': 403,
 'fn': 633,
 'auroc': 0.9104546239928486,
 'auprc': 0.912259420828821,
 'acc': 0.8639169841061343,
 'f1': 0.8358681875792141,
 'eval_loss': 0.35632072999252107}

In [60]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [None]:
#8 is more epochs

In [61]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=3, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True 
model_args.reprocess_input_data = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_8 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [62]:
model_8.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/635 [00:00<?, ?it/s]

(1905, 0.35161708522284907)

In [63]:
test_result_8, test_model_outputs_8, test_wrong_predictions_8 = model_8.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_8
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.8710201662074972,
 'tp': 2905,
 'tn': 4224,
 'fp': 118,
 'fn': 366,
 'auroc': 0.9715604770986213,
 'auprc': 0.9710324252763317,
 'acc': 0.9364245369762249,
 'f1': 0.92310136638068,
 'eval_loss': 0.20129665636171507}

In [64]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [None]:
#8 is more epochs

In [65]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=1, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization
model_args.reprocess_input_data = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_9 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [66]:
model_9.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/635 [00:00<?, ?it/s]

(635, 0.44465042929363063)

In [67]:
test_result_9, test_model_outputs_9, test_wrong_predictions_9 = model_9.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_9
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.7581707592286032,
 'tp': 2645,
 'tn': 4065,
 'fp': 277,
 'fn': 626,
 'auroc': 0.9299514697294496,
 'auprc': 0.9316169944848922,
 'acc': 0.8813871010114278,
 'f1': 0.8541902147585985,
 'eval_loss': 0.31429139175048965}

In [68]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [69]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=4, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_10 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [70]:
model_10.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/635 [00:00<?, ?it/s]

(2540, 0.3230254058927796)

In [71]:
test_result_10, test_model_outputs_10, test_wrong_predictions_10 = model_10.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_10
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.9048845584304099,
 'tp': 2994,
 'tn': 4262,
 'fp': 80,
 'fn': 277,
 'auroc': 0.9864592476266103,
 'auprc': 0.9851139590845678,
 'acc': 0.9531065283068435,
 'f1': 0.9437352245862884,
 'eval_loss': 0.14455998975739587}

In [8]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [9]:
# Create a ClassificationModel 2
model_args = ClassificationArgs(num_train_epochs=2, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 60
model_args.eval_batch_size = 60


model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1
model_args.learning_rate =	2e-5

model_11 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [10]:
model_11.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)


  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/127 [00:00<?, ?it/s]

(254, 0.41058652038414645)

In [11]:
test_result_11, test_model_outputs_11, test_wrong_predictions_11 = model_11.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_11

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/127 [00:00<?, ?it/s]

{'mcc': 0.7628635795361931,
 'tp': 2651,
 'tn': 4076,
 'fp': 266,
 'fn': 620,
 'auroc': 0.9278653848618169,
 'auprc': 0.9306755212593504,
 'acc': 0.8836201234730067,
 'f1': 0.8568196509372981,
 'eval_loss': 0.3120873967024285}

In [12]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [14]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=5, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True #this enables the built-in Bertweet custom tokenizer

model_args.reprocess_input_data = True
#odel_args.evaluate_during_training = True
#model_args.evaluate_during_training_verbose = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_12 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [15]:
model_12.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/635 [00:00<?, ?it/s]

(3175, 0.29107077941753146)

In [16]:
test_result_12, test_model_outputs_12, test_wrong_predictions_12 = model_12.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_12
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.931563195194834,
 'tp': 3083,
 'tn': 4274,
 'fp': 68,
 'fn': 188,
 'auroc': 0.9938622860104872,
 'auprc': 0.9929696290682473,
 'acc': 0.966373308813871,
 'f1': 0.96013702896294,
 'eval_loss': 0.09746183580645602}

In [None]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [17]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=6, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True 
model_args.reprocess_input_data = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_13 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [18]:
model_13.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Running Epoch 0 of 6:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 1 of 6:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 2 of 6:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 3 of 6:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 4 of 6:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 5 of 6:   0%|          | 0/635 [00:00<?, ?it/s]

(3810, 0.27088716509000316)

In [19]:
test_result_13, test_model_outputs_13, test_wrong_predictions_13 = model_13.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_13
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.9501556809769393,
 'tp': 3149,
 'tn': 4278,
 'fp': 64,
 'fn': 122,
 'auroc': 0.9965002032714667,
 'auprc': 0.9951678079910616,
 'acc': 0.9755681071850781,
 'f1': 0.9713140037014188,
 'eval_loss': 0.0768737896528887}

In [20]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

In [21]:
# Create a ClassificationModel 1
model_args = ClassificationArgs(num_train_epochs=7, 
                                overwrite_output_dir=True)
model_args.manual_seed = 64
model_args.best_model_dir = "/best_model"
model_args.output_dir = "/output"
model_args.normalization = True #this enables the built-in Bertweet custom tokenizer
model_args.reprocess_input_data = True
model_args.train_batch_size = 12
model_args.eval_batch_size = 12

model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

model_14 = ClassificationModel(model_type='bertweet', 
                            model_name='vinai/bertweet-base', 
                            args = model_args, 
                            num_labels = 2,use_cuda= True)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [22]:
model_14.train_model(shuffled_training,
                  acc=sklearn.metrics.accuracy_score, 
                  f1=sklearn.metrics.f1_score,no_deprecation_warning=True)

  0%|          | 0/7613 [00:00<?, ?it/s]



Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 0 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 1 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 2 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 3 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 4 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 5 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

Running Epoch 6 of 7:   0%|          | 0/635 [00:00<?, ?it/s]

(4445, 0.2506633456078797)

In [23]:
test_result_14, test_model_outputs_14, test_wrong_predictions_14 = model_14.eval_model(shuffled_testing, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            f1=sklearn.metrics.f1_score)
test_result_14
#way faster training

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/635 [00:00<?, ?it/s]

{'mcc': 0.9549656587396433,
 'tp': 3165,
 'tn': 4280,
 'fp': 62,
 'fn': 106,
 'auroc': 0.9969833514543238,
 'auprc': 0.9960411035874388,
 'acc': 0.9779324839091028,
 'f1': 0.9741458910433979,
 'eval_loss': 0.07252642762055254}