In [None]:
!git clone https://www.github.com/nvidia/apex
!cd ..
!pip install -v --no-cache-dir ./apex
!pip install simpletransformers

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit
from simpletransformers.ner import NERModel
from seqeval.metrics import f1_score, classification_report

In [2]:
shuffle_split = GroupShuffleSplit(n_splits=5, test_size=0.3, random_state=2021)

In [4]:
df = pd.read_csv("disengagement_aggregated.csv")
df.words = df.words.astype('str')
df_post_train = pd.read_csv("post_train.csv")
df_post_test = pd.read_csv("post_test.csv")
df_post_train.words = df_post_train.words.astype(str)
df_post_train.labels = df_post_train.labels.astype(str)
df_post_test.words = df_post_test.words.astype(str)
df_post_test.labels = df_post_test.labels.astype(str)

## Post train

In [None]:
start = time.time()
tags = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
model_args = {
    "output_dir": f"post_output/",
    'best_model_dir': f"post_output/best_model/",
    "manual_seed": 2020,
    "do_lower_case": True,
    "num_train_epochs": 8,
    "learning_rate": 5e-5,
    "max_seq_length": 128,

    "evaluate_during_training": True, 
    "evaluate_during_training_verbose": True,
    "evaluate_during_training_steps": 4450,
    "save_eval_checkpoints": False,

    "use_early_stopping": False,
    "early_stopping_delta": 0.001,
    "early_stopping_metric": "f1_score",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 3,

    'overwrite_output_dir': True,
}
labels = df_post_test.groupby(['sentence_id'])['labels'].apply(list).to_list()
#model = NERModel('xlnet', 'xlnet-base-cased', labels=tags, use_cuda = True, args=model_args)
model = NERModel('roberta', 'roberta-base', labels=tags, use_cuda = True, args=model_args)
model.train_model(df_post_train, eval_data = df_post_test)
result, model_outputs, predictions = model.eval_model(df_post_test)
end = time.time()
print(end - start)


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]



Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/557 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/99 [00:00<?, ?it/s]

818.2565560340881


In [None]:
for _, test_index in shuffle_split.split(df.words, df.labels, groups=df.sentence_id):
  df_test = df.loc[test_index,:]
  tags = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
  model_args = {
    "output_dir": f"post_output/",
    'best_model_dir': f"post_output/best_model/",
    "manual_seed": 2020,
    "do_lower_case": True,
    "num_train_epochs": 8,
    "learning_rate": 5e-5,
    "max_seq_length": 128,

    "evaluate_during_training": True, 
    "evaluate_during_training_verbose": True,
    "evaluate_during_training_steps": 4450,
    "save_eval_checkpoints": False,

    "use_early_stopping": False,
    "early_stopping_delta": 0.001,
    "early_stopping_metric": "f1_score",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 3,

    'overwrite_output_dir': True,
}
  labels = df_test.groupby(['sentence_id'])['labels'].apply(list).to_list()
  model = NERModel('roberta', '/content/post_output/best_model/', labels=tags, use_cuda = True, args=model_args)
  result, model_outputs, predictions = model.eval_model(df_test)
  ind = []
  for i in range(len(labels)):
    if len(labels[i]) == len(predictions[i]):
      ind.append(i)
  labels = np.array(labels)[ind]
  predictions = np.array(predictions)[ind]
  print(classification_report(labels, predictions))

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]



              precision    recall  f1-score   support

           C       0.58      0.60      0.59       124
          CE       0.83      0.50      0.62        10
           E       0.48      0.49      0.48       100

   micro avg       0.54      0.55      0.54       234
   macro avg       0.63      0.53      0.57       234
weighted avg       0.55      0.55      0.54       234



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.57      0.65      0.61       114
          CE       1.00      0.33      0.50         6
           E       0.44      0.51      0.47        96

   micro avg       0.51      0.58      0.54       216
   macro avg       0.67      0.50      0.53       216
weighted avg       0.52      0.58      0.54       216



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.47      0.49      0.48       120
          CE       0.25      0.20      0.22         5
           E       0.43      0.48      0.46        95

   micro avg       0.45      0.48      0.46       220
   macro avg       0.38      0.39      0.39       220
weighted avg       0.45      0.48      0.46       220



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.49      0.59      0.53       111
          CE       0.50      0.33      0.40         3
           E       0.50      0.59      0.54        93

   micro avg       0.49      0.58      0.53       207
   macro avg       0.50      0.50      0.49       207
weighted avg       0.49      0.58      0.53       207



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.56      0.60      0.58       120
          CE       0.38      0.38      0.38         8
           E       0.52      0.52      0.52       101

   micro avg       0.54      0.56      0.55       229
   macro avg       0.49      0.50      0.49       229
weighted avg       0.54      0.56      0.55       229



In [None]:
weighted_f1 = (0.46+0.54+0.54+0.53+0.55) / 5
print(f"weighted_f1 for post train is {weighted_f1}")
weighted_f1_std = np.std([0.46, 0.54, 0.54, 0.53, 0.55])
print(f"weighted_f1 for post train is {weighted_f1_std}")

weighted_f1 for post train is 0.524
weighted_f1 for post train is 0.032619012860600184


## Save best post train model for later use

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp -r /content/post_output/best_model/ /content/drive/MyDrive/cause_and_effect/best_model/

cp: cannot create directory '/content/drive/MyDrive/cause_and_effect/best_model/': No such file or directory


## post train + fine tuning

In [None]:
start = time.time()
i = 0
for train_index, test_index in shuffle_split.split(df.words, df.labels, groups=df.sentence_id):
  df_train = df.loc[train_index,:]
  df_test = df.loc[test_index,:]
  tags = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
  model_args = {
      "output_dir": f"electra_all_outputs_{i}/",
      'best_model_dir': f"electra_all_outputs_{i}/best_model/",
      "do_lower_case": False,
      "manual_seed": 2020,
      "num_train_epochs": 8,
      "learning_rate": 5e-5,
      "max_seq_length": 128,

      "evaluate_during_training": True, 
      "evaluate_during_training_verbose": True,
      "evaluate_during_training_steps": 4450,
      "save_eval_checkpoints": False,

      "use_early_stopping": True,
      "early_stopping_delta": 0.01,
      "early_stopping_metric": "f1_score",
      "early_stopping_metric_minimize": False,
      "early_stopping_patience": 3,

      'overwrite_output_dir': True,
      'save_optimizer_and_scheduler': False,
      'save_model_every_epoch': False,
      'save_steps': -1,
  }
  labels_ = df_test.groupby(['sentence_id'])['labels'].apply(list).to_list()
  model_ = NERModel('roberta', '/content/post_output/best_model/', labels=tags, use_cuda = True, args=model_args)

  model_.train_model(df_train, eval_data = df_test)
  result_, model_outputs_, predictions_ = model_.eval_model(df_test)
  ind = []
  for i in range(len(labels_)):
    if len(labels_[i]) == len(predictions_[i]):
      ind.append(i)
  labels_ = np.array(labels_)[ind]
  predictions_ = np.array(predictions_)[ind]
  print(classification_report(labels_, predictions_))
  i += 1
end = time.time()
print(end - start)

  0%|          | 0/2 [00:00<?, ?it/s]



Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]



              precision    recall  f1-score   support

           C       0.91      0.91      0.91       124
          CE       0.80      0.80      0.80        10
           E       0.90      0.90      0.90       100

   micro avg       0.90      0.90      0.90       234
   macro avg       0.87      0.87      0.87       234
weighted avg       0.90      0.90      0.90       234



  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.81      0.92      0.86       114
          CE       0.86      1.00      0.92         6
           E       0.85      0.90      0.87        96

   micro avg       0.83      0.91      0.87       216
   macro avg       0.84      0.94      0.89       216
weighted avg       0.83      0.91      0.87       216



  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.90      0.85      0.88       120
          CE       1.00      1.00      1.00         5
           E       0.87      0.86      0.87        95

   micro avg       0.89      0.86      0.88       220
   macro avg       0.92      0.90      0.91       220
weighted avg       0.89      0.86      0.87       220



  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.85      0.90      0.87       111
          CE       1.00      1.00      1.00         3
           E       0.80      0.89      0.84        93

   micro avg       0.83      0.90      0.86       207
   macro avg       0.88      0.93      0.91       207
weighted avg       0.83      0.90      0.86       207



  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.87      0.91      0.89       120
          CE       0.88      0.88      0.88         8
           E       0.83      0.90      0.86       101

   micro avg       0.85      0.90      0.88       229
   macro avg       0.86      0.89      0.87       229
weighted avg       0.85      0.90      0.88       229

709.2014241218567


In [None]:
i = 0
for train_index, test_index in shuffle_split.split(df.words, df.labels, groups=df.sentence_id):
  df_train = df.loc[train_index,:]
  df_test = df.loc[test_index,:]
  tags = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
  model_args = {
      "output_dir": f"electra_all_outputs_{i}/",
      'best_model_dir': f"electra_all_outputs_{i}/best_model/",
      "do_lower_case": False,
      "manual_seed": 2020,
      "num_train_epochs": 8,
      "learning_rate": 5e-5,
      "max_seq_length": 128,

      "evaluate_during_training": True, 
      "evaluate_during_training_verbose": True,
      "evaluate_during_training_steps": 4450,
      "save_eval_checkpoints": False,

      "use_early_stopping": True,
      "early_stopping_delta": 0.01,
      "early_stopping_metric": "f1_score",
      "early_stopping_metric_minimize": False,
      "early_stopping_patience": 3,

      'overwrite_output_dir': True,
      'save_optimizer_and_scheduler': False,
      'save_model_every_epoch': False,
      'save_steps': -1,
  }
  labels_ = df_test.groupby(['sentence_id'])['labels'].apply(list).to_list()
  model_ = NERModel('roberta', f"electra_all_outputs_{i}/best_model/", labels=tags, use_cuda = True, args=model_args)
  result_, model_outputs_, predictions_ = model_.eval_model(df_test)
  ind = []
  for i in range(len(labels_)):
    if len(labels_[i]) == len(predictions_[i]):
      ind.append(i)
  labels_ = np.array(labels_)[ind]
  predictions_ = np.array(predictions_)[ind]
  print(classification_report(labels_, predictions_, 2))
  i += 1

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]



              precision    recall  f1-score   support

           C       0.90      0.91      0.91       124
          CE       0.90      0.90      0.90        10
           E       0.88      0.91      0.90       100

   micro avg       0.89      0.91      0.90       234
   macro avg       0.90      0.91      0.90       234
weighted avg       0.90      0.91      0.90       234



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.96      0.97      0.97       114
          CE       0.86      1.00      0.92         6
           E       0.93      0.95      0.94        96

   micro avg       0.94      0.96      0.95       216
   macro avg       0.91      0.97      0.94       216
weighted avg       0.94      0.96      0.95       216



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.96      0.94      0.95       120
          CE       1.00      1.00      1.00         5
           E       0.94      0.92      0.93        95

   micro avg       0.95      0.93      0.94       220
   macro avg       0.96      0.95      0.96       220
weighted avg       0.95      0.93      0.94       220



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.97      0.95      0.96       111
          CE       1.00      1.00      1.00         3
           E       0.89      0.91      0.90        93

   micro avg       0.93      0.94      0.93       207
   macro avg       0.95      0.96      0.95       207
weighted avg       0.93      0.94      0.94       207



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.87      0.91      0.89       120
          CE       0.88      0.88      0.88         8
           E       0.83      0.90      0.86       101

   micro avg       0.85      0.90      0.88       229
   macro avg       0.86      0.89      0.87       229
weighted avg       0.85      0.90      0.88       229



In [3]:
import numpy as np
weighted_f1 = (0.90 + 0.94 + 0.95 + 0.94 + 0.88) / 5
print(f"weighted_f1 for post train + fine tuning is {weighted_f1}")
weighted_f1_std = np.std([0.90, 0.94, 0.95, 0.94, 0.88])
print(f"weighted_f1 for post train + fine tuning is {weighted_f1_std}")

weighted_f1 for post train + fine tuning is 0.922
weighted_f1 for post train + fine tuning is 0.027129319932501044


## fine tunig

In [5]:
start = time.time()
i = 0
for train_index, test_index in shuffle_split.split(df.words, df.labels, groups=df.sentence_id):
  df_train = df.loc[train_index,:]
  df_test = df.loc[test_index,:]
  tags = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
  model_args = {
      "output_dir": f"electra_outputs_{i}/",
      'best_model_dir': f"electra_outputs_{i}/best_model/",
      "do_lower_case": True,
      "manual_seed": 2020,
      "num_train_epochs": 8,
      "learning_rate": 5e-5,
      "max_seq_length": 128,

      "evaluate_during_training": True, 
      "evaluate_during_training_verbose": True,
      "evaluate_during_training_steps": 4450,
      "save_eval_checkpoints": False,

      "use_early_stopping": True,
      "early_stopping_delta": 0.01,
      "early_stopping_metric": "f1_score",
      "early_stopping_metric_minimize": False,
      "early_stopping_patience": 3,

      'overwrite_output_dir': True,
      'save_optimizer_and_scheduler': False,
      'save_model_every_epoch': False,
      'save_steps': -1,
  }
  labels_ = df_test.groupby(['sentence_id'])['labels'].apply(list).to_list()
  model_ = NERModel('roberta', 'roberta-base', labels=tags, use_cuda = True, args=model_args)
  model_.train_model(df_train, eval_data = df_test)
  result_, model_outputs_, predictions_ = model_.eval_model(df_test)
  ind = []
  for i in range(len(labels_)):
    if len(labels_[i]) == len(predictions_[i]):
      ind.append(i)
  labels_ = np.array(labels_)[ind]
  predictions_ = np.array(predictions_)[ind]
  print(classification_report(labels_, predictions_, 2))
  i += 1
end = time.time()
print(end - start)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]



Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]



              precision    recall  f1-score   support

           C       0.80      0.89      0.84       124
          CE       1.00      0.50      0.67        10
           E       0.80      0.87      0.83       100

   micro avg       0.80      0.86      0.83       234
   macro avg       0.87      0.75      0.78       234
weighted avg       0.81      0.86      0.83       234



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.78      0.88      0.82       114
          CE       1.00      0.67      0.80         6
           E       0.81      0.89      0.85        96

   micro avg       0.79      0.88      0.83       216
   macro avg       0.86      0.81      0.82       216
weighted avg       0.80      0.88      0.83       216



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.88      0.85      0.86       120
          CE       1.00      0.80      0.89         5
           E       0.84      0.86      0.85        95

   micro avg       0.86      0.85      0.86       220
   macro avg       0.91      0.84      0.87       220
weighted avg       0.86      0.85      0.86       220



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.74      0.82      0.78       111
          CE       0.33      0.33      0.33         3
           E       0.75      0.78      0.77        93

   micro avg       0.74      0.80      0.77       207
   macro avg       0.61      0.65      0.63       207
weighted avg       0.74      0.80      0.77       207



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Running Epoch 1 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.79      0.85      0.82       120
          CE       1.00      0.50      0.67         8
           E       0.75      0.84      0.79       101

   micro avg       0.77      0.83      0.80       229
   macro avg       0.85      0.73      0.76       229
weighted avg       0.78      0.83      0.80       229

643.7885053157806


In [6]:
i = 0
for train_index, test_index in shuffle_split.split(df.words, df.labels, groups=df.sentence_id):
  df_train = df.loc[train_index,:]
  df_test = df.loc[test_index,:]
  tags = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
  model_args = {
      "output_dir": f"electra_outputs_{i}/",
      'best_model_dir': f"electra_outputs_{i}/best_model/",
      "do_lower_case": True,
      "manual_seed": 2020,
      "num_train_epochs": 8,
      "learning_rate": 5e-5,
      "max_seq_length": 128,

      "evaluate_during_training": True, 
      "evaluate_during_training_verbose": True,
      "evaluate_during_training_steps": 4450,
      "save_eval_checkpoints": False,

      "use_early_stopping": True,
      "early_stopping_delta": 0.01,
      "early_stopping_metric": "f1_score",
      "early_stopping_metric_minimize": False,
      "early_stopping_patience": 3,

      'overwrite_output_dir': True,
      'save_optimizer_and_scheduler': False,
      'save_model_every_epoch': False,
      'save_steps': -1,
  }
  labels_ = df_test.groupby(['sentence_id'])['labels'].apply(list).to_list()
  model_ = NERModel('roberta', f"electra_outputs_{i}/best_model/", labels=tags, use_cuda = True, args=model_args)
  result_, model_outputs_, predictions_ = model_.eval_model(df_test)
  ind = []
  for i in range(len(labels_)):
    if len(labels_[i]) == len(predictions_[i]):
      ind.append(i)
  labels_ = np.array(labels_)[ind]
  predictions_ = np.array(predictions_)[ind]
  print(classification_report(labels_, predictions_, 2))
  i += 1

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]



              precision    recall  f1-score   support

           C       0.80      0.87      0.83       124
          CE       1.00      0.50      0.67        10
           E       0.81      0.87      0.84       100

   micro avg       0.81      0.85      0.83       234
   macro avg       0.87      0.75      0.78       234
weighted avg       0.81      0.85      0.83       234



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.89      0.97      0.93       114
          CE       1.00      0.83      0.91         6
           E       0.88      0.93      0.90        96

   micro avg       0.89      0.95      0.92       216
   macro avg       0.92      0.91      0.91       216
weighted avg       0.89      0.95      0.92       216



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.92      0.93      0.92       120
          CE       1.00      0.80      0.89         5
           E       0.87      0.85      0.86        95

   micro avg       0.90      0.89      0.89       220
   macro avg       0.93      0.86      0.89       220
weighted avg       0.90      0.89      0.89       220



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.90      0.94      0.92       111
          CE       1.00      1.00      1.00         3
           E       0.85      0.87      0.86        93

   micro avg       0.88      0.91      0.90       207
   macro avg       0.92      0.94      0.93       207
weighted avg       0.88      0.91      0.90       207



  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/40 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           C       0.79      0.85      0.82       120
          CE       1.00      0.50      0.67         8
           E       0.75      0.84      0.79       101

   micro avg       0.77      0.83      0.80       229
   macro avg       0.85      0.73      0.76       229
weighted avg       0.78      0.83      0.80       229



In [7]:
weighted_f1 = (0.83 + 0.92 + 0.89 + 0.90 + 0.80) / 5
print(f"weighted_f1 for post train + fine tuning is {weighted_f1}")
weighted_f1_std = np.std([0.83, 0.92, 0.89, 0.90, 0.80])
print(f"weighted_f1 for post train + fine tuning is {weighted_f1_std}")

weighted_f1 for post train + fine tuning is 0.868
weighted_f1 for post train + fine tuning is 0.04534313619501854
