# Fine-tuning a pre-trained Language Model 

In [1]:
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from simpletransformers.classification import (MultiLabelClassificationModel, MultiLabelClassificationArgs,
                                               ClassificationModel, ClassificationArgs)
import warnings
warnings.filterwarnings('ignore')

In [2]:
def bin_other_covid(labels, other_ind = 1):
    if labels[other_ind] == 1:
        return 0
    else:
        return 1

In [3]:
# Configuration

config = {
    "path_to_data": "./labeled_corpus_texts.csv",
    
    # our model on huggingface: https://huggingface.co/sagteam/covid-twitter-xlm-roberta-large/tree/main
    "model": {"model_type": "xlmroberta",
              "model_name": "sagteam/covid-twitter-xlm-roberta-large", },
    
    "args": {"train_batch_size": 32,
             "learning_rate": 4e-05,
             "num_train_epochs": 15,
             "no_save": True,
             "output_dir": "./outputs",
             "overwrite_output_dir": True, 
             "no_save": False,
             "save_optimizer_and_scheduler": False,
             "save_eval_checkpoints": False,
             "save_model_every_epoch": False
             },
    
    # task type: "bin" or "all"
    "task_type": "bin",
}

## Data

In [4]:
ds_dir = config["path_to_data"]

df = pd.read_csv(ds_dir, index_col=0, lineterminator='\n')

df_x_tr = df.query("part=='train'")
df_y_tr = df.query("part=='train'").iloc[:,3:]
df_x_vl = df.query("part=='valid'")
df_y_vl = df.query("part=='valid'").iloc[:,3:]
df_x_ts = df.query("part=='test'")
df_y_ts = df.query("part=='test'").iloc[:,3:]

num_labels = len(df_y_tr.columns)

l_y_column_names = list(df_y_tr.columns)
df_y_tr = df_y_tr[l_y_column_names]
df_y_vl = df_y_vl[l_y_column_names]
df_y_ts = df_y_ts[l_y_column_names]

### Data preparation for simpletransformers

In [5]:
if config["task_type"] == "bin":
    # Binary classification: "other" and "covid"
    other_ind = None
    for column_ind, column_name in enumerate(df_y_tr.columns):
        if column_name == "other":
            other_ind = column_ind
            break
    
    tr = df_x_tr.loc[:, ['text']]
    tr["labels"] = [bin_other_covid(row, other_ind) for row in df_y_tr.values]

    vl = df_x_vl.loc[:, ['text']]
    vl["labels"] = [bin_other_covid(row, other_ind) for row in df_y_vl.values]

    ts = df_x_ts.loc[:, ['text']]
    ts["labels"] = [bin_other_covid(row, other_ind) for row in df_y_ts.values]

else:
    # Classification by all classes
    tr = df_x_tr.loc[:, ['text']]
    tr["labels"] = [list(row) for row in df_y_tr.values]

    vl = df_x_vl.loc[:, ['text']]
    vl["labels"] = [list(row) for row in df_y_vl.values]

    ts = df_x_ts.loc[:, ['text']]
    ts["labels"] = [list(row) for row in df_y_ts.values]

## Model

In [6]:
d_scores = {}

if config["task_type"] == "all":
    column_names = l_y_column_names
    model_args = MultiLabelClassificationArgs(**config["args"])
    model = MultiLabelClassificationModel(config["model"]["model_type"], config["model"]["model_name"],
                                          num_labels=num_labels, args=model_args)
elif config["task_type"] == "bin":
    column_names = ["other", "potential_covid"]
    model_args = ClassificationArgs(**config["args"])
    model = ClassificationModel(config["model"]["model_type"], config["model"]["model_name"],
                                num_labels=2, args=model_args)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=645.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2240744018.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at sagteam/covid-twitter-xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at sagteam/covid-twitter-xlm-roberta-large and are newly initialized: ['classi

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=151.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=288.0, style=ProgressStyle(description_…




In [7]:
# training model

model.train_model(tr, eval_df=vl)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=15.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 15', max=185.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 10 of 15', max=185.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 11 of 15', max=185.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 12 of 15', max=185.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 13 of 15', max=185.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 14 of 15', max=185.0, style=ProgressStyle(d…





(2775, 0.03802280801402884)

In [8]:
# load best model
"""
if config["task_type"] == "all":
    model = MultiLabelClassificationModel(config["model"]["model_type"], "./outputs",
                                          num_labels=num_labels, args=model_args)
elif config["task_type"] == "bin":
    model = ClassificationModel(config["model"]["model_type"], "./outputs", 
                                num_labels=2, args=model_args)
"""

'\nif config["task_type"] == "all":\n    model = MultiLabelClassificationModel(config["model"]["model_type"], "./outputs",\n                                          num_labels=num_labels, args=model_args)\nelif config["task_type"] == "bin":\n    model = ClassificationModel(config["model"]["model_type"], "./outputs", \n                                num_labels=2, args=model_args)\n'

In [9]:
# prediction and scoring

pred, raw_outputs = model.predict(list(tr["text"].values))

print("Accuracy on the training set:")
print(classification_report(tr["labels"].tolist(), pred, target_names=column_names))
d_scores["tr"] = {"f1_macro": f1_score(tr["labels"].tolist(), pred, average="macro"),
                  "classification_report": classification_report(tr["labels"].tolist(), pred, target_names=column_names),
                  "d_classfification_report": classification_report(tr["labels"].tolist(), pred, target_names=column_names, output_dict=True)}

pred, raw_outputs = model.predict(list(vl["text"].values))

print("Accuracy on the validation set:")
print(classification_report(vl["labels"].tolist(), pred, target_names=column_names))
d_scores["vl"] = {"f1_macro": f1_score(vl["labels"].tolist(), pred, average="macro"),
                  "classification_report": classification_report(vl["labels"].tolist(), pred, target_names=column_names),
                  "d_classfification_report": classification_report(vl["labels"].tolist(), pred, target_names=column_names, output_dict=True)}

pred, raw_outputs = model.predict(list(ts["text"].values))

print("Accuracy on a test set:")
print(classification_report(ts["labels"].tolist(), pred, target_names=column_names))
d_scores["ts"] = {"f1_macro": f1_score(ts["labels"].tolist(), pred, average="macro"),
                  "classification_report": classification_report(ts["labels"].tolist(), pred, target_names=column_names),
                  "d_classfification_report": classification_report(ts["labels"].tolist(), pred, target_names=column_names, output_dict=True)}

HBox(children=(FloatProgress(value=0.0, max=5916.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=740.0), HTML(value='')))


Accuracy on the training set:
                 precision    recall  f1-score   support

          other       1.00      1.00      1.00      5322
potential_covid       1.00      1.00      1.00       594

       accuracy                           1.00      5916
      macro avg       1.00      1.00      1.00      5916
   weighted avg       1.00      1.00      1.00      5916



HBox(children=(FloatProgress(value=0.0, max=1972.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Accuracy on the validation set:
                 precision    recall  f1-score   support

          other       0.97      0.96      0.96      1757
potential_covid       0.70      0.73      0.71       215

       accuracy                           0.94      1972
      macro avg       0.83      0.84      0.84      1972
   weighted avg       0.94      0.94      0.94      1972



HBox(children=(FloatProgress(value=0.0, max=1972.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Accuracy on a test set:
                 precision    recall  f1-score   support

          other       0.97      0.96      0.97      1769
potential_covid       0.69      0.72      0.71       203

       accuracy                           0.94      1972
      macro avg       0.83      0.84      0.84      1972
   weighted avg       0.94      0.94      0.94      1972



In [10]:
print("Successful complete")

Successful complete
