In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from datasets import ClassLabel
from transformers import AutoTokenizer
from tqdm.auto import tqdm
tqdm.pandas()
pd.set_option('display.max_columns', None)

import numpy as np
from transformers import AutoConfig, AutoModelForSequenceClassification
from datasets import Dataset
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_parquet(f"../data/parquet/dataset.parquet")
labels = dataset['target'].unique().tolist()
lconv = ClassLabel(num_classes=len(labels), names=labels)

train = dataset.sample(frac=0.90, random_state=42).reset_index(drop=True)
test = dataset.drop(train.index).reset_index(drop=True)

In [3]:
# Just for test purposes
# train = train.head(10)
# test = test.head(2)

In [3]:
train['target'] = train['target'].apply(lambda x: lconv.str2int(x))
train['text'] = train['text'].astype(str)
train['target'] = train['target'].astype(int)
ds_train = Dataset.from_pandas(train)
print(ds_train)

Dataset({
    features: ['text', 'target'],
    num_rows: 46706
})


In [4]:
model_name = 'bert-base-uncased' 
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/mnt/dmif-nas/SMDC/HF-Cache/")
config = AutoConfig.from_pretrained(model_name, num_labels=len(labels), cache_dir="/mnt/dmif-nas/SMDC/HF-Cache/")
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config, cache_dir="/mnt/dmif-nas/SMDC/HF-Cache/")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
def tokenize_function(examples):
    examples['labels'] = examples['target']
    examples = examples['text']
    tok = tokenizer.batch_encode_plus(examples, padding=True, truncation=True)
    return tok
tokenized_train = ds_train.map(tokenize_function, batched=True)
tokenized_train

  0%|          | 0/47 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'target', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 46706
})

In [7]:
# validation
test['target'] = test['target'].apply(lambda x: lconv.str2int(x))
test['text'] = test['text'].astype(str)
test['target'] = test['target'].astype(int)
ds_test = Dataset.from_pandas(test)
tokenized_val = ds_test.map(tokenize_function, batched=True)
tokenized_val

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'target', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})

In [8]:
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    pred = np.argmax(predictions, axis=-1)
    
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred), 
        "MSE": mean_squared_error(y_true=labels, y_pred=pred),
        #
        "micro_precision": precision_score(y_true=labels, y_pred=pred, average='micro'), 
        "macro_precision": precision_score(y_true=labels, y_pred=pred, average='macro'), 
        "weighted_precision": precision_score(y_true=labels, y_pred=pred, average='weighted'), 
        #
        "micro_recall": recall_score(y_true=labels, y_pred=pred, average='micro'), 
        "macro_recall": recall_score(y_true=labels, y_pred=pred, average='macro'), 
        "weighted_recall": recall_score(y_true=labels, y_pred=pred, average='weighted'), 
        #
        "micro_f1": f1_score(y_true=labels, y_pred=pred, average='micro'), 
        "macro_f1": f1_score(y_true=labels, y_pred=pred, average='macro'), 
        "weighted_f1": f1_score(y_true=labels, y_pred=pred, average='weighted'), 
    }

args = TrainingArguments(
    output_dir='/mnt/dmif-nas/SMDC/HF-tmp/',
    evaluation_strategy = "epoch",
    save_strategy="no",
    num_train_epochs=3,
    no_cuda=False,
    per_device_train_batch_size=2,
)

class CustomTrainer(Trainer):
    def compute_loss(self, *args, **kwargs):
        return super().compute_loss(*args, **kwargs)

trainer = CustomTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset= tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, target. If text, target are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 15


Epoch,Training Loss,Validation Loss,Accuracy,Mse,Micro Precision,Macro Precision,Weighted Precision,Micro Recall,Macro Recall,Weighted Recall,Micro F1,Macro F1,Weighted F1
1,No log,1.28586,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,No log,1.029806,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,No log,0.959206,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, target. If text, target are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, target. If text, target are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, target. If text, target are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exam

TrainOutput(global_step=15, training_loss=1.1838915506998697, metrics={'train_runtime': 1.2263, 'train_samples_per_second': 24.464, 'train_steps_per_second': 12.232, 'total_flos': 755436854340.0, 'train_loss': 1.1838915506998697, 'epoch': 3.0})

In [None]:
model_path = "../models/BERT/"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
config.save_pretrained(model_path)