In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# *Installing Required Libraries*

In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [3]:
# Make the Drive root the working directory; relative paths used later
# (e.g. "data/train.csv") are resolved against the current directory.
%cd '/content/drive/MyDrive/'

/content/drive/MyDrive


In [2]:
!unzip /content/data.zip

Archive:  /content/data.zip
  inflating: data/val.csv            
  inflating: data/train.csv          
  inflating: data/test.csv           
  inflating: data/final_val.csv      
  inflating: data/final_test.csv     
  inflating: data/final_train.csv    


# *Importing libraries*

In [3]:
import json
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
from datasets import Dataset
from datasets import load_dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForMaskedLM,AutoModelForSequenceClassification
from transformers import TrainerCallback, EarlyStoppingCallback
from transformers import TrainingArguments, Trainer

In [4]:
# Helper columns that are not used for modelling and are dropped from every split.
unused_columns = ['Num_Tokens', 'Num_Sentences']

# Read each split and strip the helper columns in one step.
train_df = pd.read_csv("data/train.csv").drop(columns=unused_columns)
test_df = pd.read_csv("data/test.csv").drop(columns=unused_columns)
val_df = pd.read_csv("data/val.csv").drop(columns=unused_columns)

In [5]:
# Label distribution of the training split (shows the class imbalance).
train_df['classification_binary'].value_counts()

classification_binary
0    1137
1     428
Name: count, dtype: int64

In [6]:
# Label distribution of the validation split.
val_df['classification_binary'].value_counts()

classification_binary
0    379
1    143
Name: count, dtype: int64

In [7]:
# Label distribution of the test split.
test_df['classification_binary'].value_counts()

classification_binary
0    380
1    143
Name: count, dtype: int64

In [9]:
# Persist the trimmed splits; these files are what load_dataset reads next.
final_csv_targets = {
    'data/final_train.csv': train_df,
    'data/final_test.csv': test_df,
    'data/final_val.csv': val_df,
}
for csv_path, split_frame in final_csv_targets.items():
    split_frame.to_csv(csv_path, index=False, encoding='utf-8')

## *Loading the dataset with HuggingFace Datasets*

In [10]:
# Map each split name to its cleaned CSV, then load them together.
split_files = {
    'train': "data/final_train.csv",
    'val': "data/final_val.csv",
    'test': "data/final_test.csv",
}
dataset = load_dataset('csv', data_files=split_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [11]:
# Peek at the first training record (text plus binary label).
dataset['train'][0]

{'clean_text': 'matlab ek naale saaf karne wale ka beta jo kabhi coaching nhi gaya , jiske pass saare subhidaye nhi hai kya wo kabhi naukri na kare kyuki uuske pass merit nhi hai reservation isiliye hai taki pichdi jati ko uper uthaya ja sake iisme merit maayne nhi rahkta',
 'classification_binary': 0}

In [12]:
# Peek at the first validation record.
dataset['val'][0]

{'clean_text': 'didi ka baat toh sunega hi kiu khi raheneka jyga diya, aadhar bana diyakhudh ka seat bachane ke liye pahele se rahene wala ki izzat le raha hai, khun kar raha hai, harmad ghusha raha hai, terror ka khuf felane ke liye',
 'classification_binary': 0}

In [13]:
# Peek at the first test record.
dataset['test'][0]

{'clean_text': 'mein jatiwad k paksh m nahi hu per,adhyapak mahodaya ko duniya k anubhav h,do alag alag jati ki alag sankranti hoti h jo baad m var vadhu aur pariwar walon k liye dikhayi hoti h ,ye bat ek jati k do pariwaron me bhi ho skti h,but apni jaati mein ,bhinnata on k khtra km hota h',
 'classification_binary': 0}

# *Tokenization as per Hing-MBERT*

In [14]:
# Load the tokenizer that matches the Hing-MBERT checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-mbert")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
# Tokenize every split to a fixed length of 74 tokens (padded / truncated).
mbert_dataset = dataset.map(
    lambda example: tokenizer(
        example['clean_text'],
        max_length=74,
        padding='max_length',
        truncation=True,
    ),
    batched=True,
    batch_size=64,
)

# Keep only model inputs, expose the target under the name "labels",
# and have the dataset return torch tensors.
mbert_dataset = (
    mbert_dataset
    .remove_columns(["clean_text"])
    .rename_column("classification_binary", "labels")
)
mbert_dataset.set_format("torch")

Map:   0%|          | 0/1565 [00:00<?, ? examples/s]

Map:   0%|          | 0/522 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

In [16]:
# Inspect the first tokenized training example (tensors after set_format("torch")).
mbert_dataset['train'][0]

{'labels': tensor(0),
 'input_ids': tensor([   101,  17255,  41284,  16334,  10132,  12223,  31659,  10575, 107188,
          11471,  10284,  10730,  26249,  12541,  10730,  39554,  10116,  57714,
            182,  11924,  45365,    117,  28882,  10550,  23392,  31659,  10246,
          13987,  63132,  12871,    182,  11924,  13080,  87147,  10113,  12796,
          10730,  39554,  10116,  53041,  10401,  10132,  25085,  10112,  87147,
          39821,    189, 107320,  10112,  23392,  94992,  10123,    182,  11924,
          13080,  44967,  10822,  10124,  13784,  12871,  13080,  74628,  24109,
          10269,  10703,  10201,  10325,  11252,  10741,  10165,  11735,  59562,
          10201,    102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'attention_

In [17]:
# Decode each input id of the first training example back to its token string.
print("Batch Decode:")
print(tokenizer.batch_decode(mbert_dataset['train'][0]['input_ids']))

Batch Decode:
['[CLS]', 'mat', '##lab', 'ek', 'na', '##ale', 'saa', '##f', 'karne', 'wa', '##le', 'ka', 'beta', 'jo', 'ka', '##bh', '##i', 'coaching', 'n', '##hi', 'gaya', ',', 'jis', '##ke', 'pass', 'saa', '##re', 'sub', '##hida', '##ye', 'n', '##hi', 'hai', 'ky', '##a', 'wo', 'ka', '##bh', '##i', 'nauk', '##ri', 'na', 'kar', '##e', 'ky', '##uki', 'u', '##usk', '##e', 'pass', 'meri', '##t', 'n', '##hi', 'hai', 'reserva', '##tion', 'is', '##ili', '##ye', 'hai', 'taki', 'pi', '##ch', '##di', 'ja', '##ti', 'ko', 'up', '##er', 'ut', '##haya', 'ja', '[SEP]']


In [18]:
# Walk through the tokenizer pipeline by hand for the first training text.
input_str = train_df.iloc[0]['clean_text']

# Special-token ids wrapped in lists so they can be concatenated with the ids below.
cls = [tokenizer.cls_token_id]
sep = [tokenizer.sep_token_id]

# text -> word pieces -> vocabulary ids -> ids framed by [CLS] ... [SEP]
input_tokens = tokenizer.tokenize(input_str)
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
input_ids_special_tokens = [*cls, *input_ids, *sep]

# Round-trip the ids back to a string.
decoded_str = tokenizer.decode(input_ids_special_tokens)

demo_steps = (
    ("start:                ", input_str),
    ("tokenize:             ", input_tokens),
    ("convert_tokens_to_ids:", input_ids),
    ("add special tokens:   ", input_ids_special_tokens),
)
for step_label, step_value in demo_steps:
    print(step_label, step_value)
print("--------")
print("decode:               ", decoded_str)

start:                 matlab ek naale saaf karne wale ka beta jo kabhi coaching nhi gaya , jiske pass saare subhidaye nhi hai kya wo kabhi naukri na kare kyuki uuske pass merit nhi hai reservation isiliye hai taki pichdi jati ko uper uthaya ja sake iisme merit maayne nhi rahkta
tokenize:              ['mat', '##lab', 'ek', 'na', '##ale', 'saa', '##f', 'karne', 'wa', '##le', 'ka', 'beta', 'jo', 'ka', '##bh', '##i', 'coaching', 'n', '##hi', 'gaya', ',', 'jis', '##ke', 'pass', 'saa', '##re', 'sub', '##hida', '##ye', 'n', '##hi', 'hai', 'ky', '##a', 'wo', 'ka', '##bh', '##i', 'nauk', '##ri', 'na', 'kar', '##e', 'ky', '##uki', 'u', '##usk', '##e', 'pass', 'meri', '##t', 'n', '##hi', 'hai', 'reserva', '##tion', 'is', '##ili', '##ye', 'hai', 'taki', 'pi', '##ch', '##di', 'ja', '##ti', 'ko', 'up', '##er', 'ut', '##haya', 'ja', 'sa', '##ke', 'ii', '##sme', 'meri', '##t', 'maa', '##yne', 'n', '##hi', 'ra', '##h', '##kta']
convert_tokens_to_ids: [17255, 41284, 16334, 10132, 12223, 31659, 10575, 

# *Calculating Weights for Labels with Imbalanced Dataset*

In [19]:
# Training labels (a tensor, since the dataset was put in "torch" format).
labels = mbert_dataset['train']['labels']

# Distinct label values present in the training split.
class_labels = np.unique(labels)

# 'balanced' gives each class the weight n_samples / (n_classes * class_count),
# so the minority class receives the larger weight.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=labels.numpy(),
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)
print(class_weights)

tensor([0.6882, 1.8283])


# **Weighted Loss Trainer for Imbalanced Dataset**

In [20]:
from transformers import TrainingArguments, Trainer
# Define a custom Trainer class to include class weights
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``class_weights``
    tensor each time the loss is computed, so that variable must be defined
    before training starts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dictionary; must contain a ``"labels"`` entry.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted because newer ``transformers``
                releases pass it to ``compute_loss``; it is not used here.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get("labels")
        # Forward pass without the labels: the model would otherwise also
        # compute its own unweighted loss, which is never used. The batch is
        # left unmodified so the caller can still read the labels from it.
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)
        logits = outputs.get("logits")
        # Place weights and labels on the logits' device; this also works when
        # the model is wrapped in a container without a `.device` attribute.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels.to(logits.device))
        return (loss, outputs) if return_outputs else loss

# *Function: Compute metrics for all labels*

In [21]:
def compute_metrics(eval_pred):
    """Compute classification metrics for a binary task from Trainer output.

    Args:
        eval_pred: A ``(logits, labels)`` pair as passed by the Trainer.

    Returns:
        dict: accuracy, weighted precision/recall/F1, micro and macro
        recall/F1, and recall/F1 for the positive (1) and negative (0) class.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def score(metric_fn, **options):
        # zero_division=0 yields the same 0.0 that sklearn's default ("warn")
        # returns when a class is never predicted, but without emitting the
        # UndefinedMetricWarning on every such evaluation.
        return metric_fn(labels, predictions, zero_division=0, **options)

    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision_weighted': score(precision_score, average='weighted'),
        'recall_weighted': score(recall_score, average='weighted'),
        'f1_weighted': score(f1_score, average='weighted'),
        'recall_micro': score(recall_score, average='micro'),
        'f1_micro': score(f1_score, average='micro'),
        'recall_macro': score(recall_score, average='macro'),
        'f1_macro': score(f1_score, average='macro'),
        'recall_positive': score(recall_score, pos_label=1),
        'f1_positive': score(f1_score, pos_label=1),
        'recall_negative': score(recall_score, pos_label=0),
        'f1_negative': score(f1_score, pos_label=0),
    }

In [25]:
# Hing-MBERT encoder with a freshly initialised 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("l3cube-pune/hing-mbert", num_labels=2)

# NOTE(review): with 1565 training rows and batch size 64 there are 25 steps
# per epoch (250 for 10 epochs), so warmup_steps=500 is never completed and the
# learning rate stays below half of 5e-5. Value left unchanged so the recorded
# results remain valid; consider a smaller warmup when re-running.
arguments = TrainingArguments(
    output_dir="sample_HingMBert_trainer",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    optim = 'adamw_torch',
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=5e-5,
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  # Define the metric for early stopping
    greater_is_better=False,  # Set to False because we want to minimize the loss
    save_total_limit=1,
    seed=224
)

# compute_metrics is defined once, in the "Compute metrics for all labels"
# cell above; the verbatim copy that used to live here only shadowed it.
trainer = WeightedLossTrainer(
    model=model,
    args=arguments,
    train_dataset=mbert_dataset['train'],
    eval_dataset=mbert_dataset['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-mbert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# *Logging Callback and Early Stopping*

In [22]:
class LoggingCallback(TrainerCallback):
    """Append every Trainer log record to a JSON-lines file.

    Args:
        log_path: Path of the file to append to; one JSON object per line.
    """

    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write ``logs`` (minus ``total_flos``) as one JSON line.

        Called by the Trainer on each logging event. Only the local main
        process writes, so the file is not duplicated across processes.
        """
        import os

        if logs is None:
            return
        # Build a filtered copy instead of popping from `logs`: the same dict
        # is handed to the other registered callbacks.
        record = {key: value for key, value in logs.items() if key != "total_flos"}
        if state.is_local_process_zero:
            # The first log event may arrive before the directory exists.
            log_dir = os.path.dirname(self.log_path)
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)
            with open(self.log_path, "a") as f:
                f.write(json.dumps(record) + "\n")


In [27]:
# Early stopping: patience of 2 evaluations with a 0.01 improvement threshold
# on the metric chosen via metric_for_best_model ('eval_loss').
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01))
# Mirror every Trainer log record into a JSON-lines file inside the output directory.
trainer.add_callback(LoggingCallback("sample_HingMBert_trainer/log.jsonl"))

In [28]:
# Fine-tune Hing-MBERT; validation runs at the end of every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted,Recall Micro,F1 Micro,Recall Macro,F1 Macro,Recall Positive,F1 Positive,Recall Negative,F1 Negative
1,No log,0.68992,0.450192,0.686927,0.450192,0.448745,0.450192,0.450192,0.569118,0.450173,0.832168,0.453333,0.306069,0.447013
2,No log,0.673161,0.714559,0.696311,0.714559,0.702817,0.714559,0.714559,0.607478,0.61342,0.370629,0.415686,0.844327,0.811153
3,No log,0.578643,0.704981,0.738882,0.704981,0.716207,0.704981,0.704981,0.68144,0.66101,0.629371,0.538922,0.733509,0.783099
4,No log,0.575555,0.664751,0.746379,0.664751,0.682673,0.664751,0.664751,0.686394,0.63995,0.734266,0.545455,0.638522,0.734446
5,No log,0.576984,0.749042,0.769543,0.749042,0.756311,0.749042,0.749042,0.720492,0.704326,0.657343,0.589342,0.783641,0.81931


TrainOutput(global_step=125, training_loss=0.58895361328125, metrics={'train_runtime': 222.0862, 'train_samples_per_second': 70.468, 'train_steps_per_second': 1.126, 'train_loss': 0.58895361328125, 'epoch': 5.0})

In [29]:
# Evaluate on the validation split that was passed to the trainer as eval_dataset.
mbert_results = trainer.evaluate()

In [30]:
# Show the validation metrics dictionary.
print(mbert_results)

{'eval_loss': 0.5755548477172852, 'eval_accuracy': 0.6647509578544061, 'eval_precision_weighted': 0.7463788127581232, 'eval_recall_weighted': 0.6647509578544061, 'eval_f1_weighted': 0.682672573677754, 'eval_recall_micro': 0.6647509578544061, 'eval_f1_micro': 0.6647509578544061, 'eval_recall_macro': 0.6863940808531838, 'eval_f1_macro': 0.6399503379776521, 'eval_recall_positive': 0.7342657342657343, 'eval_f1_positive': 0.5454545454545454, 'eval_recall_negative': 0.6385224274406333, 'eval_f1_negative': 0.7344461305007588, 'eval_runtime': 2.6212, 'eval_samples_per_second': 199.145, 'eval_steps_per_second': 12.59, 'epoch': 5.0}


# **Hing-MBERT Test results**

In [31]:
# Run the fine-tuned model on the held-out test split; the returned object
# carries the metrics printed in the next cell.
test_results_mbert = trainer.predict(mbert_dataset['test'])

In [32]:
# Show the test-set metrics (keys are prefixed with "test_").
print(test_results_mbert.metrics)

{'test_loss': 0.5324915647506714, 'test_accuracy': 0.7093690248565966, 'test_precision_weighted': 0.7973572704237559, 'test_recall_weighted': 0.7093690248565966, 'test_f1_weighted': 0.7251108558216206, 'test_recall_micro': 0.7093690248565966, 'test_f1_micro': 0.7093690248565966, 'test_recall_macro': 0.7498435774751564, 'test_f1_macro': 0.6899144979092554, 'test_recall_positive': 0.8391608391608392, 'test_f1_positive': 0.6122448979591837, 'test_recall_negative': 0.6605263157894737, 'test_f1_negative': 0.7675840978593271, 'test_runtime': 2.3734, 'test_samples_per_second': 220.355, 'test_steps_per_second': 13.904}


# **Hing BERT Training**

In [33]:
## Hing-BERT: tokenize the already-loaded `dataset` with the matching tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")

# Fixed-length (74 token) encoding of every split, padded / truncated as needed.
bert_dataset = dataset.map(
    lambda example: bert_tokenizer(
        example['clean_text'],
        max_length=74,
        padding='max_length',
        truncation=True,
    ),
    batched=True,
    batch_size=64,
)

# Drop the raw text, expose the target as "labels", and return torch tensors.
bert_dataset = (
    bert_dataset
    .remove_columns(["clean_text"])
    .rename_column("classification_binary", "labels")
)
bert_dataset.set_format("torch")

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/1565 [00:00<?, ? examples/s]

Map:   0%|          | 0/522 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

In [38]:
## model training
# num_labels is stated explicitly, matching the other model loads in this
# notebook, so the size of the classification head does not depend on the
# checkpoint's default configuration.
bert_model = AutoModelForSequenceClassification.from_pretrained("l3cube-pune/hing-bert", num_labels=2)

arguments = TrainingArguments(
    output_dir="sample_Hing_Bert_trainer",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    optim = 'adamw_torch',
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=5e-5,
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  # Define the metric for early stopping
    greater_is_better=False,  # Set to False because we want to minimize the loss
    save_total_limit=1,
    seed=224
)

bert_trainer = WeightedLossTrainer(
    model=bert_model,
    args=arguments,
    train_dataset=bert_dataset['train'],
    eval_dataset=bert_dataset['val'], # change to test when you do your final evaluation!
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics
)

# Early stopping on eval_loss (patience 2, threshold 0.01) plus JSON-lines logging.
bert_trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01))
bert_trainer.add_callback(LoggingCallback("sample_Hing_Bert_trainer/log.jsonl"))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Fine-tune Hing-BERT; validation runs at the end of every epoch.
bert_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted,Recall Micro,F1 Micro,Recall Macro,F1 Macro,Recall Positive,F1 Positive,Recall Negative,F1 Negative
1,No log,0.693618,0.413793,0.610138,0.413793,0.419579,0.413793,0.413793,0.507039,0.413483,0.713287,0.4,0.300792,0.426966
2,No log,0.689493,0.659004,0.657518,0.659004,0.658252,0.659004,0.659004,0.56922,0.569514,0.370629,0.373239,0.76781,0.765789
3,No log,0.676104,0.505747,0.696493,0.505747,0.519467,0.505747,0.505747,0.594313,0.503114,0.79021,0.466942,0.398417,0.539286
4,No log,0.611156,0.611111,0.714373,0.611111,0.632033,0.611111,0.611111,0.640746,0.590541,0.706294,0.498765,0.575198,0.682316
5,No log,0.62401,0.735632,0.747905,0.735632,0.740689,0.735632,0.735632,0.689485,0.681014,0.587413,0.54902,0.791557,0.813008
6,No log,0.651005,0.712644,0.746138,0.712644,0.723579,0.712644,0.712644,0.691071,0.669815,0.643357,0.550898,0.738786,0.788732


TrainOutput(global_step=150, training_loss=0.5976695760091146, metrics={'train_runtime': 203.3995, 'train_samples_per_second': 76.942, 'train_steps_per_second': 1.229, 'train_loss': 0.5976695760091146, 'epoch': 6.0})

In [40]:
# Evaluate Hing-BERT on the validation split and show the metrics.
bert_eval_results = bert_trainer.evaluate()
print(bert_eval_results)

{'eval_loss': 0.6111559867858887, 'eval_accuracy': 0.6111111111111112, 'eval_precision_weighted': 0.7143733281137861, 'eval_recall_weighted': 0.6111111111111112, 'eval_f1_weighted': 0.6320330763731911, 'eval_recall_micro': 0.6111111111111112, 'eval_f1_micro': 0.6111111111111112, 'eval_recall_macro': 0.6407457977378821, 'eval_f1_macro': 0.5905407755173013, 'eval_recall_positive': 0.7062937062937062, 'eval_f1_positive': 0.4987654320987654, 'eval_recall_negative': 0.575197889182058, 'eval_f1_negative': 0.6823161189358373, 'eval_runtime': 2.7179, 'eval_samples_per_second': 192.061, 'eval_steps_per_second': 12.142, 'epoch': 6.0}


# **Test results for Hing-BERT**

In [41]:
# Score Hing-BERT on the held-out test split and show the metrics.
bert_results = bert_trainer.predict(bert_dataset['test'])
print(bert_results.metrics)

{'test_loss': 0.6007221341133118, 'test_accuracy': 0.6424474187380497, 'test_precision_weighted': 0.7525145613737633, 'test_recall_weighted': 0.6424474187380497, 'test_f1_weighted': 0.6616275867081143, 'test_recall_micro': 0.6424474187380497, 'test_f1_micro': 0.6424474187380497, 'test_recall_macro': 0.6863452337136547, 'test_f1_macro': 0.6252619882368717, 'test_recall_positive': 0.7832167832167832, 'test_f1_positive': 0.5450121654501217, 'test_recall_negative': 0.5894736842105263, 'test_f1_negative': 0.7055118110236219, 'test_runtime': 2.5326, 'test_samples_per_second': 206.507, 'test_steps_per_second': 13.03}


# **Muril cased training**

In [34]:
# --- MuRIL: tokenizer and fixed-length (74 token) encoding of every split ---
muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

muril_dataset = dataset.map(
    lambda example: muril_tokenizer(
        example['clean_text'],
        max_length=74,
        padding='max_length',
        truncation=True,
    ),
    batched=True,
    batch_size=16,
)
# Drop the raw text, expose the target as "labels", and return torch tensors.
muril_dataset = (
    muril_dataset
    .remove_columns(["clean_text"])
    .rename_column("classification_binary", "labels")
)
muril_dataset.set_format("torch")

# --- MuRIL: model with a 2-class classification head ---
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased",num_labels=2)

# --- MuRIL: training configuration ---
muril_training_options = dict(
    output_dir="sample_Hing_muril_trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    evaluation_strategy="epoch",        # validate at the end of each epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    warmup_steps=500,                   # scheduler warmup steps
    weight_decay=0.001,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  # metric used for best-model / early stopping
    greater_is_better=False,            # lower loss is better
    save_total_limit=1,
    seed=224,
)
arguments = TrainingArguments(**muril_training_options)

# --- MuRIL: class-weighted trainer, validated on the 'val' split ---
muril_trainer = WeightedLossTrainer(
    model=muril_model,
    args=arguments,
    train_dataset=muril_dataset['train'],
    eval_dataset=muril_dataset['val'],
    tokenizer=muril_tokenizer,
    compute_metrics=compute_metrics,
)

# Early stopping first, then JSON-lines logging (same registration order as before).
for muril_callback in (
    EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01),
    LoggingCallback("sample_Hing_muril_trainer/log.jsonl"),
):
    muril_trainer.add_callback(muril_callback)

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Map:   0%|          | 0/1565 [00:00<?, ? examples/s]

Map:   0%|          | 0/522 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Fine-tune MuRIL; validation runs at the end of every epoch.
muril_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted,Recall Micro,F1 Micro,Recall Macro,F1 Macro,Recall Positive,F1 Positive,Recall Negative,F1 Negative
1,No log,0.692757,0.726054,0.527154,0.726054,0.61082,0.726054,0.726054,0.5,0.420644,0.0,0.0,1.0,0.841287
2,No log,0.690142,0.727969,0.67883,0.727969,0.63743,0.727969,0.727969,0.518737,0.470497,0.055944,0.101266,0.98153,0.839729
3,No log,0.646728,0.697318,0.743066,0.697318,0.7108,0.697318,0.697318,0.687049,0.659483,0.664336,0.545977,0.709763,0.772989
4,No log,0.618796,0.672414,0.749764,0.672414,0.689785,0.672414,0.672414,0.691671,0.646631,0.734266,0.551181,0.649077,0.742081
5,No log,0.603219,0.750958,0.756608,0.750958,0.7535,0.750958,0.750958,0.697862,0.693507,0.58042,0.560811,0.815303,0.826203
6,0.646200,0.686077,0.768199,0.749581,0.768199,0.743265,0.768199,0.768199,0.637886,0.652733,0.34965,0.452489,0.926121,0.852977
7,0.646200,0.676521,0.727969,0.730365,0.727969,0.729124,0.727969,0.727969,0.662435,0.661008,0.517483,0.510345,0.807388,0.811671


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=686, training_loss=0.601743992841626, metrics={'train_runtime': 455.0917, 'train_samples_per_second': 34.389, 'train_steps_per_second': 2.153, 'train_loss': 0.601743992841626, 'epoch': 7.0})

In [36]:
# Evaluate MuRIL on the validation split and show the metrics.
muril_eval_results = muril_trainer.evaluate()
print(muril_eval_results)

{'eval_loss': 0.6032189130783081, 'eval_accuracy': 0.7509578544061303, 'eval_precision_weighted': 0.7566075615680655, 'eval_recall_weighted': 0.7509578544061303, 'eval_f1_weighted': 0.7534999271814687, 'eval_recall_micro': 0.7509578544061303, 'eval_f1_micro': 0.7509578544061303, 'eval_recall_macro': 0.697861505249368, 'eval_f1_macro': 0.6935070096834803, 'eval_recall_positive': 0.5804195804195804, 'eval_f1_positive': 0.5608108108108109, 'eval_recall_negative': 0.8153034300791556, 'eval_f1_negative': 0.8262032085561497, 'eval_runtime': 2.3246, 'eval_samples_per_second': 224.554, 'eval_steps_per_second': 14.196, 'epoch': 7.0}


In [37]:
# Score MuRIL on the held-out test split and show the metrics.
muril_results = muril_trainer.predict(muril_dataset['test'])
print(muril_results.metrics)

{'test_loss': 0.5946338772773743, 'test_accuracy': 0.7609942638623327, 'test_precision_weighted': 0.7625806067926714, 'test_recall_weighted': 0.7609942638623327, 'test_f1_weighted': 0.7617611867768087, 'test_recall_micro': 0.7609942638623327, 'test_f1_micro': 0.7609942638623327, 'test_recall_macro': 0.7025027603974973, 'test_f1_macro': 0.7011742765332103, 'test_recall_positive': 0.5734265734265734, 'test_f1_positive': 0.5674740484429066, 'test_recall_negative': 0.8315789473684211, 'test_f1_negative': 0.8348745046235139, 'test_runtime': 2.327, 'test_samples_per_second': 224.752, 'test_steps_per_second': 14.181}


In [23]:
# --- XLM-RoBERTa: tokenizer and fixed-length (74 token) encoding of every split ---
xlmr_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

xlmr_dataset = dataset.map(
    lambda example: xlmr_tokenizer(
        example['clean_text'],
        max_length=74,
        padding='max_length',
        truncation=True,
    ),
    batched=True,
    batch_size=16,
)
# Drop the raw text, expose the target as "labels", and return torch tensors.
xlmr_dataset = (
    xlmr_dataset
    .remove_columns(["clean_text"])
    .rename_column("classification_binary", "labels")
)
xlmr_dataset.set_format("torch")

# --- XLM-RoBERTa: model with a 2-class classification head ---
xlmr_model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base",num_labels=2)

# --- XLM-RoBERTa: training configuration ---
xlmr_training_options = dict(
    output_dir="sample_Hing_xlmr_trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    evaluation_strategy="epoch",        # validate at the end of each epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    warmup_steps=500,                   # scheduler warmup steps
    weight_decay=0.001,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  # metric used for best-model / early stopping
    greater_is_better=False,            # lower loss is better
    save_total_limit=1,
    seed=224,
)
arguments = TrainingArguments(**xlmr_training_options)

# --- XLM-RoBERTa: class-weighted trainer, validated on the 'val' split ---
xlmr_trainer = WeightedLossTrainer(
    model=xlmr_model,
    args=arguments,
    train_dataset=xlmr_dataset['train'],
    eval_dataset=xlmr_dataset['val'],
    tokenizer=xlmr_tokenizer,
    compute_metrics=compute_metrics,
)

# Early stopping first, then JSON-lines logging (same registration order as before).
for xlmr_callback in (
    EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01),
    LoggingCallback("sample_Hing_xlmr_trainer/log.jsonl"),
):
    xlmr_trainer.add_callback(xlmr_callback)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/1565 [00:00<?, ? examples/s]

Map:   0%|          | 0/522 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# torch is already imported in the imports cell; re-importing here is harmless.
import torch
# Release cached, currently unused GPU memory before training the next model.
torch.cuda.empty_cache()

In [25]:
# Fine-tune XLM-RoBERTa; validation runs at the end of every epoch.
xlmr_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted,Recall Micro,F1 Micro,Recall Macro,F1 Macro,Recall Positive,F1 Positive,Recall Negative,F1 Negative
1,No log,0.694281,0.273946,0.075047,0.273946,0.117818,0.273946,0.273946,0.5,0.215038,1.0,0.430075,0.0,0.0
2,No log,0.691026,0.659004,0.656028,0.659004,0.65748,0.659004,0.659004,0.567042,0.567599,0.363636,0.368794,0.770449,0.766404
3,No log,0.66865,0.747126,0.724603,0.747126,0.727146,0.747126,0.747126,0.625551,0.636467,0.356643,0.435897,0.894459,0.837037
4,No log,0.647938,0.653257,0.719805,0.653257,0.670935,0.653257,0.653257,0.654529,0.620668,0.657343,0.509485,0.651715,0.731852
5,No log,0.661928,0.699234,0.722029,0.699234,0.707946,0.699234,0.699234,0.657887,0.645643,0.566434,0.507837,0.74934,0.783448
6,0.679100,0.646642,0.641762,0.738492,0.641762,0.66111,0.641762,0.641762,0.67274,0.620703,0.741259,0.531328,0.604222,0.710078


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=588, training_loss=0.6669375750483298, metrics={'train_runtime': 399.216, 'train_samples_per_second': 39.202, 'train_steps_per_second': 2.455, 'train_loss': 0.6669375750483298, 'epoch': 6.0})

In [26]:
# Evaluate XLM-RoBERTa on the validation split and show the metrics.
xlmr_eval_results = xlmr_trainer.evaluate()
print(xlmr_eval_results)

{'eval_loss': 0.6466418504714966, 'eval_accuracy': 0.6417624521072797, 'eval_precision_weighted': 0.7384921310099962, 'eval_recall_weighted': 0.6417624521072797, 'eval_f1_weighted': 0.661109827049134, 'eval_recall_micro': 0.6417624521072797, 'eval_f1_micro': 0.6417624521072797, 'eval_recall_macro': 0.6727401885713231, 'eval_f1_macro': 0.6207029200909249, 'eval_recall_positive': 0.7412587412587412, 'eval_f1_positive': 0.531328320802005, 'eval_recall_negative': 0.604221635883905, 'eval_f1_negative': 0.7100775193798449, 'eval_runtime': 2.2545, 'eval_samples_per_second': 231.54, 'eval_steps_per_second': 14.638, 'epoch': 6.0}


In [27]:
# Score XLM-RoBERTa on the held-out test split and show the metrics.
xlmr_results = xlmr_trainer.predict(xlmr_dataset['test'])
print(xlmr_results.metrics)

{'test_loss': 0.5917826294898987, 'test_accuracy': 0.6673040152963671, 'test_precision_weighted': 0.7561091948427616, 'test_recall_weighted': 0.6673040152963671, 'test_f1_weighted': 0.6853242691642236, 'test_recall_micro': 0.6673040152963671, 'test_f1_micro': 0.6673040152963671, 'test_recall_macro': 0.6969083548030917, 'test_f1_macro': 0.6450337015540161, 'test_recall_positive': 0.7622377622377622, 'test_f1_positive': 0.5561224489795918, 'test_recall_negative': 0.631578947368421, 'test_f1_negative': 0.7339449541284403, 'test_runtime': 2.2418, 'test_samples_per_second': 233.294, 'test_steps_per_second': 14.72}
