# Installations

In [None]:
!pip install transformers datasets scikit-learn evaluate tqdm kaggle fairscale huggingface_hub -Uqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ***Fine-Tuning***

# Step 1: Setup
Load secrets into environment variables, define the configuration, and download the dataset and model artifacts.

In [None]:
from google.colab import userdata
import os

from huggingface_hub import login
import kagglehub

from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import numpy as np

import torch

from transformers import (
    TrainerCallback,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

import evaluate


from functools import partial

import random

In [None]:
# Expose the Colab-stored secrets as environment variables.
os.environ.update({
    "KAGGLE_KEY": userdata.get('KAGGLE_KEY'),
    "KAGGLE_USERNAME": userdata.get('KAGGLE_USR'),
    "HF_TOKEN": userdata.get('HF_TOKEN'),
})

# Log in to the Hugging Face Hub, then download the wine-review dataset;
# data_path is the location returned by kagglehub.
login(token=os.environ.get('HF_TOKEN'))
data_path = kagglehub.dataset_download("zynicide/wine-reviews")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Downloading from https://www.kaggle.com/api/v1/datasets/download/zynicide/wine-reviews?dataset_version_number=4...


100%|██████████| 50.9M/50.9M [00:03<00:00, 16.1MB/s]

Extracting files...





### Define Configuration

In [None]:
# Hyperparameters. Each value is kept as a module-level name and also
# collected in `params` under the same key.
train_bsz = val_bsz = 32  # batch sizes for training and validation
lr = 1e-4                 # learning rate (identical to the former literal 10e-5)
betas = (0.8, 0.8)        # Adam (beta1, beta2); disabled alternative: (0.9, 0.98)
n_epochs = 5              # training rounds
eps = 1e-6                # Adam epsilon, to prevent division by 0
# Fraction of training steps over which the learning rate ramps up from 0 to
# the target value; stabilizes early training by avoiding large gradient
# updates on randomly initialized weights.
warm_up_ratio = 0.15
max_grad_norm = 1.0       # gradient clipping bound, to prevent exploding gradients

params = dict(
    train_bsz=train_bsz,
    val_bsz=val_bsz,
    lr=lr,
    betas=betas,
    n_epochs=n_epochs,
    eps=eps,
    warm_up_ratio=warm_up_ratio,
    max_grad_norm=max_grad_norm,
)

class CFG:
  """Static settings for the wine-review fine-tuning run (paths, labels, model ids)."""

  # The two review CSVs inside the downloaded dataset directory
  file_1_path = f"{data_path}/winemag-data_first150k.csv"
  file_2_path = f"{data_path}/winemag-data-130k-v2.csv"

  # Columns read from both CSVs
  columns = [
    'country', 'description', 'designation', 'points', 'price',
    'province', 'region_1', 'region_2', 'variety', 'winery',
    ]

  # Seed used for the train/test split
  random_state = 1

  # Class names <-> class ids; id2label is the inverse of label2id
  label2id = {'bad': 0, 'average': 1, 'good': 2, 'excellent': 3}
  id2label = {class_id: class_name for class_name, class_id in label2id.items()}
  num_labels = len(id2label)

  # Pretrained checkpoint and the Hub repository the fine-tuned model is pushed to
  checkpoint = 'answerdotai/ModernBERT-base'
  hf_repository_id = 'ModernBERT_wine_quality_reviews_ft'

  # Hyperparameter dict defined at module level
  params = params

  # Local output locations; output_dir is <output_data_path>/<hf_repository_id>
  output_data_path = 'output_data'
  output_dir = os.path.join(output_data_path, hf_repository_id)

In [None]:
# Instantiate the configuration; later cells read their settings from `config`
config = CFG()

## Define some functions

In [None]:
def preprocessing(batch, tokenizer):
    """Tokenize a batch of texts and pair the token ids with their labels.

    Args:
        batch: Mapping with a 'text' entry (handed to the tokenizer as-is)
            and a 'label' entry (any iterable of labels).
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=False)``;
            its result must expose an ``input_ids`` attribute.

    Returns:
        dict with 'input_ids' (from the tokenizer) and 'label' (the labels
        copied into a new list).
    """
    labels = list(batch['label'])
    token_dict = tokenizer(batch['text'], truncation=False)
    return {'input_ids': token_dict.input_ids, 'label': labels}


def preprocessing_augmented(batch, tokenizer, drop_prob=0.01):
    """Variant of `preprocessing` that randomly drops words before tokenizing.

    Not used by default. It previously lived in a bare string literal (which the
    cell echoed as output); it now has its own name so it can be swapped in
    without shadowing `preprocessing`.

    Args:
        batch: Mapping with 'text' (iterable of strings) and 'label' entries.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=False)``;
            its result must expose an ``input_ids`` attribute.
        drop_prob: A word is kept only when ``random.random() > drop_prob``.

    Returns:
        dict with 'input_ids' and 'label', shaped like `preprocessing`'s result.
    """
    # Texts are split on whitespace and re-joined with single spaces, so
    # whitespace is normalized even when no word is dropped.
    augmented_texts = [
        ' '.join(word for word in text.split() if random.random() > drop_prob)
        for text in batch['text']
    ]

    labels = list(batch['label'])
    token_dict = tokenizer(augmented_texts, truncation=False)
    return {'input_ids': token_dict.input_ids, 'label': labels}

"\n#augmentation\ndef preprocessing(batch, tokenizer, drop_prob=0.01):\n    texts = [x for x in batch['text']]\n\n    # Apply random word dropping during training\n    augmented_texts = []\n    for text in texts:\n        words = text.split()\n        kept_words = [word for word in words if random.random() > drop_prob]\n        augmented_texts.append(' '.join(kept_words))\n\n    labels = [x for x in batch['label']]\n    token_dict = tokenizer(augmented_texts, truncation=False)\n    return dict({'input_ids': token_dict.input_ids, 'label':labels})\n\n"

In [None]:
def bin_label(x):
    """Bin a numeric score into a class id.

    Bins (upper bounds inclusive): x <= 85 -> 0, 85 < x <= 88 -> 1,
    88 < x <= 92 -> 2, x > 92 -> 3.

    Returns:
        The class id, or None when no comparison holds (e.g. for NaN).
    """
    for upper_bound, class_id in ((85, 0), (88, 1), (92, 2)):
        if x <= upper_bound:
            return class_id
    # Explicit comparison (not a bare fallback) so NaN still yields None.
    return 3 if x > 92 else None

In [None]:
class MetricsCallback(TrainerCallback):
  """Collects the Trainer's log entries, split into training and evaluation records.

  Entries containing a "loss" key go to ``training_history["train"]``; other
  entries containing "eval_loss" go to ``training_history["eval"]``. Anything
  else is ignored.
  """

  def __init__(self):
    self.training_history = {"train": [], "eval": []}

  def on_log(self, args, state, control, logs=None, **kwargs):
    """Record a snapshot of `logs` in the matching history list."""
    if logs is None:
      return

    # Store a copy, not the live dict: the same object is shared with the
    # Trainer and other callbacks, and keys removed from it later would also
    # vanish from the stored history (presumably why in-training eval rows
    # lacked runtime/epoch fields - TODO confirm against the installed version).
    snapshot = dict(logs)
    if "loss" in snapshot:  # Training logs
      self.training_history["train"].append(snapshot)
    elif "eval_loss" in snapshot:  # Evaluation logs
      self.training_history["eval"].append(snapshot)

In [None]:
# Metric objects keyed by name, so each one is loaded only once per kernel.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-element sequence ``(logits, labels)``. The predicted
            class is the argmax of the logits over the last axis.

    Returns:
        dict with keys "accuracy" and "f1" (F1 averaged with 'weighted').
    """
    logits, labels = eval_pred
    # The argmax of the logits equals the argmax of their softmax, so no
    # probabilities are needed (the unused softmax was removed).
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average='weighted'
    )["f1"]

    return {"accuracy": accuracy, "f1": f1}

## Download model artifacts

In [None]:
#Define the tokenizer and data collator
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

# Step 2: Prepare the data

### Preprocessing

In [None]:
# Read both review exports, restricted to the configured columns.
df1, df2 = (
    pd.read_csv(csv_path, usecols=config.columns)
    for csv_path in (config.file_1_path, config.file_2_path)
)

# Stack the two files, drop exact duplicate rows, derive the class label from
# the points and the model input text from country / variety / description,
# then keep only unique, complete (text, label) pairs.
# Disabled alternative for the text column: the description alone.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates()
    .assign(
        label=lambda frame: frame['points'].apply(bin_label),
        text=lambda frame: (
            'Country: ' + frame['country'] + '; '
            + 'Variety: ' + frame['variety'] + '; '
            + 'Description: ' + frame['description']
        ),
    )
    .loc[:, ['text', 'label']]
    .drop_duplicates()
    .dropna()
)

In [None]:
# Peek at the first rows of the prepared (text, label) frame
df.head()

Unnamed: 0,text,label
0,Country: US; Variety: Cabernet Sauvignon; Desc...,3
1,Country: Spain; Variety: Tinta de Toro; Descri...,3
2,Country: US; Variety: Sauvignon Blanc; Descrip...,3
3,Country: US; Variety: Pinot Noir; Description:...,3
4,Country: France; Variety: Provence red blend; ...,3


In [None]:
# Row count, dtypes and non-null counts after deduplication and dropna
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169401 entries, 0 to 280900
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    169401 non-null  object
 1   label   169401 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.9+ MB


In [None]:
# Share of each label, as a percentage of all rows
df['label'].value_counts().div(len(df)).mul(100)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,35.563544
1,35.414195
0,19.645693
3,9.376568


#### Sanity check

In [None]:
# The number of distinct labels in the data must equal the size of the
# configured id2label mapping.
num_labels = df['label'].nunique(dropna=False)
assert num_labels == len(config.id2label), "The number of labels does not match the number of unique labels in the dataset"

### Splitting

In [None]:
# Bind the tokenizer so the mapper only needs the batch argument.
preprocessing = partial(preprocessing, tokenizer=tokenizer)

# DataFrame -> Dataset, tokenize in batches, then hold out 20% as the test
# split using the configured seed.
dataset = (
    Dataset.from_pandas(df)
    .map(preprocessing, batched=True)
    .train_test_split(test_size=0.2, seed=config.random_state)
)

Map:   0%|          | 0/169401 [00:00<?, ? examples/s]

# Step 3: Setup Training Specs

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# the configured labels; the id<->label mappings are passed to from_pretrained.
model = AutoModelForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=config.num_labels,
    id2label=config.id2label,
    label2id=config.label2id,
    )

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Log, evaluate and checkpoint on the same step cadence. load_best_model_at_end
# needs checkpoints to line up with evaluations, so the value is shared instead
# of relying on eval_steps falling back to logging_steps.
steps_per_cycle = 350

training_args = TrainingArguments(
     # Basic training parameters
     output_dir=config.output_dir,  # local storage for checkpoints before pushing to the Hub
     learning_rate=config.params['lr'],
     per_device_train_batch_size=config.params['train_bsz'],
     per_device_eval_batch_size=config.params['val_bsz'],
     num_train_epochs=config.params['n_epochs'],
     adam_beta1=config.params['betas'][0],
     adam_beta2=config.params['betas'][1],
     adam_epsilon=config.params['eps'],
     warmup_ratio=config.params['warm_up_ratio'],
     #max_grad_norm=config.params['max_grad_norm'],  # disabled: left at the library default

     # Logging and evaluation parameters
     logging_strategy="steps",  # Log based on steps
     logging_steps=steps_per_cycle,
     eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
     eval_steps=steps_per_cycle,
     save_steps=steps_per_cycle,

     # Hub integration parameters
     report_to="tensorboard",  # Enable TensorBoard reporting
     push_to_hub=True,
     hub_strategy="every_save",  # Push whenever we save
     hub_model_id=config.hf_repository_id,  # Where to push the model

     # Best-model tracking
     metric_for_best_model="eval_loss",  # Track best model based on eval loss
     load_best_model_at_end=True,  # Load best model when training ends
     greater_is_better=False,  # For loss, lower is better
     )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )

metrics_callback = MetricsCallback()
early_stopping_callback = EarlyStoppingCallback(
    # Stop after this many consecutive evaluations without sufficient improvement
    # (one evaluation every `steps_per_cycle` steps).
    early_stopping_patience=10,
    # Minimum improvement of metric_for_best_model that counts as progress;
    # an absolute amount, not a percentage.
    early_stopping_threshold=0.0025
)

trainer.add_callback(metrics_callback)
trainer.add_callback(early_stopping_callback)



# Step 4: Run fine-tuning, push to hub & analyze results

In [None]:
# Run fine-tuning with the arguments and callbacks configured above
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1
350,1.1457,0.989392,0.546117,0.53054
700,0.9441,1.12132,0.497683,0.482693
1050,0.8589,0.82324,0.629674,0.627701
1400,0.8131,0.826841,0.617721,0.595636
1750,0.7837,0.747377,0.667926,0.666325
2100,0.7726,0.800848,0.639651,0.626907
2450,0.7576,0.75706,0.653316,0.654979
2800,0.7528,0.741444,0.666568,0.659804
3150,0.7588,0.762749,0.658806,0.639685
3500,0.7416,0.725905,0.673593,0.673897


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

TrainOutput(global_step=11900, training_loss=0.6482569590336135, metrics={'train_runtime': 4777.9112, 'train_samples_per_second': 141.819, 'train_steps_per_second': 4.432, 'total_flos': 2.890786856928307e+16, 'train_loss': 0.6482569590336135, 'epoch': 2.809917355371901})

In [None]:
# Evaluate the model currently held by the trainer on the eval dataset
trainer.evaluate()

{'eval_loss': 0.6671051979064941,
 'eval_accuracy': 0.701927333903958,
 'eval_f1': 0.7023830463706187,
 'eval_runtime': 68.1257,
 'eval_samples_per_second': 497.331,
 'eval_steps_per_second': 15.545,
 'epoch': 2.809917355371901}

In [None]:
# Generate the model card, then push the trainer's model to the Hugging Face Hub
trainer.create_model_card()
trainer.push_to_hub()

events.out.tfevents.1737727152.236db8c19ecc.2035.1:   0%|          | 0.00/457 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/scbtm/ModernBERT_wine_quality_reviews_ft/commit/2451333c3e2c0b6dbd3bc1a0c3fbcfc6b081986c', commit_message='End of training', commit_description='', oid='2451333c3e2c0b6dbd3bc1a0c3fbcfc6b081986c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/scbtm/ModernBERT_wine_quality_reviews_ft', endpoint='https://huggingface.co', repo_type='model', repo_id='scbtm/ModernBERT_wine_quality_reviews_ft'), pr_revision=None, pr_num=None)

In [None]:
# Training log entries get a "train_" prefix; evaluation entries keep their own
# keys. The two frames are placed side by side, aligned on row position.
train_history_df = pd.DataFrame(
    metrics_callback.training_history["train"]
).add_prefix("train_")
eval_history_df = pd.DataFrame(metrics_callback.training_history["eval"])
train_res_df = pd.concat([train_history_df, eval_history_df], axis="columns")

In [None]:
# Display the combined training / evaluation history
train_res_df

Unnamed: 0,train_loss,train_grad_norm,train_learning_rate,train_epoch,eval_loss,eval_accuracy,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.1457,16.173458,1.1e-05,0.082645,0.989392,0.546117,0.53054,,,,
1,0.9441,12.347157,2.2e-05,0.165289,1.12132,0.497683,0.482693,,,,
2,0.8589,6.501156,3.3e-05,0.247934,0.82324,0.629674,0.627701,,,,
3,0.8131,3.530661,4.4e-05,0.330579,0.826841,0.617721,0.595636,,,,
4,0.7837,4.070856,5.5e-05,0.413223,0.747377,0.667926,0.666325,,,,
5,0.7726,5.176357,6.6e-05,0.495868,0.800848,0.639651,0.626907,,,,
6,0.7576,2.207669,7.7e-05,0.578512,0.75706,0.653316,0.654979,,,,
7,0.7528,1.95484,8.8e-05,0.661157,0.741444,0.666568,0.659804,,,,
8,0.7588,2.082675,9.9e-05,0.743802,0.762749,0.658806,0.639685,,,,
9,0.7416,4.913263,9.8e-05,0.826446,0.725905,0.673593,0.673897,,,,
