# Installations

In [1]:
!pip install transformers datasets scikit-learn evaluate tqdm kaggle fairscale huggingface_hub -Uqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ***Fine-Tuning***

# Step 1: Setup
Load environment variables and secrets, set the configuration, and download the data and model artifacts.

In [2]:
from google.colab import userdata
import os

from huggingface_hub import login
import kagglehub

from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import numpy as np

import torch

from transformers import (
    TrainerCallback,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

import evaluate


from functools import partial

In [3]:
# Copy the Colab-stored secrets (Kaggle credentials and the Hugging Face
# token) into process environment variables.
for env_var, secret_name in (
    ("KAGGLE_KEY", "KAGGLE_KEY"),
    ("KAGGLE_USERNAME", "KAGGLE_USR"),
    ("HF_TOKEN", "HF_TOKEN"),
):
    os.environ[env_var] = userdata.get(secret_name)

In [4]:
# Authenticate with the Hugging Face Hub using the token read from the
# HF_TOKEN environment variable.
login(token = os.getenv('HF_TOKEN'))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Fetch the wine-reviews dataset through kagglehub and report the local
# directory it was stored in.
data_path = kagglehub.dataset_download("zynicide/wine-reviews")
print(f"Path to dataset files: {data_path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/zynicide/wine-reviews?dataset_version_number=4...


100%|██████████| 50.9M/50.9M [00:03<00:00, 15.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/zynicide/wine-reviews/versions/4


### Define Configuration

In [22]:
# Optimisation hyper-parameters. They are kept as module-level names and
# mirrored into `params` so they can be handed around as a single dict.
train_bsz = val_bsz = 32
lr = 8e-5
betas = (0.9, 0.98)
n_epochs = 3
eps = 1e-6
# Weight decay is currently not configured:
#wd = 8e-6

params = {
    'train_bsz': train_bsz,
    'val_bsz': val_bsz,
    'lr': lr,
    'betas': betas,
    'n_epochs': n_epochs,
    'eps': eps,
    #'wd': wd,
}

class CFG:
  """Static settings for the wine-review fine-tuning run.

  Everything is a class attribute. The CSV paths are built from the
  module-level `data_path` and the hyper-parameters are taken from the
  module-level `params`, both read when the class body executes.
  """

  # Seed used for the data split
  random_state = 1

  # The two CSV files inside the downloaded dataset directory
  file_1_path = f"{data_path}/winemag-data_first150k.csv"
  file_2_path = f"{data_path}/winemag-data-130k-v2.csv"

  # Columns read from both CSV files
  columns = [
    'country', 'description', 'designation', 'points', 'price',
    'province', 'region_1', 'region_2', 'variety', 'winery',
  ]

  # Class vocabulary; label2id is the exact inverse of id2label
  id2label = {0: 'bad', 1: 'average', 2: 'good', 3: 'excellent'}
  label2id = dict(zip(id2label.values(), id2label.keys()))
  num_labels = len(id2label)

  # Pretrained checkpoint to fine-tune and the Hub repository to push to
  checkpoint = 'answerdotai/ModernBERT-base'
  hf_repository_id = 'ModernBERT_wine_quality_reviews_ft'

  # Hyper-parameter dict defined at module level
  params = params

  # Local output locations; output_dir is <output_data_path>/<repository id>
  output_data_path = 'output_data'
  output_dir = os.path.join(output_data_path, hf_repository_id)

In [23]:
# All settings are class attributes of CFG; the instance simply exposes them
# as `config.<name>`.
config = CFG()

## Define helper functions

In [8]:
def preprocessing(batch, tokenizer):
    """Tokenize one batch of examples.

    Args:
        batch: Mapping that provides a 'text' sequence and a 'label' sequence.
        tokenizer: Callable invoked on the texts with truncation disabled;
            its result must expose an `input_ids` attribute.

    Returns:
        dict with the token ids under 'input_ids' and the labels, copied
        into a new list, under 'label'. No other tokenizer outputs are kept.
    """
    label_column = list(batch['label'])
    encoded = tokenizer(batch['text'], truncation=False)
    return {'input_ids': encoded.input_ids, 'label': label_column}

In [9]:
def bin_label(x):
    """Map a point score to a class index.

    Bins (upper bounds inclusive): x <= 85 -> 0, 85 < x <= 90 -> 1,
    90 < x <= 95 -> 2, x > 95 -> 3.

    Returns:
        The class index, or None when no comparison holds (e.g. NaN).
    """
    # Walk the inclusive upper bounds in ascending order; the first bound
    # that x does not exceed determines the class.
    for class_index, upper_bound in enumerate((85, 90, 95)):
        if x <= upper_bound:
            return class_index
    if x > 95:
        return 3

In [10]:
class MetricsCallback(TrainerCallback):
  """Trainer callback that records logged metrics in memory.

  Attributes:
    training_history: dict with two lists. "train" collects every log
      entry containing a "loss" key; "eval" collects every other entry
      containing an "eval_loss" key.
  """

  def __init__(self):
    self.training_history = {"train": [], "eval": []}

  def on_log(self, args, state, control, logs=None, **kwargs):
    """Record a snapshot of `logs` under "train" or "eval".

    Entries with neither a "loss" nor an "eval_loss" key are ignored.
    """
    if logs is None:
      return
    # Store a copy, not the dict itself: the caller owns `logs`, and any
    # later mutation of it (e.g. keys being popped) would otherwise
    # silently alter the recorded history.
    if "loss" in logs:  # Training logs
      self.training_history["train"].append(dict(logs))
    elif "eval_loss" in logs:  # Evaluation logs
      self.training_history["eval"].append(dict(logs))

In [11]:
# Metric objects are loaded on first use and then reused, instead of being
# reloaded on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    # Macro averaging weights every class equally regardless of its size.
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average='macro'
    )["f1"]

    return {"accuracy": accuracy, "f1": f1}

## Download auxiliary model artifacts (tokenizer and data collator)

In [12]:
#Define the tokenizer and data collator
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

# Step 2: Prepare the data

### Preprocessing

In [13]:
# Load both CSV files, restricted to the columns listed in the config.
df1 = pd.read_csv(config.file_1_path, usecols=config.columns)
df2 = pd.read_csv(config.file_2_path, usecols=config.columns)

# Build the modelling frame in a single chain. `assign` adds the derived
# columns to a new frame, which avoids the SettingWithCopyWarning raised when
# columns are set directly on the result of `drop_duplicates()`.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates()
    .assign(
        # Class index derived from the point score
        label=lambda d: d['points'].apply(bin_label),
        # Model input: country and variety prepended to the review text
        text=lambda d: (
            'This wine is from ' + d['country'] + ', ' + d['variety'] + ' variety. '
            + 'Description: ' + d['description']
        ),
    )
    .loc[:, ['text', 'label']]
    .drop_duplicates()
    # A missing country, variety or description makes the concatenated text
    # NaN; those rows are removed here.
    .dropna()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['points'].apply(bin_label)


In [14]:
# Preview the first rows of the prepared text/label frame.
df.head()

Unnamed: 0,text,label
0,"This wine is from US, Cabernet Sauvignon varie...",3
1,"This wine is from Spain, Tinta de Toro variety...",3
2,"This wine is from US, Sauvignon Blanc variety....",3
3,"This wine is from US, Pinot Noir variety. Desc...",3
4,"This wine is from France, Provence red blend v...",2


In [15]:
# Check the row count, the dtypes, and that no nulls remain after dropna().
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169402 entries, 0 to 280900
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    169402 non-null  object
 1   label   169402 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.9+ MB


In [16]:
# Number of rows per class index, to show how (im)balanced the labels are.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,94705
2,40152
0,33280
3,1265


#### Sanity check

In [17]:
# The number of distinct labels present in the data must equal the number of
# classes declared in the config.
num_labels = df['label'].nunique(dropna=False)
assert num_labels == len(config.id2label), "The number of labels does not match the number of unique labels in the dataset"

### Splitting

In [18]:
# Hold out 2.5% of the rows as an evaluation set; the split is seeded from
# the config so it is repeatable.
development_df, evaluation_df = train_test_split(
    df,
    test_size=0.025,
    random_state=config.random_state,
)

# Wrap both partitions as `datasets.Dataset` objects.
dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (development_df, evaluation_df)
)

In [19]:
# Bind the tokenizer so the function can be called with the batch alone,
# which is how `map` invokes it below.
preprocessing = partial(preprocessing, tokenizer=tokenizer)

dataset = dataset.map(preprocessing, batched=True)
eval_dataset = eval_dataset.map(preprocessing, batched=True)  # Same tokenization for the held-out set

# Carve a validation split out of the development data. The split is seeded
# with the configured random state so it is reproducible across runs, like
# the hold-out split made with scikit-learn.
dataset = dataset.train_test_split(test_size=0.025, seed=config.random_state)

Map:   0%|          | 0/165166 [00:00<?, ? examples/s]

Map:   0%|          | 0/4236 [00:00<?, ? examples/s]

# Step 3: Setup Training Specs

In [20]:
# Classification-head settings taken from the config: number of classes and
# the mappings between class index and class name.
classification_head_kwargs = dict(
    num_labels=config.num_labels,
    id2label=config.id2label,
    label2id=config.label2id,
)

# Load the pretrained checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    config.checkpoint,
    **classification_head_kwargs,
)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    # Basic training parameters (values come from config.params)
    output_dir=config.output_dir,
    learning_rate=config.params['lr'],
    per_device_train_batch_size=config.params['train_bsz'],
    per_device_eval_batch_size=config.params['val_bsz'],
    num_train_epochs=config.params['n_epochs'],
    adam_beta1=config.params['betas'][0],
    adam_beta2=config.params['betas'][1],
    adam_epsilon=config.params['eps'],

    # Logging and evaluation parameters
    #logging_dir=os.path.join(config.output_data_path, 'logs'),  # Directory for storing logs
    logging_strategy="steps",  # Log based on steps
    logging_steps=250,  # Log every 250 steps
    # `eval_strategy` is the current name of this argument;
    # `evaluation_strategy` is its deprecated spelling.
    eval_strategy="steps",

    # Saving parameters
    save_strategy="steps",  # Checkpoints are step-based, like evaluation
    save_total_limit=2,  # Limit the number of checkpoints kept, to save space

    # Hub integration parameters
    report_to="tensorboard",  # Enable TensorBoard reporting
    push_to_hub=True,
    hub_strategy="every_save",  # Push whenever we save
    hub_model_id=config.hf_repository_id,  # Where to push the model
    #hub_token=os.getenv('HF_TOKEN'),  # Authentication token

    # Best-model tracking
    metric_for_best_model="eval_loss",  # Track best model based on eval loss
    load_best_model_at_end=True,  # Load best model when training ends
    greater_is_better=False,  # For loss, lower is better
)

# Callback that keeps the logged training/evaluation metrics in memory.
metrics_callback = MetricsCallback()

# Train on the 'train' split and validate on the 'test' split produced by
# `dataset.train_test_split`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Registered after construction, so it is appended to the callbacks the
# Trainer already holds.
trainer.add_callback(metrics_callback)



# Step 4: Run fine-tuning, push to hub & analyze results

In [25]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
250,0.7593,0.593836,0.740194,0.529304
500,0.6378,0.62115,0.730751,0.512079
750,0.6001,0.572646,0.749879,0.548203
1000,0.5945,0.566066,0.750363,0.539007
1250,0.5806,0.551881,0.751332,0.569303
1500,0.6042,0.5528,0.753511,0.59447
1750,0.5719,0.528983,0.77046,0.604549
2000,0.5699,0.524746,0.76368,0.595825
2250,0.5622,0.531774,0.769976,0.596023
2500,0.5597,0.534379,0.758838,0.615154


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

TrainOutput(global_step=15099, training_loss=0.4004478630178598, metrics={'train_runtime': 3756.7147, 'train_samples_per_second': 128.599, 'train_steps_per_second': 4.019, 'total_flos': 3.66684485997408e+16, 'train_loss': 0.4004478630178598, 'epoch': 3.0})

In [26]:
# Called without arguments, so no dataset is passed explicitly here.
# NOTE(review): presumably this scores the eval_dataset given to the Trainer
# (dataset['test']); the separately held-out `eval_dataset` is never
# evaluated — confirm that this is intended.
trainer.evaluate()

{'eval_loss': 0.4819750189781189,
 'eval_accuracy': 0.7864406779661017,
 'eval_f1': 0.6767332567267599,
 'eval_runtime': 12.0267,
 'eval_samples_per_second': 343.403,
 'eval_steps_per_second': 10.809,
 'epoch': 3.0}

In [27]:
# Generate the model card, then upload the trained model to the Hub.
# NOTE(review): push_to_hub may already create a model card on its own —
# confirm whether the explicit create_model_card() call is needed.
trainer.create_model_card()
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/scbtm/ModernBERT_wine_quality_reviews_ft/commit/7b13af0b5c0c39fa89fbbe5e74dc60fdea7c827c', commit_message='End of training', commit_description='', oid='7b13af0b5c0c39fa89fbbe5e74dc60fdea7c827c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/scbtm/ModernBERT_wine_quality_reviews_ft', endpoint='https://huggingface.co', repo_type='model', repo_id='scbtm/ModernBERT_wine_quality_reviews_ft'), pr_revision=None, pr_num=None)

In [28]:
# Turn the metrics recorded by the callback into data frames. Training
# columns get a "train_" prefix; evaluation columns keep their names.
recorded_logs = metrics_callback.training_history
train_history_df = pd.DataFrame(recorded_logs["train"]).add_prefix("train_")
eval_history_df = pd.DataFrame(recorded_logs["eval"])

# Side-by-side join on the row index, i.e. the n-th training entry is paired
# with the n-th evaluation entry.
train_res_df = pd.concat([train_history_df, eval_history_df], axis=1)

# Single-row frame holding the full set of training arguments.
args_df = pd.DataFrame([training_args.to_dict()])

In [29]:
# Display the combined training/evaluation history.
train_res_df

Unnamed: 0,train_loss,train_grad_norm,train_learning_rate,train_epoch,eval_loss,eval_accuracy,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.7593,2.333282,7.867541e-05,0.049672,0.593836,0.740194,0.529304,,,,
1,0.6378,3.014782,7.735082e-05,0.099344,0.621150,0.730751,0.512079,,,,
2,0.6001,3.780868,7.602623e-05,0.149016,0.572646,0.749879,0.548203,,,,
3,0.5945,3.095845,7.470164e-05,0.198689,0.566066,0.750363,0.539007,,,,
4,0.5806,2.377105,7.337704e-05,0.248361,0.551881,0.751332,0.569303,,,,
...,...,...,...,...,...,...,...,...,...,...,...
56,0.1895,4.942509,4.498311e-06,2.831313,0.760686,0.772881,0.660281,,,,
57,0.1855,6.389841,3.173720e-06,2.880985,0.762513,0.773608,0.670530,,,,
58,0.1878,13.489380,1.849129e-06,2.930658,0.759816,0.770944,0.670744,,,,
59,0.1899,6.110926,5.245380e-07,2.980330,0.760426,0.772881,0.671402,,,,
