## MODEL DOWNLOAD

In [1]:
from transformers import DistilBertTokenizer, DistilBertModel

model_name = "distilbert-base-uncased"

# Downloads & caches locally (usually in ~/.cache/huggingface/)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
tokenizer.save_pretrained("./distilbert_local")
model.save_pretrained("./distilbert_local")

## DATASET LOADING

In [30]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

In [45]:
movies = pd.read_csv('movies_sub.csv')
u_item = pd.read_csv("u_item.csv", encoding="ISO-8859-1")

In [3]:
u_item.columns

Index(['movie id', 'movie title', 'release date', 'video release date',
       'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [4]:
u_item.rename(columns={'movie id': 'Movie_ID'}, inplace=True)

In [5]:
genre_cols = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy',
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film Noir',
              'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']

data = movies[['Movie_ID', 'Movie_Name', 'Llama Summary Few Shot']].merge(u_item[['Movie_ID'] + genre_cols], on='Movie_ID')
data.head()

Unnamed: 0,Movie_ID,Movie_Name,Llama Summary Few Shot,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"In a world where toys come to life, a showdown...",0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),"When a secret weapon system, GoldenEye, is det...",1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),"At the mysterious Mon Senor hotel, a young bel...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),"In the town of Hollywood, where fame and wealt...",1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),"In a chilling game of cat and mouse, a cunning...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
#data = data.sample(n=100, random_state=42).reset_index(drop=True)

texts = data['Llama Summary Few Shot'].tolist()
labels = data[genre_cols].values

## MODEL IMPLEMENTATION

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = DistilBertTokenizer.from_pretrained("./distilbert_local")
model = DistilBertForSequenceClassification.from_pretrained("./distilbert_local", num_labels=len(genre_cols)).to(device)

inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Move inputs to GPU if available
inputs = {key: value.to(device) for key, value in inputs.items()}
labels = torch.tensor(labels).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert_local and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
inputs

{'input_ids': tensor([[ 101, 1999, 1037,  ...,    0,    0,    0],
         [ 101, 2043, 1037,  ...,    0,    0,    0],
         [ 101, 2012, 1996,  ...,    0,    0,    0],
         ...,
         [ 101, 1999, 1996,  ...,    0,    0,    0],
         [ 101, 1999, 1037,  ...,    0,    0,    0],
         [ 101, 1037, 2136,  ...,    0,    0,    0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

### TRAINING

In [77]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

# Prepare data loader
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
train_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=4e-5)
class_counts = labels.sum(dim=0)
pos_weights = (len(labels) - class_counts) / (class_counts + 1e-5)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights) # Handle multi-label classification correctly.

# Initialize scheduler
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)  # Adjust the learning rate every 2 epochs

# Early stopping setup
best_loss = float('inf')
patience_counter = 0
patience = 3  # Stop training if no improvement in loss for 2 consecutive epochs

# Start training
model.train()
epochs = 15
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        input_ids, attention_mask, label = batch
        output = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(output.logits, label.float())
        
        # Backward pass
        loss.backward()

        # Optimizer step
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    # Step the scheduler to adjust learning rate
    scheduler.step()

    # Early stopping check
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0  # Reset counter
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

Epoch 1/15 - Loss: 1.1085
Epoch 2/15 - Loss: 0.8057
Epoch 3/15 - Loss: 0.6569
Epoch 4/15 - Loss: 0.6135
Epoch 5/15 - Loss: 0.5983
Epoch 6/15 - Loss: 0.5899
Epoch 7/15 - Loss: 0.5905
Epoch 8/15 - Loss: 0.5975
Epoch 9/15 - Loss: 0.5884
Epoch 10/15 - Loss: 0.5921
Epoch 11/15 - Loss: 0.5894
Epoch 12/15 - Loss: 0.5893
Early stopping triggered!


In [78]:
best_loss

0.5883693272627674

In [36]:
model.save_pretrained("./distilbert_finetuned_local")
tokenizer.save_pretrained("./distilbert_finetuned_local")

('./distilbert_finetuned_local\\tokenizer_config.json',
 './distilbert_finetuned_local\\special_tokens_map.json',
 './distilbert_finetuned_local\\vocab.txt',
 './distilbert_finetuned_local\\added_tokens.json')

### PREDICTION

In [37]:
from tqdm import tqdm

predictions = []

with torch.no_grad(): # Disables gradient tracking since we’re just predicting.
    for text in tqdm(texts, desc="Predicting genres"):
        encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to("cuda")
        outputs = model(**encoded)
        probs = torch.sigmoid(outputs.logits)
        preds = (probs > 0.5).int().squeeze().tolist()
        predictions.append(preds)

df = pd.DataFrame(predictions, columns=genre_cols)
df = pd.concat([data[["Movie_ID", "Movie_Name"]], df], axis=1)

df.head()

Predicting genres: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1633/1633 [00:12<00:00, 127.21it/s]


Unnamed: 0,Movie_ID,Movie_Name,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,0,1,0,0,1,0,1,1,0,1,0,1,1,1
1,2,GoldenEye (1995),0,1,1,0,0,1,0,0,1,1,1,1,0,1,0,0,1,1
2,3,Four Rooms (1995),0,1,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1
3,4,Get Shorty (1995),0,0,1,1,0,1,0,0,1,1,1,1,0,1,0,0,1,1
4,5,Copycat (1995),0,1,1,1,0,1,0,0,1,1,1,1,0,1,0,0,1,1


### CLASSIFICATION REPORT

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Ensure movies and df have the same genre columns
genres_list = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", 
    "Documentary", "Drama", "Fantasy", "Film Noir", "Horror", "Musical", 
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Initialize result dictionary
results = {"Genre": [], "Precision": [], "Recall": [], "F1-score": []}

# Compute metrics for each genre
for genre in genres_list:
    precision = precision_score(data[genre], df[genre])
    recall = recall_score(data[genre], df[genre])
    f1 = f1_score(data[genre], df[genre])

    # Store results rounded to 2 decimal places
    results["Genre"].append(genre)
    results["Precision"].append(round(precision, 2))
    results["Recall"].append(round(recall, 2))
    results["F1-score"].append(round(f1, 2))

# Convert results into a DataFrame
metrics_df = pd.DataFrame(results)

metrics_df.to_csv("DB_Classification_Score.csv", index=False)

# Display the result
print(metrics_df)


          Genre  Precision  Recall  F1-score
0        Action       0.13    0.13      0.13
1     Adventure       0.07    0.82      0.13
2     Animation       0.02    0.93      0.05
3    Children's       0.08    0.93      0.14
4        Comedy       0.22    0.04      0.07
5         Crime       0.07    1.00      0.12
6   Documentary       0.11    0.02      0.03
7         Drama       0.00    0.00      0.00
8       Fantasy       0.01    1.00      0.03
9     Film Noir       0.03    0.67      0.05
10       Horror       0.05    1.00      0.10
11      Musical       0.03    0.96      0.07
12      Mystery       0.00    0.00      0.00
13      Romance       0.16    0.96      0.27
14       Sci-Fi       0.00    0.00      0.00
15     Thriller       0.23    0.05      0.08
16          War       0.04    1.00      0.08
17      Western       0.02    0.74      0.05


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## HYPER-PARAMETER TUNING

In [101]:
!pip install prettytable

Collecting prettytable
  Downloading prettytable-3.16.0-py3-none-any.whl.metadata (33 kB)
Downloading prettytable-3.16.0-py3-none-any.whl (33 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.16.0


In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.16-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets)
  Using cached frozenlist-1.5.0-cp312-cp312-win_amd6

In [11]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
Downloading alembic-1.15.2-py3-none-any.whl (231 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading mako-1.3.10-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.10 alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [13]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.4 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.4 MB 279.8 kB/s eta 0:00:36
   -- ------------------------------------- 0.5/10.4 MB 279.8 kB/s eta 0:00:36
   -- ------------------------------------- 0.5/10.4 MB 279.8 kB/s eta 0:00:36
   --- ------------------------------------ 0.8/10.4 MB 332.2 kB/s eta 0:00:29
   --- ------------------------------------ 0.8/10.4 MB 332.2 kB/s eta 0:00:29
   --- ------------------------------------ 0.8/10.4 MB 332.2 kB/s eta 0:00:29
   --- ------------------

In [26]:
!pip install accelerate>=0.26.0

In [29]:
!pip install transformers[torch]



In [30]:
pip show accelerate

Name: accelerate
Version: 1.6.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: C:\Users\typis\PycharmProjects\Classwork\venv310\Lib\site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = DistilBertTokenizer.from_pretrained("./distilbert_finetuned_local")
model = DistilBertForSequenceClassification.from_pretrained("./distilbert_finetuned_local", num_labels=len(genre_cols), problem_type="multi_label_classification").to(device)

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

train_labels = [list(map(float, lbl)) for lbl in train_labels]
val_labels = [list(map(float, lbl)) for lbl in val_labels]

# Tokenize
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Create HuggingFace datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

# Final dictionary
encoded_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})


In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import torch

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()  # convert logits to probs
    preds = (probs > 0.5).astype(int)                     # apply threshold
    labels = np.array(labels)                            # ensure correct format

    return {
        'precision': precision_score(labels, preds, average='micro', zero_division=0),
        'recall': recall_score(labels, preds, average='micro', zero_division=0),
        'f1': f1_score(labels, preds, average='micro', zero_division=0),
        'accuracy': accuracy_score(labels, preds)
    }

In [45]:
import optuna
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import precision_score
import numpy as np

# Objective function for Optuna
def objective(trial):
    # Suggesting hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)  # picks from range
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])   # picks from list 
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.3)

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir="./optuna_results",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=weight_decay,
        save_strategy="steps",          # Save model every few steps
        save_steps=100,                 # Save every 100 steps
        eval_strategy="steps",         # Evaluate every few steps
        eval_steps=100,                 # Evaluate every 100 steps
        logging_steps=100,              # Log every 100 steps
        load_best_model_at_end=True,    # Load best model at the end based on evaluation
        disable_tqdm=True,              # Disable tqdm to avoid overloading output
        report_to="none",               # Disable logging to platforms like TensorBoard
)

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset['train'],
        eval_dataset=encoded_dataset['validation'],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_precision"]  # Maximize precision

# Run Optuna
optuna.logging.set_verbosity(optuna.logging.INFO)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Best Parameters:", study.best_params)


[I 2025-04-16 11:28:07,877] A new study created in memory with name: no-name-1278f138-78fe-45a7-8f32-d203e3bb91e1


{'loss': 0.1972, 'grad_norm': 0.30486366152763367, 'learning_rate': 8.887883147393474e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 223.3458, 'train_samples_per_second': 17.542, 'train_steps_per_second': 0.551, 'train_loss': 0.18961629441113975, 'epoch': 3.0}


[I 2025-04-16 11:31:57,500] Trial 0 finished with value: 0.7352112676056338 and parameters: {'learning_rate': 4.5550401130391553e-05, 'batch_size': 32, 'weight_decay': 0.013398179910986284}. Best is trial 0 with value: 0.7352112676056338.


{'eval_loss': 0.16484810411930084, 'eval_precision': 0.7352112676056338, 'eval_runtime': 5.31, 'eval_samples_per_second': 61.582, 'eval_steps_per_second': 2.072, 'epoch': 3.0}
{'loss': 0.146, 'grad_norm': 0.41365566849708557, 'learning_rate': 1.8768384686755683e-05, 'epoch': 1.2195121951219512}
{'loss': 0.1222, 'grad_norm': 0.47034579515457153, 'learning_rate': 6.000776056309641e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 118.792, 'train_samples_per_second': 32.982, 'train_steps_per_second': 2.071, 'train_loss': 0.12961342470432685, 'epoch': 3.0}


[I 2025-04-16 11:34:00,607] Trial 1 finished with value: 0.738498789346247 and parameters: {'learning_rate': 3.140831723089727e-05, 'batch_size': 16, 'weight_decay': 0.2206635125996341}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.1506294161081314, 'eval_precision': 0.738498789346247, 'eval_runtime': 3.344, 'eval_samples_per_second': 97.787, 'eval_steps_per_second': 6.28, 'epoch': 3.0}
{'loss': 0.1083, 'grad_norm': 0.6516855359077454, 'learning_rate': 1.4720747080344915e-05, 'epoch': 0.6097560975609756}
{'loss': 0.0988, 'grad_norm': 0.5365845561027527, 'learning_rate': 1.0975009909773691e-05, 'epoch': 1.2195121951219512}
{'loss': 0.0872, 'grad_norm': 0.7652616500854492, 'learning_rate': 7.2292727392024645e-06, 'epoch': 1.8292682926829267}
{'loss': 0.0887, 'grad_norm': 1.0014033317565918, 'learning_rate': 3.4835355686312395e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 122.2247, 'train_samples_per_second': 32.056, 'train_steps_per_second': 4.025, 'train_loss': 0.09274790345168696, 'epoch': 3.0}


[I 2025-04-16 11:36:06,525] Trial 2 finished with value: 0.717439293598234 and parameters: {'learning_rate': 1.8429026879210428e-05, 'batch_size': 8, 'weight_decay': 0.056589226276404836}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.15313979983329773, 'eval_precision': 0.717439293598234, 'eval_runtime': 2.7908, 'eval_samples_per_second': 117.172, 'eval_steps_per_second': 14.691, 'epoch': 3.0}
{'loss': 0.0724, 'grad_norm': 0.3626071810722351, 'learning_rate': 3.5701752804625565e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 215.7235, 'train_samples_per_second': 18.162, 'train_steps_per_second': 0.57, 'train_loss': 0.07134002592505478, 'epoch': 3.0}


[I 2025-04-16 11:39:47,697] Trial 3 finished with value: 0.7139737991266376 and parameters: {'learning_rate': 1.8297148312370602e-05, 'batch_size': 32, 'weight_decay': 0.08475810159645836}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.157895028591156, 'eval_precision': 0.7139737991266376, 'eval_runtime': 4.7792, 'eval_samples_per_second': 68.422, 'eval_steps_per_second': 2.302, 'epoch': 3.0}
{'loss': 0.0614, 'grad_norm': 0.5271261930465698, 'learning_rate': 8.558425904363631e-06, 'epoch': 0.6097560975609756}
{'loss': 0.0583, 'grad_norm': 0.7633628845214844, 'learning_rate': 6.380709389258382e-06, 'epoch': 1.2195121951219512}
{'loss': 0.0534, 'grad_norm': 0.883817195892334, 'learning_rate': 4.202992874153132e-06, 'epoch': 1.8292682926829267}
{'loss': 0.0572, 'grad_norm': 0.26929861307144165, 'learning_rate': 2.025276359047882e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 122.7711, 'train_samples_per_second': 31.913, 'train_steps_per_second': 4.007, 'train_loss': 0.05695636970240895, 'epoch': 3.0}


[I 2025-04-16 11:41:54,200] Trial 4 finished with value: 0.7044967880085653 and parameters: {'learning_rate': 1.0714365254317829e-05, 'batch_size': 8, 'weight_decay': 0.14093372565214513}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.16506220400333405, 'eval_precision': 0.7044967880085653, 'eval_runtime': 2.8196, 'eval_samples_per_second': 115.973, 'eval_steps_per_second': 14.541, 'epoch': 3.0}
{'loss': 0.0507, 'grad_norm': 0.40542781352996826, 'learning_rate': 1.3766349194321545e-05, 'epoch': 1.2195121951219512}
{'loss': 0.0461, 'grad_norm': 0.24830102920532227, 'learning_rate': 4.401485796823896e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 112.0484, 'train_samples_per_second': 34.967, 'train_steps_per_second': 2.195, 'train_loss': 0.04718332513561094, 'epoch': 3.0}


[I 2025-04-16 11:43:49,943] Trial 5 finished with value: 0.7103004291845494 and parameters: {'learning_rate': 2.303756395784422e-05, 'batch_size': 16, 'weight_decay': 0.19755231032106343}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.1733478456735611, 'eval_precision': 0.7103004291845494, 'eval_runtime': 2.941, 'eval_samples_per_second': 111.187, 'eval_steps_per_second': 7.14, 'epoch': 3.0}
{'loss': 0.0374, 'grad_norm': 0.15462477505207062, 'learning_rate': 1.1612003885646233e-05, 'epoch': 1.2195121951219512}
{'loss': 0.0339, 'grad_norm': 0.3457781970500946, 'learning_rate': 3.71268151445832e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 111.8483, 'train_samples_per_second': 35.03, 'train_steps_per_second': 2.199, 'train_loss': 0.03506397764857223, 'epoch': 3.0}


[I 2025-04-16 11:45:45,639] Trial 6 finished with value: 0.6993603411513859 and parameters: {'learning_rate': 1.943233303312227e-05, 'batch_size': 16, 'weight_decay': 0.1910959300719705}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.18077149987220764, 'eval_precision': 0.6993603411513859, 'eval_runtime': 2.9453, 'eval_samples_per_second': 111.025, 'eval_steps_per_second': 7.13, 'epoch': 3.0}
{'loss': 0.0279, 'grad_norm': 0.2649332582950592, 'learning_rate': 1.3189438184000173e-05, 'epoch': 1.2195121951219512}
{'loss': 0.026, 'grad_norm': 0.21148839592933655, 'learning_rate': 4.217031256108899e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 111.9309, 'train_samples_per_second': 35.004, 'train_steps_per_second': 2.198, 'train_loss': 0.0265638285536107, 'epoch': 3.0}


[I 2025-04-16 11:47:41,428] Trial 7 finished with value: 0.7021276595744681 and parameters: {'learning_rate': 2.2072121042612536e-05, 'batch_size': 16, 'weight_decay': 0.2844028259370367}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.19087469577789307, 'eval_precision': 0.7021276595744681, 'eval_runtime': 2.9707, 'eval_samples_per_second': 110.077, 'eval_steps_per_second': 7.069, 'epoch': 3.0}
{'loss': 0.0242, 'grad_norm': 0.1868039220571518, 'learning_rate': 6.9114405226793635e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 188.3249, 'train_samples_per_second': 20.804, 'train_steps_per_second': 0.653, 'train_loss': 0.023334283169692126, 'epoch': 3.0}


[I 2025-04-16 11:50:53,451] Trial 8 finished with value: 0.6918367346938775 and parameters: {'learning_rate': 3.5421132678731735e-05, 'batch_size': 32, 'weight_decay': 0.07951492116176469}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.20247139036655426, 'eval_precision': 0.6918367346938775, 'eval_runtime': 2.8345, 'eval_samples_per_second': 115.366, 'eval_steps_per_second': 3.881, 'epoch': 3.0}
{'loss': 0.0183, 'grad_norm': 0.35307103395462036, 'learning_rate': 1.8353387782633338e-05, 'epoch': 0.6097560975609756}
{'loss': 0.0176, 'grad_norm': 0.14890389144420624, 'learning_rate': 1.3683314555500173e-05, 'epoch': 1.2195121951219512}
{'loss': 0.0182, 'grad_norm': 0.3496118187904358, 'learning_rate': 9.013241328367007e-06, 'epoch': 1.8292682926829267}
{'loss': 0.0167, 'grad_norm': 0.08236336708068848, 'learning_rate': 4.343168101233843e-06, 'epoch': 2.4390243902439024}
{'train_runtime': 122.6753, 'train_samples_per_second': 31.938, 'train_steps_per_second': 4.011, 'train_loss': 0.01706998425770581, 'epoch': 3.0}


[I 2025-04-16 11:52:59,912] Trial 9 finished with value: 0.6821862348178138 and parameters: {'learning_rate': 2.297676027749517e-05, 'batch_size': 8, 'weight_decay': 0.15087534004209116}. Best is trial 1 with value: 0.738498789346247.


{'eval_loss': 0.21954531967639923, 'eval_precision': 0.6821862348178138, 'eval_runtime': 2.7838, 'eval_samples_per_second': 117.465, 'eval_steps_per_second': 14.728, 'epoch': 3.0}
Best Parameters: {'learning_rate': 3.140831723089727e-05, 'batch_size': 16, 'weight_decay': 0.2206635125996341}


In [9]:
import json

best_params = {
    "learning_rate": 3.140831723089727e-05,
    "batch_size": 16,
    "weight_decay": 0.2206635125996341
}

#with open("best_hyperparams.json", "w") as f:
#    json.dump(best_params, f)

In [40]:
#with open("best_hyperparams.json", "r") as f:
#    best_params = json.load(f)

In [23]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Move inputs to GPU if available
inputs = {key: value.to(device) for key, value in inputs.items()}
labels = torch.tensor(labels).to(device)

# Prepare data loader
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
train_dataloader = DataLoader(dataset, batch_size = best_params['batch_size'], shuffle=True)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr = best_params['learning_rate'], weight_decay = best_params['weight_decay'])
class_counts = labels.sum(dim=0)
pos_weights = (len(labels) - class_counts) / (class_counts + 1e-5)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights) # Handle multi-label classification correctly.

# Initialize scheduler
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)  # Adjust the learning rate every 2 epochs

# Early stopping setup
best_loss = float('inf')
patience_counter = 0
patience = 3  # Stop training if no improvement in loss for 2 consecutive epochs

# Start training
model.train()
epochs = 15
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        input_ids, attention_mask, label = batch
        output = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(output.logits, label.float())
        
        # Backward pass
        loss.backward()

        # Optimizer step
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    # Step the scheduler to adjust learning rate
    scheduler.step()

    # Early stopping check
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0  # Reset counter
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

Epoch 1/15 - Loss: 1.0377
Epoch 2/15 - Loss: 0.8076
Epoch 3/15 - Loss: 0.6648
Epoch 4/15 - Loss: 0.6362
Epoch 5/15 - Loss: 0.6210
Epoch 6/15 - Loss: 0.6246
Epoch 7/15 - Loss: 0.6171
Epoch 8/15 - Loss: 0.6154
Epoch 9/15 - Loss: 0.6180
Epoch 10/15 - Loss: 0.6138
Epoch 11/15 - Loss: 0.6148
Epoch 12/15 - Loss: 0.6225
Epoch 13/15 - Loss: 0.6121
Epoch 14/15 - Loss: 0.6147
Epoch 15/15 - Loss: 0.6167


In [24]:
best_loss

0.6121297030194291

In [25]:
from tqdm import tqdm

predictions = []

with torch.no_grad(): # Disables gradient tracking since we’re just predicting.
    for text in tqdm(texts, desc="Predicting genres"):
        encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to("cuda")
        outputs = model(**encoded)
        probs = torch.sigmoid(outputs.logits)
        preds = (probs > 0.5).int().squeeze().tolist()
        predictions.append(preds)

df = pd.DataFrame(predictions, columns=genre_cols)
df = pd.concat([data[["Movie_ID", "Movie_Name"]], df], axis=1)

df.head()

Predicting genres: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1633/1633 [00:38<00:00, 42.12it/s]


Unnamed: 0,Movie_ID,Movie_Name,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,1,0,0,1,0,1,0
1,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,0,0
2,3,Four Rooms (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,Get Shorty (1995),0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,0
4,5,Copycat (1995),1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

genres_list = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", 
    "Documentary", "Drama", "Fantasy", "Film Noir", "Horror", "Musical", 
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Initialize result dictionary
results = {"Genre": [], "Precision": [], "Recall": [], "F1-score": []}

# Compute metrics for each genre
for genre in genres_list:
    precision = precision_score(data[genre], df[genre])
    recall = recall_score(data[genre], df[genre])
    f1 = f1_score(data[genre], df[genre])

    # Store results rounded to 2 decimal places
    results["Genre"].append(genre)
    results["Precision"].append(round(precision, 2))
    results["Recall"].append(round(recall, 2))
    results["F1-score"].append(round(f1, 2))

# Convert results into a DataFrame
metrics_df = pd.DataFrame(results)

metrics_df.to_csv("DB_Classification_Score.csv", index=False)

# Display the result
print(metrics_df)

          Genre  Precision  Recall  F1-score
0        Action       0.40    0.84      0.54
1     Adventure       0.37    0.90      0.52
2     Animation       0.23    1.00      0.38
3    Children's       0.48    0.95      0.64
4        Comedy       0.64    0.76      0.69
5         Crime       0.25    0.85      0.39
6   Documentary       0.18    0.88      0.30
7         Drama       0.73    0.81      0.77
8       Fantasy       0.12    1.00      0.22
9     Film Noir       0.08    1.00      0.16
10       Horror       0.28    1.00      0.44
11      Musical       0.18    0.89      0.30
12      Mystery       0.16    0.90      0.27
13      Romance       0.32    0.71      0.45
14       Sci-Fi       0.31    0.96      0.47
15     Thriller       0.42    0.86      0.56
16          War       0.24    0.86      0.38
17      Western       0.18    0.96      0.30


In [28]:
model.save_pretrained("./distilbert_HPT_local")
tokenizer.save_pretrained("./distilbert_HPT_local")

('./distilbert_HPT_local\\tokenizer_config.json',
 './distilbert_HPT_local\\special_tokens_map.json',
 './distilbert_HPT_local\\vocab.txt',
 './distilbert_HPT_local\\added_tokens.json')