# 0- You must manually change the env into text_classification environment
You need to run in terminal:

```sh
conda env create -f env.yml
```
Then, you need to activate text_classification environment



# 1- Fine-tuning a BERT model for single-sentence binary classification

In [1]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from torch import cuda
from datasets import load_dataset, concatenate_datasets
import pandas as pd 
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import os
from datetime import datetime
# Early stopping callback
from transformers.integrations import TensorBoardCallback
from transformers import EarlyStoppingCallback

# Set seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Device configuration
device = 'cuda' if cuda.is_available() else 'cpu'
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


### Create output directions

In [2]:
# Create output directories
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f'./results_{timestamp}'
log_dir = f'./logs_{timestamp}'
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

### Loading pre-trained model and tokenizer


In [3]:
model_path = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
# id2label and label2id are passed to the model to use during inference.
model = DistilBertForSequenceClassification.from_pretrained(model_path,
                                                            id2label={
                                                                0: "NEG", 1: "POS"},
                                                            label2id={"NEG": 0, "POS": 1}).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Loading popular IMDB dataset

In [4]:
print("Loading IMDB dataset...")
imdb_train = load_dataset('imdb', split='train', download_mode="force_redownload")
# Fixed indentation and spacing in the following lines
imdb_test_start = load_dataset('imdb', split='test[:6250]') 
imdb_test_end = load_dataset('imdb', split='test[-6250:]') 
# Concatenate the two datasets using concatenate_datasets
imdb_test = concatenate_datasets([imdb_test_start, imdb_test_end])

imdb_val_start = load_dataset('imdb', split='test[6250:12500]')
imdb_val_end = load_dataset('imdb', split='test[-12500:-6250]')
imdb_val = concatenate_datasets([imdb_val_start, imdb_val_end])

# Check the shape of the dataset
print(f"imdb_train shape: {imdb_train.shape}")
print(f"imdb_test shape: {imdb_test.shape}")
print(f"imdb_val shape: {imdb_val.shape}")

Loading IMDB dataset...


Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 336773.95 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 351937.28 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 383772.56 examples/s]


imdb_train shape: (25000, 2)
imdb_test shape: (12500, 2)
imdb_val shape: (12500, 2)


### Pass these dataset to the tokenizer model to make them ready for training

In [5]:

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        truncation=True, 
        padding=True,
        max_length=512,
        return_tensors="pt"
    )


print("Tokenizing datasets...")
enc_train = imdb_train.map(
    tokenize_function, 
    batched=True, 
    batch_size=64,
    desc="Tokenizing training data"
)
enc_test = imdb_test.map(
    tokenize_function, 
    batched=True, 
    batch_size=64,
    desc="Tokenizing test data"
)
enc_val = imdb_val.map(
    tokenize_function, 
    batched=True, 
    batch_size=64,
    desc="Tokenizing validation data"
)

# Set format for PyTorch
enc_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
enc_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
enc_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Display sample of tokenized data
print("Sample of tokenized training data:")
pd.DataFrame({
    'text': enc_train['text'][:3],
    'label': enc_train['label'][:3],
    'input_ids_shape': [ids.shape for ids in enc_train['input_ids'][:3]],
    'attention_mask_shape': [mask.shape for mask in enc_train['attention_mask'][:3]]
})

Tokenizing datasets...


Tokenizing training data: 100%|██████████| 25000/25000 [00:05<00:00, 4491.42 examples/s]
Tokenizing test data: 100%|██████████| 12500/12500 [00:02<00:00, 4462.95 examples/s]
Tokenizing validation data: 100%|██████████| 12500/12500 [00:02<00:00, 4413.03 examples/s]


Sample of tokenized training data:


Unnamed: 0,text,label,input_ids_shape,attention_mask_shape
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,"(512,)","(512,)"
1,"""I Am Curious: Yellow"" is a risible and preten...",0,"(512,)","(512,)"
2,If only to avoid making this type of film in t...,0,"(512,)","(512,)"


### Fine-tune the model

In [6]:
# Function to calculate the metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# TrainingArguments setup
# TrainingArguments setup
training_args = TrainingArguments(
    output_dir=output_dir,  # output directory for model predictions and checkpoints
    num_train_epochs=10,  # total number of training epochs
    per_device_train_batch_size=16,  # reduced batch size to prevent CUDA OOM errors
    per_device_eval_batch_size=16,  # batch size for evaluation
    warmup_ratio=0.1,  # ratio of warmup steps - more flexible than fixed steps
    weight_decay=0.01,  # strength of weight decay
    logging_dir=log_dir,  # directory to save logs
    do_eval=True,  # whether to evaluate during training
    do_train=True,  # whether to train the model
    save_strategy='epoch',  # save the model after each epoch
    evaluation_strategy='epoch',  # evaluate the model after each epoch
    logging_strategy='steps',  # log steps instead of epochs for more frequent updates
    report_to='tensorboard',  # report logs to TensorBoard
    logging_steps=100,  # how often to log the training loss
    fp16=True if cuda.is_available() else False,  # whether to use mixed precision training
    load_best_model_at_end=True,  # load the best model when finished training
    metric_for_best_model='f1',  # use F1 score to determine best model
    greater_is_better=True,  # higher F1 is better
    seed=seed,
    dataloader_drop_last=True,  # drop last incomplete batch
    gradient_accumulation_steps=2,  # accumulate gradients for effective larger batch size
    save_total_limit=3,  # limit the total amount of checkpoints saved
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=enc_train,
    eval_dataset=enc_val,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
        TensorBoardCallback()
    ]
)
# Train the model
print("Starting training...")
results = trainer.train()
print("Training completed!")
print(results)

You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
EarlyStoppingCallback


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2338,0.243271,0.902529,0.897007,0.950538,0.849183
2,0.1629,0.239724,0.903809,0.897685,0.958379,0.84422
3,0.1143,0.274593,0.926536,0.925848,0.934301,0.917547
4,0.046,0.3677,0.926777,0.925604,0.94036,0.911303
5,0.0242,0.407298,0.921015,0.91929,0.939495,0.899936
6,0.023,0.424802,0.928217,0.929675,0.910893,0.949248
7,0.023,0.43131,0.929337,0.929298,0.929521,0.929075
8,0.0089,0.483881,0.927737,0.92703,0.935879,0.918348
9,0.0004,0.514327,0.926296,0.925128,0.939719,0.910983


Training completed!
TrainOutput(global_step=7029, training_loss=0.08254671909698366, metrics={'train_runtime': 2622.9307, 'train_samples_per_second': 95.313, 'train_steps_per_second': 2.978, 'total_flos': 2.979562704489677e+16, 'train_loss': 0.08254671909698366, 'epoch': 9.0})


### Evaluate on test set

In [7]:
# Evaluate on test set
print("Evaluating on test set...")
test_results = trainer.evaluate(enc_test)
print("Test results:", test_results)

Evaluating on test set...


Test results: {'eval_loss': 0.40319886803627014, 'eval_accuracy': 0.9287772087067862, 'eval_f1': 0.9303381340012523, 'eval_precision': 0.9101071975497703, 'eval_recall': 0.9514889529298751, 'eval_runtime': 34.2639, 'eval_samples_per_second': 364.816, 'eval_steps_per_second': 22.823, 'epoch': 9.0}


### Save the final model


In [8]:
# Save the final model
trainer.save_model(f"{output_dir}/final_model")
print(f"Final model saved to {output_dir}/final_model")

# Example of using the model for inference a new sentence
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    prediction = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][prediction].item()
    
    sentiment = "Positive" if prediction == 1 else "Negative"
    return sentiment, confidence

Final model saved to ./results_20250318_021030/final_model


### Run the model for inference

In [10]:
test_texts = [
        "This movie was absolutely fantastic! I loved every minute of it.",
        "What a waste of time. Terrible acting and boring plot."
    ]
for text in test_texts:
        sentiment, confidence = predict_sentiment(text)
        print(f"Text: {text}")
        print(f"Sentiment: {sentiment} (confidence: {confidence:.4f})")
        print("-" * 50)

Text: This movie was absolutely fantastic! I loved every minute of it.
Sentiment: Positive (confidence: 0.9994)
--------------------------------------------------
Text: What a waste of time. Terrible acting and boring plot.
Sentiment: Negative (confidence: 0.9998)
--------------------------------------------------
