In [1]:
# Step 0: Import necessary libraries

import torch  # PyTorch - core DL framework

from datasets import load_dataset  # For loading datasets from Hugging Face

from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,  # Tokenizer & model loader
    Trainer, TrainingArguments, DataCollatorForSeq2Seq  # For training setup
)

import evaluate  # For evaluation metrics

import psutil  # To check system memory

import time  # To measure training time


In [2]:
# Step 1: Display system memory info
print(f"Total RAM: {psutil.virtual_memory().total / (1024 ** 3):.2f} GB")
print(f"Available RAM: {psutil.virtual_memory().available / (1024 ** 3):.2f} GB")

Total RAM: 12.67 GB
Available RAM: 10.72 GB


In [3]:
# Step 2: Load the Bitext customer support dataset
print("\nLoading the Bitext customer support dataset...")
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")

full_data = dataset['train']
print(f"Loaded {len(full_data)} samples.")



Loading the Bitext customer support dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded 26872 samples.


In [4]:
# Step 3: Split the dataset into training and evaluation (80:20)

# We'll use 80% of the data for training and keep 20% aside for validation
split_dataset = full_data.train_test_split(test_size=0.2, seed=42)

# Extract the training and validation sets
train_data = split_dataset["train"]
eval_data = split_dataset["test"]

# Quick check on how many samples we got in each set
print(f"Split into {len(train_data)} training and {len(eval_data)} validation samples.")


Split into 21497 training and 5375 validation samples.


In [5]:
# Step 4: Load FLAN-T5 model and tokenizer
print("\nLoading FLAN-T5 model and tokenizer...")

# Specify the model checkpoint to use
model_name = "google/flan-t5-base"

# Load the tokenizer for processing text inputs
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pre-trained FLAN-T5 model for sequence-to-sequence tasks
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



Loading FLAN-T5 model and tokenizer...


In [6]:
# Step 5: Tokenize inputs and targets to get model-ready data
def preprocess(example):
    # Combine instruction with a clear prompt so the model understands the task
    input_text = "Answer the customer query: " + example["instruction"]
    target_text = example["response"]

    # Tokenize the input text, making sure it's the same length for batching
    inputs = tokenizer(input_text, max_length=128, padding="max_length", truncation=True)
    # Tokenize the target/response text similarly
    targets = tokenizer(target_text, max_length=128, padding="max_length", truncation=True)

    # Assign the target tokens as labels that the model should predict
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply the preprocessing function to both training and validation datasets
# Removing original columns since we only need tokenized data for training
print("🔄 Tokenizing the dataset...")
tokenized_train = train_data.map(preprocess, remove_columns=train_data.column_names)
tokenized_eval = eval_data.map(preprocess, remove_columns=eval_data.column_names)


🔄 Tokenizing the dataset...


Map:   0%|          | 0/21497 [00:00<?, ? examples/s]

Map:   0%|          | 0/5375 [00:00<?, ? examples/s]

In [7]:
# Step 6: Set up training parameters for the Trainer
training_args = TrainingArguments(
    output_dir="voxa_model_cpu",               # Where to save model checkpoints and final model
    eval_steps=500,                            # Check performance every 500 steps (not just epochs)
    save_strategy="epoch",                     # Save model at the end of each epoch
    learning_rate=2e-4,                        # How fast the model learns (tweak if needed)
    per_device_train_batch_size=2,             # Keep batch size small to avoid running out of memory
    num_train_epochs=2,                        # Go through the whole dataset once
    weight_decay=0.01,                         # Small regularization to prevent overfitting
    logging_dir="logs",                        # Folder to store training logs
    logging_steps=20,                          # Log training info every 20 steps
    save_total_limit=2,                        # Keep only 2 recent saved models to save space
    report_to="none",                          # Turn off external logging services like WandB
    fp16=False,                                # Use full precision since CPU doesn't support half precision
    dataloader_num_workers=0,                  # Use single thread for loading data, safer on limited RAM
    gradient_accumulation_steps=4              # No gradient accumulation here, batch size stays 1
)


In [8]:
# step 7: Data collator automatically pads and batches sequences for us, so no manual padding needed
print("\nSetting up Trainer...")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer class wraps everything — it handles the training loop, evaluation, saving checkpoints, and more
trainer = Trainer(
    model=model,                      # Our FLAN-T5 model
    args=training_args,               # All the training configs we set earlier
    train_dataset=tokenized_train,   # Tokenized training data
    eval_dataset=tokenized_eval,     # Tokenized validation data
    tokenizer=tokenizer,              # So trainer knows how to decode tokens during eval
    data_collator=data_collator      # Helps prepare batches dynamically
)



Setting up Trainer...


  trainer = Trainer(


In [9]:
# Step 8: Check memory before training
print(f"Memory usage before training: {psutil.virtual_memory().available / (1024 ** 3):.2f} GB")


Memory usage before training: 10.41 GB


In [10]:
# Step 9: Start training and measure time
print("\nStarting training...")
start_time = time.time()  # Start timer

trainer.train()  # Begin model training

end_time = time.time()  # End timer
duration = end_time - start_time
minutes, seconds = divmod(duration, 60)
print(f"\nTraining completed in {int(minutes)} minutes and {int(seconds)} seconds.")



Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
20,5.3563
40,2.117
60,1.5467
80,1.3885
100,1.2205
120,1.2179
140,1.0909
160,1.1301
180,1.0399
200,1.0535



Training completed in 63 minutes and 8 seconds.


In [11]:
# Step 10: Check memory after training
print(f"Memory usage after training: {psutil.virtual_memory().available / (1024 ** 3):.2f} GB")


Memory usage after training: 8.92 GB


In [12]:
#  STEP 11: Evaluate model performance and save everything

print("\n🔍 Evaluating the model on GPU...")
eval_results = trainer.evaluate()

# Save the model and tokenizer so we can load them later
model.save_pretrained("voxa_model_cpu")
tokenizer.save_pretrained("voxa_model_cpu")
print("\n✅ Model and tokenizer saved to 'voxa_model_cpu' directory.")




🔍 Evaluating the model on GPU...



✅ Model and tokenizer saved to 'voxa_model_cpu' directory.


In [14]:
# step 12 : save model and download from google drive
import shutil
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Path where the model will be saved inside your Google Drive
drive_save_path = "/content/drive/MyDrive/voxa_model_cpu.zip"

# Compress the 'voxa_model_cpu' folder into a zip archive
print("\n📦 Creating ZIP archive of the model directory...")
shutil.make_archive("/content/voxa_model_cpu", 'zip', 'voxa_model_cpu')
print("✅ Archive created at: /content/voxa_model_cpu.zip")

# Copy the ZIP file to Google Drive
print("\n📁 Saving ZIP to Google Drive...")
shutil.copy("/content/voxa_model_cpu.zip", drive_save_path)
print(f"✅ ZIP file saved at: {drive_save_path}")


Mounted at /content/drive

📦 Creating ZIP archive of the model directory...
✅ Archive created at: /content/voxa_model_cpu.zip

📁 Saving ZIP to Google Drive...
✅ ZIP file saved at: /content/drive/MyDrive/voxa_model_cpu.zip
