<a href="https://colab.research.google.com/github/rohithreddy878/tcdzMachineLearning/blob/main/bert_linelengthpredictor_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [None]:
!pip install datasets

In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# 0. Enable GPU

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
print(f"Using device: {device}")

True
Using device: cuda


In [8]:
# Check GPU memory usage
if torch.cuda.is_available():
    print("\nGPU Memory Usage:")
    print(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"Cached: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")


GPU Memory Usage:
Allocated: 0.00 MB
Cached: 0.00 MB


# 1. Preprocessing

In [9]:
# Load dataset (example data - replace with your actual data)
import pandas as pd
from google.colab import files


uploaded = files.upload()
data = pd.read_excel("AllDeliveries2023.xlsx", engine="openpyxl")

data['length'] = data['length'].fillna('Unknown')
data['line'] = data['line'].fillna('Unknown')


Saving AllDeliveries2023.xlsx to AllDeliveries2023.xlsx


# 2. Prepare the dataset for training.

In [10]:
from sklearn.preprocessing import LabelEncoder

line_encoder = LabelEncoder()
length_encoder = LabelEncoder()

data['line'] = line_encoder.fit_transform(data['line'])
data['length'] = length_encoder.fit_transform(data['length'])

In [12]:
# Split dataset into train, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['cb_comm'], data[['line', 'length']], test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

In [13]:

# Convert data into Hugging Face Dataset format
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'line': train_labels['line'], 'length': train_labels['length']}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': val_texts, 'line': val_labels['line'], 'length': val_labels['length']}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': test_texts, 'line': test_labels['line'], 'length': test_labels['length']}))


In [14]:
# Tokenize data using MobileBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = DatasetDict({
    "train": train_dataset.map(tokenize_function, batched=True),
    "val": val_dataset.map(tokenize_function, batched=True),
    "test": test_dataset.map(tokenize_function, batched=True),
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/12504 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2679 [00:00<?, ? examples/s]

Map:   0%|          | 0/2680 [00:00<?, ? examples/s]

In [15]:

tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("line", "labels_line")
tokenized_datasets = tokenized_datasets.rename_column("length", "labels_length")
tokenized_datasets.set_format("torch")


# 3. Define the multi-task model.

In [16]:
# Create a custom multi-task model
from torch import nn

class MultiTaskModel(nn.Module):
    def __init__(self, base_model_name, num_line_labels, num_length_labels):
        super(MultiTaskModel, self).__init__()
        self.base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=1)  # Dummy num_labels
        self.dropout = nn.Dropout(0.1)
        self.classifier_line = nn.Linear(self.base_model.config.hidden_size, num_line_labels)
        self.classifier_length = nn.Linear(self.base_model.config.hidden_size, num_length_labels)

    def forward(self, input_ids, attention_mask, labels_line=None, labels_length=None):
        outputs = self.base_model.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Use [CLS] token representation
        pooled_output = self.dropout(pooled_output)

        logits_line = self.classifier_line(pooled_output)
        logits_length = self.classifier_length(pooled_output)

        loss = None
        if labels_line is not None and labels_length is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss_line = loss_fn(logits_line, labels_line)
            loss_length = loss_fn(logits_length, labels_length)
            loss = loss_line + loss_length

        return {"loss": loss, "logits_line": logits_line, "logits_length": logits_length}


# Initiate the model and training args

In [17]:
# Initialize the model
num_line_labels = len(line_encoder.classes_)
num_length_labels = len(length_encoder.classes_)

model = MultiTaskModel(base_model_name="google/mobilebert-uncased",
                       num_line_labels=num_line_labels,
                       num_length_labels=num_length_labels).to(device)


pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

In [22]:
# Define training loop using Hugging Face Trainer API
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Custom collator to handle multi-task labels
class MultiTaskCollator:
    def __call__(self, batch):
        input_ids = torch.stack([example["input_ids"] for example in batch])
        attention_mask = torch.stack([example["attention_mask"] for example in batch])
        labels_line = torch.tensor([example["labels_line"] for example in batch])
        labels_length = torch.tensor([example["labels_length"] for example in batch])

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels_line": labels_line, "labels_length": labels_length}

# Use Hugging Face's DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


In [23]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable WandB or other integrations
)

# 4: Training the Model

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.1035,2.121487
2,2.1585,1.98422
3,1.9009,1.953209


TrainOutput(global_step=2346, training_loss=170019.9915445788, metrics={'train_runtime': 469.3467, 'train_samples_per_second': 79.924, 'train_steps_per_second': 4.998, 'total_flos': 0.0, 'train_loss': 170019.9915445788, 'epoch': 3.0})

In [25]:

# Evaluate the model
results = trainer.evaluate(tokenized_datasets["test"])
print("Test Results:", results)

Test Results: {'eval_loss': 1.9694018363952637, 'eval_runtime': 8.0505, 'eval_samples_per_second': 332.897, 'eval_steps_per_second': 20.868, 'epoch': 3.0}
