<a href="https://colab.research.google.com/github/rohithreddy878/tcdzMachineLearning/blob/main/LineAndLengthPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### STEP 1: Install required libraries

In [None]:
!pip install datasets

### STEP 2: Import libraries

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, Trainer, TrainingArguments, MobileBertModel, default_data_collator
import torch.nn as nn


### STEP 3: Load your dataset

In [None]:
!pip install openpyxl

In [None]:
from google.colab import files
# Ask the user to upload file(s) through Colab's upload helper.
# NOTE(review): presumably this is where the workbook read in the next cell is
# provided; `uploaded` itself is not referenced again in the visible cells.
uploaded = files.upload()

In [None]:
# Replace with your own file path or mount Google Drive to load your excel file.
# The path is relative and hard-coded, so the workbook must be available under
# exactly this name in the current working directory.
df = pd.read_excel("AllDeliveries2023.xlsx", engine="openpyxl")
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Missing length/line annotations become the literal string 'None', so an
# unannotated delivery is treated as a value of its own in each column.
df[['length', 'line']] = df[['length', 'line']].fillna('None')

### STEP 4: Train-test split and label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Split the data into train and validation sets.
# Each split is copied so the label columns can be overwritten below without
# pandas' SettingWithCopyWarning (the split results are selections of `df`).
train_data, eval_data = train_test_split(df, test_size=0.2, random_state=42)
train_data = train_data.copy()
eval_data = eval_data.copy()

length_encoder = LabelEncoder()
line_encoder = LabelEncoder()

# Fit on the full frame (train and eval rows together) so that a class which
# only appears in one split still receives an id.
length_encoder.fit(df['length'])
line_encoder.fit(df['line'])

# Replace the string labels with their integer class ids in both splits.
train_data['length'] = length_encoder.transform(train_data['length'])
train_data['line'] = line_encoder.transform(train_data['line'])

eval_data['length'] = length_encoder.transform(eval_data['length'])
eval_data['line'] = line_encoder.transform(eval_data['line'])


In [None]:

# Convert to Hugging Face datasets.
# preserve_index=False: after the shuffled split the pandas index is no longer a
# plain range, and would otherwise be stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_data, preserve_index=False)

### STEP 5: Tokenization and Encoding

In [None]:
from transformers import MobileBertTokenizer

# Load the MobileBERT tokenizer for the same checkpoint name
# ("google/mobilebert-uncased") that the model is later loaded from.
tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")

# Tokenize function
def tokenize_function(examples, max_length=512):
    """Tokenize the commentary text stored in the ``cb_comm`` column.

    Args:
        examples: Mapping with a ``cb_comm`` entry. With ``map(..., batched=True)``
            this is a list of strings; a single string is passed through as is.
        max_length: Length every sequence is padded / truncated to. Defaults to
            512, the value previously hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text.
    """
    raw_text = examples['cb_comm']
    if isinstance(raw_text, str):
        texts = raw_text
    else:
        # A missing commentary cell arrives as None, which is not valid text
        # input for the tokenizer; treat it as empty text instead of failing.
        texts = ["" if text is None else str(text) for text in raw_text]
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

# Apply tokenization to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Trainer only forwards dataset columns whose names match parameters of
# model.forward, and treats forward parameters containing "label" as the labels.
# Expose the encoded targets under the forward() argument names
# (labels_length / labels_line) so they reach the model and the metrics.
label_column_names = {'length': 'labels_length', 'line': 'labels_line'}


def _rename_label_columns(dataset):
    """Rename the target columns, skipping any already renamed so the cell can be re-run."""
    pending = {old: new for old, new in label_column_names.items() if old in dataset.column_names}
    return dataset.rename_columns(pending) if pending else dataset


train_dataset = _rename_label_columns(train_dataset)
eval_dataset = _rename_label_columns(eval_dataset)

# Ensure the format of the datasets includes the tokenized inputs and labels
model_columns = ['input_ids', 'attention_mask', 'labels_length', 'labels_line']
train_dataset.set_format(type='torch', columns=model_columns)
eval_dataset.set_format(type='torch', columns=model_columns)


### STEP 6: Create Dual-Head Model

In [None]:
class DualHeadMobileBertForMultiLabelClassification(MobileBertForSequenceClassification):
    """MobileBERT encoder with two independent classification heads.

    One head predicts the ``length`` class of a delivery and the other its
    ``line`` class. Both heads read the hidden state of the first token.

    Args:
        config: MobileBERT configuration.
        num_labels_length: Number of ``length`` classes. When omitted it is read
            from ``config.num_labels_length`` if present, otherwise 18.
        num_labels_line: Number of ``line`` classes. When omitted it is read
            from ``config.num_labels_line`` if present, otherwise 10.
    """

    def __init__(self, config, num_labels_length=None, num_labels_line=None):
        super().__init__(config)
        # Explicit arguments win; otherwise reuse sizes stored on the config.
        if num_labels_length is None:
            num_labels_length = getattr(config, "num_labels_length", 18)
        if num_labels_line is None:
            num_labels_line = getattr(config, "num_labels_line", 10)

        # Keep the sizes on the config so they are saved with the model.
        config.num_labels_length = num_labels_length
        config.num_labels_line = num_labels_line
        self.num_labels_length = num_labels_length
        self.num_labels_line = num_labels_line

        # One logit per class: the targets are integer class ids and the metrics
        # take an argmax over the logits, so these are classification heads.
        self.classifier_length = nn.Linear(config.hidden_size, num_labels_length)
        self.classifier_line = nn.Linear(config.hidden_size, num_labels_line)

    def forward(self, input_ids=None, attention_mask=None, labels_length=None, labels_line=None):
        """Run the encoder and both heads.

        Args:
            input_ids: Token ids, passed to the encoder.
            attention_mask: Attention mask, passed to the encoder.
            labels_length: Optional integer class ids for the length head.
            labels_line: Optional integer class ids for the line head.

        Returns:
            ``(loss, logits_length, logits_line)`` when both label tensors are
            given, otherwise ``(logits_length, logits_line)``.
        """
        # The parent class stores its encoder as `self.mobilebert`.
        outputs = self.mobilebert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]  # The last hidden state (sequence of hidden states)

        # Both heads classify from the first token's hidden state
        first_token_state = sequence_output[:, 0, :]
        logits_length = self.classifier_length(first_token_state)  # Length classifier
        logits_line = self.classifier_line(first_token_state)      # Line classifier

        # Calculate the loss
        loss = None
        if labels_length is not None and labels_line is not None:
            # Cross-entropy over class ids for each head
            loss_fct = nn.CrossEntropyLoss()
            loss_length = loss_fct(
                logits_length.view(-1, self.num_labels_length), labels_length.view(-1).long()
            )
            loss_line = loss_fct(
                logits_line.view(-1, self.num_labels_line), labels_line.view(-1).long()
            )
            loss = (loss_length + loss_line) / 2  # Average the two losses

        # Return loss and logits
        return (loss, logits_length, logits_line) if loss is not None else (logits_length, logits_line)


# Short alias for the class; this is the name used when the model is instantiated.
DualHeadMobileBert = DualHeadMobileBertForMultiLabelClassification


### STEP 7: Model and training args

In [None]:
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir="./results",                 # checkpoints and final model
    logging_dir="./logs",                   # log files
    logging_steps=100,                      # emit a log entry every 100 steps
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation / checkpointing: both once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- model selection: reload the checkpoint with the best f1_micro_avg ---
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro_avg",
)


### STEP 8: Define metrics (required: `metric_for_best_model` in STEP 7 uses `f1_micro_avg`)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and micro-F1 for the length and line heads.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is the pair
            ``(logits_length, logits_line)``. ``labels`` is either a tuple/list
            ``(labels_length, labels_line)`` of per-head arrays, or a single
            array of shape ``(n_samples, 2)`` with length in column 0 and line
            in column 1.

    Returns:
        dict with ``accuracy_length``, ``accuracy_line``, ``f1_micro_length``,
        ``f1_micro_line`` and their mean ``f1_micro_avg``.
    """
    logits, labels = eval_pred
    logits_length, logits_line = logits  # From the model output
    preds_length = np.argmax(logits_length, axis=-1)
    preds_line = np.argmax(logits_line, axis=-1)

    if isinstance(labels, (tuple, list)):
        # One array per head. Stacking these with np.array would give shape
        # (2, n_samples), so column indexing would mix up samples and heads.
        labels_length, labels_line = (np.asarray(part).reshape(-1) for part in labels)
    else:
        labels = np.asarray(labels)
        labels_length, labels_line = labels[:, 0], labels[:, 1]

    accuracy_length = accuracy_score(labels_length, preds_length)
    accuracy_line = accuracy_score(labels_line, preds_line)
    f1_length = f1_score(labels_length, preds_length, average="micro")
    f1_line = f1_score(labels_line, preds_line, average="micro")

    return {
        "accuracy_length": accuracy_length,
        "accuracy_line": accuracy_line,
        "f1_micro_length": f1_length,
        "f1_micro_line": f1_line,
        "f1_micro_avg": (f1_length + f1_line) / 2
    }


### STEP 9: Initialize Trainer and Start Training

In [None]:
from transformers import MobileBertConfig

# Take the number of classes per head from the fitted label encoders instead of
# hard-coding them, and record them on the model config.
config = MobileBertConfig.from_pretrained("google/mobilebert-uncased")
config.num_labels_length = len(length_encoder.classes_)  # Number of classes for length
config.num_labels_line = len(line_encoder.classes_)      # Number of classes for line

# Instantiate the dual-head class defined in STEP 6 under its full name.
model = DualHeadMobileBertForMultiLabelClassification.from_pretrained(
    "google/mobilebert-uncased",
    config=config,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
)

# Start training
trainer.train()

### STEP 10: Evaluation

In [None]:
# Run a final evaluation pass on the held-out split
results = trainer.evaluate(eval_dataset)

# Report each metric as "<title>: <value>", one per line
report_rows = [
    ("Accuracy (Length)", "eval_accuracy_length"),
    ("Accuracy (Line)", "eval_accuracy_line"),
    ("F1 (Length)", "eval_f1_micro_length"),
    ("F1 (Line)", "eval_f1_micro_line"),
    ("Average F1", "eval_f1_micro_avg"),
]
for title, metric_key in report_rows:
    print(f"{title}: {results[metric_key]}")