In [1]:
!gdown 1I9aPAvvYgQWdHGKtnd7IeTGXpx8vOm4h
!gdown 1-8TsrqTRFP-q9TM-6HinhO0ZVXFHq9TB
!gdown 1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW

Downloading...
From: https://drive.google.com/uc?id=1I9aPAvvYgQWdHGKtnd7IeTGXpx8vOm4h
To: /content/title_brand.csv
100% 97.3M/97.3M [00:03<00:00, 32.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-8TsrqTRFP-q9TM-6HinhO0ZVXFHq9TB
To: /content/test_data.csv
100% 15.6M/15.6M [00:00<00:00, 17.0MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW
From (redirected): https://drive.google.com/uc?id=1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW&confirm=t&uuid=4bdf8f4b-eafa-4479-8098-5be7fffe547e
To: /content/train_data.csv
100% 635M/635M [00:06<00:00, 99.5MB/s]


In [2]:
import pandas as pd
!pip install datasets transformers torch tqdm scikit-learn
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch
import time
import numpy as np
import multiprocessing



In [3]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_cpus = multiprocessing.cpu_count()
num_gpus = torch.cuda.device_count()
optimal_workers = min(num_cpus, num_gpus * 4) if num_gpus else num_cpus - 1

print(f'device: {device} CPU count: {num_cpus} GPU count: {num_gpus}  Workers count: {optimal_workers}')

# Model names
model_name = "bert-base-multilingual-cased"

# Data loading
df = pd.read_csv('train_data.csv')
train_df = df[0:45000]
val_df = df[45000:60000]
test_df = pd.read_csv('test_data.csv')

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)


device: cpu CPU count: 2 GPU count: 0  Workers count: 1


  df = pd.read_csv('train_data.csv')


In [4]:
# Data Tokenization
tokenizer = AutoTokenizer.from_pretrained(model_name)

def truncate_and_tokenize(examples):
    max_length = 511
    truncated_texts = []
    for text in examples['reviewText']:
        if len(text) > max_length:
            truncated_text = text[-max_length:]  # Truncate from the beginning
        else:
            truncated_text = text
        truncated_texts.append(truncated_text)
    return tokenizer(truncated_texts, padding='max_length', truncation=True, max_length=max_length)

train_dataset = train_dataset.map(truncate_and_tokenize, batched=True)
val_dataset = val_dataset.map(truncate_and_tokenize, batched=True)
test_dataset = test_dataset.map(truncate_and_tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'overall'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'overall'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [5]:
# Model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)

# Train
batch_size = 16
epochs = 5
logging_steps = 100
learning_rate = 5e-5

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set gradient accumulation steps
gradient_accumulation_steps = 4  # Accumulate gradients over 4 batches

for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(range(len(train_dataloader)))
    total_loss = 0
    start_time = time.time()

    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['overall'])
        loss = outputs.loss / gradient_accumulation_steps  # Normalize loss for accumulation
        total_loss += loss.item()

        loss.backward()

        # Update weights only after accumulating gradients
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)

        if step % logging_steps == 0:
            elapsed_time = time.time() - start_time
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {total_loss / (step + 1)}, Time elapsed: {elapsed_time}s")

    # Clear cache to free up memory
    torch.cuda.empty_cache()

    model.save_pretrained(f'model-epoch{epoch + 1}')

  0%|          | 0/2813 [00:00<?, ?it/s]

In [None]:
# Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['overall'])
            loss = outputs.loss
            val_loss += loss.item()

            predictions = outputs.log
            predictions = outputs.logits.argmax(dim=-1)
            correct_predictions += (predictions == batch['overall']).sum().item()
            total_predictions += predictions.size(0)

    val_accuracy = correct_predictions / total_predictions
    print(f"Validation Loss: {val_loss / len(val_dataloader)}, Validation Accuracy: {val_accuracy}")

# Save final model
model.save_pretrained('final_model')

# Testing
model.eval()
test_loss = 0
correct_predictions = 0
total_predictions = 0
prediction_list = []
ground_truth = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['overall'])
        loss = outputs.loss
        test_loss += loss.item()

        predictions = outputs.logits.argmax(dim=-1)
        prediction_list.append(predictions)
        ground_truth.append(batch['overall'])

        correct_predictions += (predictions == batch['overall']).sum().item()
        total_predictions += predictions.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Loss: {test_loss / len(test_dataloader)}, Test Accuracy: {test_accuracy}")

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

# Custom compute metrics function
def compute_metrics(true_labels, pred_labels):
    metrics = {
        'f1': f1_score(true_labels, pred_labels, average='macro'),
        'accuracy': accuracy_score(true_labels, pred_labels),
        'precision': precision_score(true_labels, pred_labels, average='macro'),
        'recall': recall_score(true_labels, pred_labels, average='macro'),
        'confusion_matrix': confusion_matrix(true_labels, pred_labels)
    }
    return metrics

# Concatenate ground truth and predictions for metric computation
metrics = compute_metrics(
    np.concatenate([tensor.flatten().cpu() for tensor in ground_truth]),
    np.concatenate([tensor.flatten().cpu() for tensor in prediction_list])
)

# Print the computed metrics
print(f"F1 Score: {metrics['f1']}")
print(f"Accuracy: {metrics['accuracy']}")
print(f"Precision: {metrics['precision']}")
print(f"Recall: {metrics['recall']}")
print(f"Confusion Matrix:\n{metrics['confusion_matrix']}")