In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Fetch the uncased MobileBERT tokenizer by its Hub identifier
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

# Read the CSV with pandas, then wrap the resulting frame as a Hugging Face Dataset
df = pd.read_csv("final_dataset.csv")
dataset = Dataset.from_pandas(df)

def tokenize_fn(example):
    """Tokenize the ``text`` field using the notebook-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    limit of 128. Whatever the tokenizer call returns is returned unchanged.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **tokenizer_options)

# Apply tokenize_fn over the dataset in batched mode; the mapped result
# replaces the previous `dataset` binding.
dataset = dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/5361 [00:00<?, ? examples/s]

In [3]:
# Hold out 20% of the examples for evaluation. The seed is fixed so the same
# rows land in each split on every run; without it the evaluation metrics are
# not comparable across kernel restarts.
# Note: `dataset` is rebound to the split result here, so this cell expects
# the un-split dataset produced by the tokenization step above.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


In [4]:
from transformers import AutoModelForSequenceClassification

# Load the MobileBERT checkpoint with a two-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "google/mobilebert-uncased",
    num_labels=2,
)


pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import TrainingArguments

# Fine-tuning configuration: three epochs, evaluating and checkpointing at the
# end of each one.
# NOTE(review): `report_to` is not set, so the Trainer's default reporting
# integrations apply; in the recorded run `trainer.train()` paused on an
# interactive wandb API-key prompt. Consider setting it explicitly.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation / checkpoint / logging cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
)


model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

In [6]:
from transformers import Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute binary-classification metrics from an evaluation prediction.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is reduced with
            ``argmax`` over its last axis to obtain predicted class ids. If
            ``logits`` is itself a tuple of outputs, its first element is
            used.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        the last three are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    # Guard against predictions delivered as a tuple of arrays: argmax over
    # the tuple itself would be meaningless, so use the first entry.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 makes the undefined case (e.g. no positive predictions)
    # an explicit 0 instead of relying on the warning-emitting default.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Bundle the model, training configuration, both dataset splits and the
# metric callback into one Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)


In [7]:
# Run fine-tuning; as the cell's last expression, the returned value is
# displayed as the cell output.
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpranjalpravesh121[0m ([33mpranjalpravesh121-iit-delhi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2,0.137225,0.963653,0.962751,0.958175,0.96737
2,0.1153,0.150541,0.972041,0.971645,0.957169,0.986564
3,0.0469,0.154433,0.972973,0.972512,0.960674,0.984645


TrainOutput(global_step=804, training_loss=11186.068376913518, metrics={'train_runtime': 181.8503, 'train_samples_per_second': 70.739, 'train_steps_per_second': 4.421, 'total_flos': 201670603407360.0, 'train_loss': 11186.068376913518, 'epoch': 3.0})

In [8]:
# Run evaluation with the trainer; the returned metrics are displayed as the
# cell output.
trainer.evaluate()


{'eval_loss': 0.15443302690982819,
 'eval_accuracy': 0.972972972972973,
 'eval_f1': 0.9725118483412323,
 'eval_precision': 0.9606741573033708,
 'eval_recall': 0.9846449136276392,
 'eval_runtime': 6.6329,
 'eval_samples_per_second': 161.768,
 'eval_steps_per_second': 10.252,
 'epoch': 3.0}

In [9]:
# Write the model (via the trainer) and the tokenizer into the same directory;
# the tokenizer call's return value is displayed as the cell output.
trainer.save_model("./mobilebert-finetuned-actionable")
tokenizer.save_pretrained("./mobilebert-finetuned-actionable")


('./mobilebert-finetuned-actionable/tokenizer_config.json',
 './mobilebert-finetuned-actionable/special_tokens_map.json',
 './mobilebert-finetuned-actionable/vocab.txt',
 './mobilebert-finetuned-actionable/added_tokens.json',
 './mobilebert-finetuned-actionable/tokenizer.json')

In [10]:
import shutil
from google.colab import files

# Directory that receives the exported model and tokenizer files
save_directory = "mobilebert-finetuned-actionable"

# Write the tokenizer and the model into that directory
tokenizer.save_pretrained(save_directory)
trainer.save_model(save_directory)

# Pack the directory's contents into "<save_directory>.zip"
shutil.make_archive(base_name=save_directory, format="zip", root_dir=save_directory)

# Hand the archive to the Colab file-download helper
files.download(save_directory + ".zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>