In [None]:
!pip install transformers datasets scikit-learn torch -q


## Libraries

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

## Data Processing

In [None]:
# Tab-separated file with no header row: first column is the class name,
# second column is the message text. Classes are encoded as ham -> 0, spam -> 1.
dataset_url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = (
    pd.read_csv(dataset_url, sep='\t', header=None, names=['label', 'text'])
    .assign(label=lambda frame: frame['label'].map({'ham':0, 'spam':1}))
)
df

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


## Splitting Data into Train / Validation / Test


In [None]:
# Two-stage split: hold out 30% of the rows, then cut that hold-out in half,
# i.e. 70% train / 15% validation / 15% test. Both stages use the same seed.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

# Only the validation split is wrapped as a Dataset here; it is what the
# baseline evaluation below consumes.
val_dataset = Dataset.from_dict({
    'text': val_texts.tolist(),
    'label': val_labels.tolist(),
})


## Tokenization

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Encode a batch of messages as fixed-width DistilBERT inputs.

    Args:
        batch: mapping with a 'text' entry holding the messages of one
            ``Dataset.map`` batch.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        128 tokens per message.

    Padding uses 'max_length' rather than True: True pads only to the longest
    message of the current ``map`` batch, so rows from different batches can
    end up with different widths and cannot be stacked into one tensor.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

val_dataset = val_dataset.map(tokenize, batched=True)
# Expose only the model inputs and the label, as torch tensors.
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

## Loading Base Model

In [None]:
# Baseline: pretrained DistilBERT encoder with a two-class classification head.
# The head weights are not in the checkpoint, so they start out randomly initialised.
base_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation Metrics

In [None]:
def compute_metrics(pred):
    """Turn a Trainer prediction object into classification metrics.

    Args:
        pred: object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is used as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'; the last three
        are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }


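## Baseline evaluation before fine-tuning
Evaluate the pretrained model, whose classification head is still untrained, on the validation split. These metrics are the "before" reference for the before/after fine-tuning comparison.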

In [None]:
# Evaluation-only Trainer for the model that has not been fine-tuned yet.
# Explicit arguments are passed so that no experiment tracker is started:
# without them this cell stopped at an interactive Weights & Biases prompt,
# which blocks a "Run All".
baseline_args = TrainingArguments(
    output_dir='./baseline_results',
    per_device_eval_batch_size=16,
    report_to='none',
)

trainer_baseline = Trainer(
    model=base_model,
    args=baseline_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Metrics on the validation split, kept for the comparison table further down.
baseline_results = trainer_baseline.evaluate()
print("Baseline Validation Results (Before Fine-tuning):")
print(baseline_results)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Baseline Validation Results (Before Fine-tuning):
{'eval_loss': 0.6743876338005066, 'eval_model_preparation_time': 0.0023, 'eval_accuracy': 0.8133971291866029, 'eval_precision': 0.21052631578947367, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.1702127659574468, 'eval_runtime': 5.0968, 'eval_samples_per_second': 164.025, 'eval_steps_per_second': 20.601}


## Fine-Tuning

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Reload the raw file and repeat the same seeded 70/15/15 split as above, so
# this section can be run without the earlier data cells.
dataset_url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = (
    pd.read_csv(dataset_url, sep='\t', header=None, names=['label', 'text'])
    .assign(label=lambda frame: frame['label'].map({'ham':0, 'spam':1}))
)

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'], df['label'],
    test_size=0.3, random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, random_state=42,
)

# One untokenized Dataset per split, each with a 'text' and a 'label' column.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict({'text': split_texts.tolist(), 'label': split_labels.tolist()})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)


In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Encode a batch of messages as fixed-width DistilBERT inputs.

    Args:
        batch: mapping with a 'text' entry holding the messages of one
            ``Dataset.map`` batch.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        128 tokens per message.

    Padding uses 'max_length' rather than True: True pads only to the longest
    message of the current ``map`` batch. The training split spans several
    ``map`` batches, so rows could end up with different widths and a shuffled
    training batch mixing them could not be stacked into one tensor.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as torch tensors.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/3900 [00:00<?, ? examples/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

In [None]:
# Separate model instance for fine-tuning, so base_model stays untouched.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Freeze every parameter under `model.distilbert`; parameters outside that
# submodule (the classification head) stay trainable.
for encoder_param in model.distilbert.parameters():
    encoder_param.requires_grad = False


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the first fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    do_train=True,
    do_eval=True,
    # Only used if a reporting backend such as TensorBoard is enabled via report_to.
    logging_dir='./logs',
    learning_rate=5e-5,
    weight_decay=0.01,
    # Best-checkpoint selection needs something to select from: evaluate and
    # checkpoint once per epoch. With both strategies set to "no" the options
    # below were silently ignored and the last-epoch weights were always used.
    # (Older transformers releases call eval_strategy `evaluation_strategy`.)
    eval_strategy="epoch",
    save_strategy="epoch",
    # Bounds disk usage; the best checkpoint is retained alongside the latest.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # No experiment tracker: avoids the interactive Weights & Biases prompt.
    report_to="none",
)


In [None]:
# Trainer for the head-only fine-tuning run: fits on the training split and
# uses the validation split as its default evaluation set.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning configured in training_args (5 epochs); the parameters
# under model.distilbert were frozen above, so they are not updated.
trainer.train()


Step,Training Loss
500,0.1453
1000,0.0573


TrainOutput(global_step=1220, training_loss=0.09227654347654249, metrics={'train_runtime': 75.303, 'train_samples_per_second': 258.954, 'train_steps_per_second': 16.201, 'total_flos': 645778568448000.0, 'train_loss': 0.09227654347654249, 'epoch': 5.0})

In [None]:
# Score the fine-tuned model on the held-out test split.
results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Set Results (After Fine-tuning):", results, sep="\n")

Test Set Results (After Fine-tuning):
{'eval_loss': 0.04177936166524887, 'eval_accuracy': 0.9880382775119617, 'eval_precision': 0.9553571428571429, 'eval_recall': 0.9553571428571429, 'eval_f1': 0.9553571428571429, 'eval_runtime': 3.2394, 'eval_samples_per_second': 258.069, 'eval_steps_per_second': 16.361, 'epoch': 5.0}


In [None]:
import pandas as pd

# `baseline_results` was measured on the validation split, while `results`
# (fine-tuned model) was measured on the test split. Re-score the untouched
# baseline model on the test split so both columns describe the same examples.
baseline_test_results = trainer_baseline.evaluate(test_dataset)

# (row label, key in the dict returned by Trainer.evaluate)
metric_rows = [
    ('Accuracy', 'eval_accuracy'),
    ('Precision', 'eval_precision'),
    ('Recall', 'eval_recall'),
    ('F1-score', 'eval_f1'),
    ('Loss', 'eval_loss'),
]

results_df = pd.DataFrame({
    'Metric': [row_label for row_label, _metric_key in metric_rows],
    'Baseline': [
        baseline_test_results.get(metric_key, 0) for _row_label, metric_key in metric_rows
    ],
    'Fine-tuned': [
        results.get(metric_key, 0) for _row_label, metric_key in metric_rows
    ],
})

results_df


Unnamed: 0,Metric,Baseline,Fine-tuned
0,Accuracy,0.813397,0.988038
1,Precision,0.210526,0.955357
2,Recall,0.142857,0.955357
3,F1-score,0.170213,0.955357
4,Loss,0.674388,0.041779


## Optimizer Improvements

In [None]:
# Persist the fine-tuned model and its tokenizer side by side; the experiment
# cells below reload their starting weights from this directory.
model_path = "./spam-detector-model"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)

print(f"Model Saved IN {model_path}")

Model Saved IN ./spam-detector-model


In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np

def run_optimizer_experiment(opt_name, lr, extra_kwargs=None):
    """Continue training the saved model with a chosen optimizer and score it.

    The model is reloaded from "./spam-detector-model", its `distilbert`
    submodule is frozen, it is trained for 3 epochs on `train_dataset` with the
    requested optimizer and then evaluated on `val_dataset`.

    Args:
        opt_name: "AdamW" or "RMSProp".
        lr: learning rate given to the optimizer and to TrainingArguments.
        extra_kwargs: optional dict of additional keyword arguments for the
            optimizer constructor. Defaults to none.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()``.

    Raises:
        ValueError: if `opt_name` is not one of the supported names.
    """
    from torch.optim import AdamW, RMSprop

    print(f"\n=== Running {opt_name} | lr={lr} ===")

    # Validate before the model is loaded so a typo fails immediately.
    optimizer_classes = {"AdamW": AdamW, "RMSProp": RMSprop}
    if opt_name not in optimizer_classes:
        raise ValueError("Optimizer not supported")

    # Copy instead of sharing a mutable default / the caller's dict.
    optimizer_kwargs = dict(extra_kwargs) if extra_kwargs else {}

    model_exp = DistilBertForSequenceClassification.from_pretrained(
        "./spam-detector-model"
    )

    for param in model_exp.distilbert.parameters():
        param.requires_grad = False

    training_args_exp = TrainingArguments(
        output_dir="./opt_results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=lr,
        logging_steps=20,
        save_strategy="no",
        # No experiment tracker: avoids the interactive Weights & Biases prompt.
        report_to="none",
    )

    trainer_exp = Trainer(
        model=model_exp,
        args=training_args_exp,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Hand the optimizer only the parameters that can actually receive gradients.
    trainable_params = [p for p in model_exp.parameters() if p.requires_grad]

    # Pre-assigning the optimizer makes train() use it instead of building the
    # Trainer's default one.
    trainer_exp.optimizer = optimizer_classes[opt_name](
        trainable_params, lr=lr, **optimizer_kwargs
    )

    trainer_exp.train()
    results = trainer_exp.evaluate()
    print(results)
    return results


In [None]:
# Validation metrics of every optimizer run, keyed by "<optimizer>_lr<learning rate>".
optimizer_results = {}

optimizer_results['AdamW_lr5e-5']   = run_optimizer_experiment("AdamW", 5e-5)




=== Running AdamW | lr=5e-05 ===


Step,Training Loss
20,0.0382
40,0.0872
60,0.0541
80,0.0334
100,0.0319
120,0.0574
140,0.054
160,0.0381
180,0.042
200,0.0305


{'eval_loss': 0.03882070630788803, 'eval_accuracy': 0.9880382775119617, 'eval_precision': 0.9811320754716981, 'eval_recall': 0.9285714285714286, 'eval_f1': 0.9541284403669725, 'eval_runtime': 2.8658, 'eval_samples_per_second': 291.716, 'eval_steps_per_second': 18.494, 'epoch': 3.0}


In [None]:
# Same optimizer with the learning rate doubled to 1e-4.
optimizer_results['AdamW_lr1e-4']   = run_optimizer_experiment("AdamW", 1e-4)



=== Running AdamW | lr=0.0001 ===


Step,Training Loss
20,0.0385
40,0.086
60,0.0533
80,0.0306
100,0.0313
120,0.0557
140,0.0534
160,0.0386
180,0.0395
200,0.0278


{'eval_loss': 0.03578798100352287, 'eval_accuracy': 0.9904306220095693, 'eval_precision': 1.0, 'eval_recall': 0.9285714285714286, 'eval_f1': 0.9629629629629629, 'eval_runtime': 2.8441, 'eval_samples_per_second': 293.947, 'eval_steps_per_second': 18.635, 'epoch': 3.0}


In [None]:
# RMSProp at the same learning rate as the first AdamW run.
optimizer_results['RMSProp_lr5e-5'] = run_optimizer_experiment("RMSProp", 5e-5)



=== Running RMSProp | lr=5e-05 ===


Step,Training Loss
20,0.0432
40,0.0864
60,0.0532
80,0.0315
100,0.0308
120,0.0569
140,0.0522
160,0.0387
180,0.041
200,0.029


{'eval_loss': 0.03771838918328285, 'eval_accuracy': 0.9880382775119617, 'eval_precision': 0.9811320754716981, 'eval_recall': 0.9285714285714286, 'eval_f1': 0.9541284403669725, 'eval_runtime': 2.8785, 'eval_samples_per_second': 290.433, 'eval_steps_per_second': 18.413, 'epoch': 3.0}


In [None]:
# RMSProp with the learning rate doubled to 1e-4.
optimizer_results['RMSProp_lr1e-4'] = run_optimizer_experiment("RMSProp", 1e-4)


=== Running RMSProp | lr=0.0001 ===


Step,Training Loss
20,0.0534
40,0.0887
60,0.0539
80,0.0314
100,0.0301
120,0.0576
140,0.0512
160,0.0406
180,0.039
200,0.0266


{'eval_loss': 0.03435274586081505, 'eval_accuracy': 0.9904306220095693, 'eval_precision': 1.0, 'eval_recall': 0.9285714285714286, 'eval_f1': 0.9629629629629629, 'eval_runtime': 2.7762, 'eval_samples_per_second': 301.13, 'eval_steps_per_second': 19.091, 'epoch': 3.0}


In [None]:
import pandas as pd

# Table column -> key in the metrics dict of each run; a missing key shows as 0.
optimizer_metric_columns = {
    "Accuracy": "eval_accuracy",
    "Precision": "eval_precision",
    "Recall": "eval_recall",
    "F1 Score": "eval_f1",
    "Loss": "eval_loss",
}

# One row per optimizer run, in the order the runs were recorded.
comparison_table = pd.DataFrame([
    {
        "Optimizer": run_name,
        **{
            column: run_metrics.get(metric_key, 0)
            for column, metric_key in optimizer_metric_columns.items()
        },
    }
    for run_name, run_metrics in optimizer_results.items()
])

print(comparison_table)


        Optimizer  Accuracy  Precision    Recall  F1 Score      Loss
0    AdamW_lr5e-5  0.988038   0.981132  0.928571  0.954128  0.038821
1    AdamW_lr1e-4  0.990431   1.000000  0.928571  0.962963  0.035788
2  RMSProp_lr5e-5  0.988038   0.981132  0.928571  0.954128  0.037718
3  RMSProp_lr1e-4  0.990431   1.000000  0.928571  0.962963  0.034353


## Learning Rate improvements

In [None]:
def run_lr_experiment(lr_value):
    """Continue training the saved model with AdamW at a given learning rate.

    The model is reloaded from "./spam-detector-model", its `distilbert`
    submodule is frozen, it is trained for 3 epochs on `train_dataset` and then
    evaluated on `val_dataset`.

    Args:
        lr_value: learning rate given to AdamW and to TrainingArguments.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()``.
    """
    from torch.optim import AdamW

    print(f"\n=== Running AdamW with Learning Rate = {lr_value} ===")

    model_lr = DistilBertForSequenceClassification.from_pretrained(
        "./spam-detector-model"
    )

    for param in model_lr.distilbert.parameters():
        param.requires_grad = False

    training_args_lr = TrainingArguments(
        output_dir="./lr_results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=lr_value,
        logging_steps=20,
        save_strategy="no",
        # No experiment tracker: avoids the interactive Weights & Biases prompt.
        report_to="none",
    )

    trainer_lr = Trainer(
        model=model_lr,
        args=training_args_lr,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Hand the optimizer only the parameters that can actually receive gradients.
    trainable_params = [p for p in model_lr.parameters() if p.requires_grad]

    # Pre-assigning the optimizer makes train() use it instead of building the
    # Trainer's default one.
    trainer_lr.optimizer = AdamW(trainable_params, lr=lr_value)

    trainer_lr.train()
    results = trainer_lr.evaluate()
    print(results)
    return results



# Validation metrics of every learning-rate run, keyed by a run label.
lr_results = {}



In [None]:
# Learning rate below the 5e-5 used for the first fine-tuning run.
lr_results["AdamW_lr3e-5"] = run_lr_experiment(3e-5)


=== Running AdamW with Learning Rate = 3e-05 ===


Step,Training Loss
20,0.0384
40,0.0869
60,0.056
80,0.0338
100,0.0331
120,0.0579
140,0.0542
160,0.0384
180,0.0432
200,0.0319


{'eval_loss': 0.04109153151512146, 'eval_accuracy': 0.9856459330143541, 'eval_precision': 0.9629629629629629, 'eval_recall': 0.9285714285714286, 'eval_f1': 0.9454545454545454, 'eval_runtime': 2.757, 'eval_samples_per_second': 303.231, 'eval_steps_per_second': 19.224, 'epoch': 3.0}


In [None]:
# Smallest learning rate tried in this comparison.
lr_results["AdamW_lr1e-5"] = run_lr_experiment(1e-5)




=== Running AdamW with Learning Rate = 1e-05 ===


Step,Training Loss
20,0.0392
40,0.0849
60,0.0573
80,0.0332
100,0.0345
120,0.0586
140,0.055
160,0.0391
180,0.0448
200,0.0335


{'eval_loss': 0.044714029878377914, 'eval_accuracy': 0.9856459330143541, 'eval_precision': 0.9629629629629629, 'eval_recall': 0.9285714285714286, 'eval_f1': 0.9454545454545454, 'eval_runtime': 2.8326, 'eval_samples_per_second': 295.132, 'eval_steps_per_second': 18.711, 'epoch': 3.0}


In [None]:
# Table column -> key in the metrics dict of each run; a missing key shows as 0.
lr_metric_columns = {
    "Accuracy": "eval_accuracy",
    "Precision": "eval_precision",
    "Recall": "eval_recall",
    "F1 Score": "eval_f1",
    "Loss": "eval_loss",
}

# One row per learning-rate run, in the order the runs were recorded.
lr_table = pd.DataFrame([
    {
        "Learning Rate": run_name,
        **{
            column: run_metrics.get(metric_key, 0)
            for column, metric_key in lr_metric_columns.items()
        },
    }
    for run_name, run_metrics in lr_results.items()
])

print("\n=== Learning Rate Comparison Table ===\n", lr_table, sep="\n")



=== Learning Rate Comparison Table ===

  Learning Rate  Accuracy  Precision    Recall  F1 Score      Loss
0  AdamW_lr1e-5  0.985646   0.962963  0.928571  0.945455  0.044714
1  AdamW_lr3e-5  0.985646   0.962963  0.928571  0.945455  0.041092


## Data Augmentation improvement

In [None]:
!pip install nlpaug -q
!pip install nltk -q

import nlpaug.augmenter.word as naw
import nltk
# WordNet is the synonym source selected with aug_src='wordnet' in the
# augmentation cell that follows.
nltk.download('wordnet')
# NOTE(review): presumably needed by nlpaug for part-of-speech tagging — confirm.
nltk.download('averaged_perceptron_tagger_eng')

from datasets import Dataset


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
# Word-level synonym replacement backed by WordNet.
syn_aug = naw.SynonymAug(aug_src='wordnet')

# Create one augmented copy for each of the first 300 training messages; the
# copy keeps the label of the message it was derived from.
aug_texts = []
aug_labels = []

for source_text, source_label in zip(train_texts[:300], train_labels[:300]):
    variant = syn_aug.augment(source_text)
    # augment() may hand back a list of strings; collapse it to one string.
    aug_texts.append(' '.join(variant) if isinstance(variant, list) else variant)
    aug_labels.append(source_label)

# Original training data followed by the augmented copies.
final_train_texts  = [*train_texts, *aug_texts]
final_train_labels = [*train_labels, *aug_labels]

train_dataset_aug = Dataset.from_dict(
    {'text': final_train_texts, 'label': final_train_labels}
)

# Same tokenization and tensor format as the other splits.
train_dataset_aug = train_dataset_aug.map(tokenize, batched=True)
train_dataset_aug.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print("✅ Synonym Augmentation Dataset created successfully!")
print(f"Original train size: {len(train_texts)}, Augmented train size: {len(final_train_texts)}")


Map:   0%|          | 0/4200 [00:00<?, ? examples/s]

✅ Synonym Augmentation Dataset created successfully!
Original train size: 3900, Augmented train size: 4200


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Start from the saved fine-tuned model and keep training only the parameters
# outside the frozen `distilbert` submodule, now on the augmented training set.
model_aug1 = DistilBertForSequenceClassification.from_pretrained(
    "./spam-detector-model",
    num_labels=2
)

for param in model_aug1.distilbert.parameters():
    param.requires_grad = False

training_args_aug = TrainingArguments(
    output_dir='./aug_results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    # Only used if a reporting backend such as TensorBoard is enabled via report_to.
    logging_dir='./logs',
    do_train=True,
    do_eval=True,
    # Best-checkpoint selection needs per-epoch evaluation and checkpoints;
    # with both strategies set to "no" the two options below had no effect.
    # (Older transformers releases call eval_strategy `evaluation_strategy`.)
    eval_strategy="epoch",
    save_strategy="epoch",
    # Bounds disk usage; the best checkpoint is retained alongside the latest.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # No experiment tracker: avoids the interactive Weights & Biases prompt.
    report_to="none",
)

trainer_aug1 = Trainer(
    model=model_aug1,
    args=training_args_aug,
    train_dataset=train_dataset_aug,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer_aug1.train()

# Report on the held-out test split, which plays no part in checkpoint selection.
aug1_results = trainer_aug1.evaluate(test_dataset)

print("✅ Synonym Augmentation Results on Test Set:")
print(aug1_results)


Step,Training Loss
500,0.0467


✅ Synonym Augmentation Results on Test Set:
{'eval_loss': 0.03353264555335045, 'eval_accuracy': 0.992822966507177, 'eval_precision': 0.9907407407407407, 'eval_recall': 0.9553571428571429, 'eval_f1': 0.9727272727272728, 'eval_runtime': 3.3712, 'eval_samples_per_second': 247.981, 'eval_steps_per_second': 15.721, 'epoch': 3.0}


## Dropout

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments


# Start from the saved fine-tuned model with the `distilbert` submodule frozen.
model_dropout = DistilBertForSequenceClassification.from_pretrained(
    "./spam-detector-model",
    num_labels=2
)

for param in model_dropout.distilbert.parameters():
    param.requires_grad = False


# Replace the model's `dropout` module with a stronger one (p=0.3) ...
model_dropout.dropout = torch.nn.Dropout(p=0.3)
# ... and record the same value in the config, so a model saved from this run
# is rebuilt with p=0.3 instead of the checkpoint's previous setting.
model_dropout.config.seq_classif_dropout = 0.3


training_args_dropout = TrainingArguments(
    output_dir='./dropout_results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    # Only used if a reporting backend such as TensorBoard is enabled via report_to.
    logging_dir='./logs',
    do_train=True,
    do_eval=True,
    # Best-checkpoint selection needs per-epoch evaluation and checkpoints;
    # with both strategies set to "no" the two options below had no effect.
    # (Older transformers releases call eval_strategy `evaluation_strategy`.)
    eval_strategy="epoch",
    save_strategy="epoch",
    # Bounds disk usage; the best checkpoint is retained alongside the latest.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # No experiment tracker: avoids the interactive Weights & Biases prompt.
    report_to="none",
)


trainer_dropout = Trainer(
    model=model_dropout,
    args=training_args_dropout,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


trainer_dropout.train()


# Report on the held-out test split, which plays no part in checkpoint selection.
dropout_results = trainer_dropout.evaluate(test_dataset)
print("Dropout Modification Results (p=0.3) on Test Set:")
print(dropout_results)


Step,Training Loss
500,0.0451
1000,0.0426


✅ Dropout Modification Results (p=0.3) on Test Set:
{'eval_loss': 0.03150646761059761, 'eval_accuracy': 0.9940191387559809, 'eval_precision': 0.9908256880733946, 'eval_recall': 0.9642857142857143, 'eval_f1': 0.9773755656108597, 'eval_runtime': 2.7461, 'eval_samples_per_second': 304.433, 'eval_steps_per_second': 19.3, 'epoch': 5.0}


In [None]:
# Testing on New Data.

## Transfer Learning

In [None]:
!pip install transformers datasets scikit-learn torch -q

import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import requests, zipfile, io





In [None]:
# NOTE(review): the row count and the last rows printed below match the dataset
# loaded at the top of the notebook, so this appears to be the same corpus
# rather than unseen data — confirm before treating the scores as transfer results.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Bound the wait and fail with a clear HTTP error, instead of handing an error
# page to zipfile and getting an unrelated BadZipFile exception.
r = requests.get(dataset_url, timeout=60)
r.raise_for_status()

z = zipfile.ZipFile(io.BytesIO(r.content))
# The archive member is a tab-separated file without a header row; the context
# manager closes the member handle once pandas has read it.
with z.open('SMSSpamCollection') as sms_file:
    df_new = pd.read_csv(sms_file, sep='\t', header=None, names=['label','text'])
df_new['label'] = df_new['label'].map({'ham':0, 'spam':1})
print(f"New dataset size: {len(df_new)}")
df_new.tail()

New dataset size: 5572


Unnamed: 0,label,text
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...
5571,0,Rofl. Its true to its name


In [None]:
# Seeded 50/50 split of the re-downloaded data. Despite its name, the "val"
# half is the one passed as train_dataset further down; the "test" half is
# used for the final evaluation.
val_texts_new, test_texts_new, val_labels_new, test_labels_new = train_test_split(
    df_new['text'],
    df_new['label'],
    test_size=0.5,
    random_state=42,
)

val_dataset_new, test_dataset_new = (
    Dataset.from_dict({'text': half_texts.tolist(), 'label': half_labels.tolist()})
    for half_texts, half_labels in (
        (val_texts_new, val_labels_new),
        (test_texts_new, test_labels_new),
    )
)


In [None]:
# Use the tokenizer that was saved together with the fine-tuned model.
tokenizer = DistilBertTokenizerFast.from_pretrained("./spam-detector-model")

def tokenize(batch):
    """Encode the 'text' entries of a batch, truncated and padded to 128 tokens.

    Note: this rebinds the notebook-level `tokenize` defined in earlier cells.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(batch['text'], **encode_options)

# Columns exposed as torch tensors: the model inputs plus the label.
tensor_columns = ['input_ids', 'attention_mask', 'label']

val_dataset_new = val_dataset_new.map(tokenize, batched=True)
val_dataset_new.set_format('torch', columns=tensor_columns)

test_dataset_new = test_dataset_new.map(tokenize, batched=True)
test_dataset_new.set_format('torch', columns=tensor_columns)


Map:   0%|          | 0/2786 [00:00<?, ? examples/s]

Map:   0%|          | 0/2786 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(pred):
    """Turn a Trainer prediction object into classification metrics.

    Same contract as the `compute_metrics` defined earlier in the notebook,
    repeated here so this section does not depend on that cell.

    Args:
        pred: object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is used as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'; the last three
        are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }


In [None]:
from transformers import DataCollatorWithPadding

# Continue from the saved fine-tuned model, training only the parameters
# outside the frozen `distilbert` submodule.
model_new = DistilBertForSequenceClassification.from_pretrained("./spam-detector-model")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

for param in model_new.distilbert.parameters():
    param.requires_grad = False

training_args_new = TrainingArguments(
    output_dir='./fine_tune_new',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    # Only used if a reporting backend such as TensorBoard is enabled via report_to.
    logging_dir='./logs_new',
    do_train=True,
    do_eval=True,
    # Best-checkpoint selection is intentionally not requested here: the only
    # evaluation set wired into this Trainer is the training data itself, and
    # with save_strategy="no" the former load_best_model_at_end /
    # metric_for_best_model options were never acted on. The weights after the
    # last epoch are used, exactly as before.
    save_strategy="no",
    # No experiment tracker: avoids the interactive Weights & Biases prompt.
    report_to="none",
)

trainer_new = Trainer(
    model=model_new,
    args=training_args_new,
    train_dataset=val_dataset_new,
    # Same data as train_dataset; pass test_dataset_new explicitly to
    # evaluate() for a held-out score.
    eval_dataset=val_dataset_new,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer_new.train()


Step,Training Loss
500,0.0516


TrainOutput(global_step=525, training_loss=0.05107744773228963, metrics={'train_runtime': 33.7666, 'train_samples_per_second': 247.523, 'train_steps_per_second': 15.548, 'total_flos': 276790629491712.0, 'train_loss': 0.05107744773228963, 'epoch': 3.0})

In [None]:
# Score the further-trained model on the half of the data it was not trained on.
results_new = trainer_new.evaluate(eval_dataset=test_dataset_new)
print("Results on New Test Set:", results_new, sep="\n")

Results on New Test Set:
{'eval_loss': 0.03652513399720192, 'eval_accuracy': 0.9877961234745154, 'eval_precision': 0.9640883977900553, 'eval_recall': 0.9432432432432433, 'eval_f1': 0.953551912568306, 'eval_runtime': 9.5235, 'eval_samples_per_second': 292.54, 'eval_steps_per_second': 18.376, 'epoch': 3.0}


In [None]:
import pandas as pd

# Re-run the evaluation so this cell can be executed on its own.
results_new = trainer_new.evaluate(test_dataset_new)

# (row label, key in the dict returned by Trainer.evaluate); a missing key shows as 0.
summary_rows = (
    ('Accuracy', 'eval_accuracy'),
    ('Precision', 'eval_precision'),
    ('Recall', 'eval_recall'),
    ('F1-score', 'eval_f1'),
    ('Loss', 'eval_loss'),
)

results_table = pd.DataFrame({
    'Metric': [row_label for row_label, _metric_key in summary_rows],
    'Value': [results_new.get(metric_key, 0) for _row_label, metric_key in summary_rows],
})

print("Results on New Test Set:")
print(results_table)


Results on New Test Set:
      Metric     Value
0   Accuracy  0.987796
1  Precision  0.964088
2     Recall  0.943243
3   F1-score  0.953552
4       Loss  0.036525
