In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [5]:
! pip install transformers



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BartTokenizer, BartForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [7]:
# Step 1: Load and preprocess data
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'message'])
# Convert labels to numerical
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [8]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [10]:
# Step 2: Check dataset structure
print("Dataset features:", dataset['train'].features)
print("Sample example:", dataset['train'][0])


Dataset features: {'label': Value(dtype='int64', id=None), 'message': Value(dtype='string', id=None)}
Sample example: {'label': 1, 'message': 'Reply to win £100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service'}


In [11]:
# Step 3: Tokenization with proper formatting
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

def preprocess_function(examples):
    # Remove return_tensors parameter - this is the key fix
    tokenized = tokenizer(
        examples['message'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

    # Add labels separately
    tokenized['labels'] = examples['label']
    return tokenized

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [12]:
# Apply preprocessing
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['message', 'label']
)

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [18]:
def compute_metrics(pred):
    """
    Calculates and returns a dictionary of metrics (accuracy, f1, precision, recall)
    based on predictions and labels.

    Args:
        pred (EvalPrediction): Prediction object containing logits and labels.

    Returns:
        dict: Dictionary of metrics.
    """
    labels = pred.label_ids
    # Access the first element of the tuple to get the logits
    preds = pred.predictions[0].argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [19]:
# Initialize model
model = BartForSequenceClassification.from_pretrained(
    'facebook/bart-base',
    num_labels=2
)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    remove_unused_columns=True
)




In [21]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

In [22]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0653,0.045199,0.992825,0.973154,0.973154,0.973154
2,0.0136,0.063846,0.991031,0.966443,0.966443,0.966443
3,0.0001,0.06937,0.991928,0.969697,0.972973,0.966443


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


TrainOutput(global_step=1674, training_loss=0.04178468779334743, metrics={'train_runtime': 872.4379, 'train_samples_per_second': 15.326, 'train_steps_per_second': 1.919, 'total_flos': 2050357626455040.0, 'train_loss': 0.04178468779334743, 'epoch': 3.0})

In [23]:
results = trainer.evaluate()
print("Final evaluation results:")
print(results)

Final evaluation results:
{'eval_loss': 0.04519888013601303, 'eval_accuracy': 0.9928251121076234, 'eval_f1': 0.9731543624161074, 'eval_precision': 0.9731543624161074, 'eval_recall': 0.9731543624161074, 'eval_runtime': 21.6079, 'eval_samples_per_second': 51.602, 'eval_steps_per_second': 6.479, 'epoch': 3.0}


In [24]:
model.save_pretrained('./spam_classifier')
tokenizer.save_pretrained('./spam_classifier')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


('./spam_classifier/tokenizer_config.json',
 './spam_classifier/special_tokens_map.json',
 './spam_classifier/vocab.json',
 './spam_classifier/merges.txt',
 './spam_classifier/added_tokens.json')

In [27]:
def get_predictions(dataset):
    """
    Gets predictions for the given dataset.

    Args:
        dataset: The dataset to get predictions for.

    Returns:
        A NumPy array of predictions.
    """
    # Use trainer.predict to get predictions
    outputs = trainer.predict(dataset)

    # Get predictions for the specified class index (e.g., 0 for ham, 1 for spam).
    # This assumes the model outputs a tuple/list of predictions.
    preds = np.argmax(outputs.predictions[0], axis=1)  # Modified to access first element

    return preds

In [29]:
# Add this import at the beginning of your script
from sklearn.metrics import classification_report

# Get true labels - note the fix here, it should be 'labels' not 'label'
y_true = tokenized_datasets['test']['labels']
y_pred = get_predictions(tokenized_datasets['test'])

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=['ham', 'spam']))


Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       966
        spam       0.97      0.97      0.97       149

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [34]:
# Add this import at the beginning of your script
from sklearn.metrics import classification_report
import torch.nn.functional as F  # Import for softmax
import torch # This line is added to import the torch module

def predict_spam(text, model, tokenizer):
    """
    Predicts whether a given text is spam or ham.

    Args:
        text: The text to predict.
        model: The trained model.
        tokenizer: The tokenizer used for the model.

    Returns:
        A tuple containing the prediction (spam or ham) and the confidence score.
    """
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()} # This line moves the inputs to the model's device

    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # Get probabilities using softmax
    probs = F.softmax(outputs.logits, dim=1)

    # Get predicted class
    prediction = "spam" if probs[0][1].item() > probs[0][0].item() else "ham"

    # Get confidence score
    confidence = probs[0][1].item() if prediction == "spam" else probs[0][0].item()

    return prediction, confidence

# ... (Rest of your code remains the same) ...

# Test with sample messages
sample_texts = [
    "WINNER!! You've been selected for a free prize. Click here to claim!",
    "Hey, do you want to grab lunch tomorrow?",
    "URGENT: Your bank account has been compromised. Call this number immediately.",
    "Reminder: We have a meeting at 3pm today"
]

print("\nSample Predictions:")
for text in sample_texts:
    prediction, confidence = predict_spam(text, model, tokenizer)
    print(f"Text: {text}")
    print(f"Prediction: {prediction} (Confidence: {confidence:.2f}%)")
    print("-" * 50)


Sample Predictions:
Text: WINNER!! You've been selected for a free prize. Click here to claim!
Prediction: spam (Confidence: 0.99%)
--------------------------------------------------
Text: Hey, do you want to grab lunch tomorrow?
Prediction: ham (Confidence: 1.00%)
--------------------------------------------------
Text: URGENT: Your bank account has been compromised. Call this number immediately.
Prediction: ham (Confidence: 0.99%)
--------------------------------------------------
Text: Reminder: We have a meeting at 3pm today
Prediction: ham (Confidence: 1.00%)
--------------------------------------------------
