# Install Required Libraries

In [None]:
! pip install transformers datasets scikit-learn torch


# Import Libraries and Load Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

def load_dataset(file_path='spam.csv'):
    """Read the spam CSV and return a frame with ``label`` and ``message``.

    The file is decoded as latin-1. Only the ``v1`` and ``v2`` columns are
    kept; they are exposed as ``label`` and ``message`` respectively. Labels
    are encoded as ``ham -> 0`` and ``spam -> 1``; a label with no entry in
    that mapping comes out as NaN.
    """
    label_codes = {'ham': 0, 'spam': 1}
    raw = pd.read_csv(file_path, encoding='latin-1')
    frame = raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
    return frame.assign(label=frame['label'].map(label_codes))

# Load the labelled messages from spam.csv into `df`.
df = load_dataset('spam.csv')
# Last expression of the cell: previews the first rows in the notebook output.
df.head()


# Tokenize the Data

In [None]:
from transformers import BertTokenizer

def tokenize_data(texts, tokenizer, max_length=256):
    """Run ``tokenizer`` over ``texts`` and return whatever it produces.

    The tokenizer is called with padding and truncation switched on, the
    given ``max_length``, and ``return_tensors="pt"``.
    """
    call_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': "pt",
    }
    return tokenizer(texts, **call_options)

def preprocess_data(df, tokenizer, test_size=0.2, random_state=None, stratify=False):
    """Split ``df`` into train/validation parts and tokenize both.

    Args:
        df: Frame with a ``message`` column (texts) and a ``label`` column.
        tokenizer: Tokenizer forwarded to ``tokenize_data``.
        test_size: Share of rows held out for validation (default 0.2, the
            value that used to be hard-coded).
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original unseeded (non-reproducible) split;
            pass an int to get the same split on every run.
        stratify: When true, split so both parts keep the label
            proportions of ``df['label']``. Off by default to preserve the
            original behaviour.

    Returns:
        ``(train_encodings, val_encodings, train_labels, val_labels)``.
        The label objects are the pandas Series produced by the split, so
        they keep their original (shuffled) index; access them by position.
    """
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['message'],
        df['label'],
        test_size=test_size,
        random_state=random_state,
        stratify=df['label'] if stratify else None,
    )
    train_encodings = tokenize_data(train_texts.tolist(), tokenizer)
    val_encodings = tokenize_data(val_texts.tolist(), tokenizer)

    return train_encodings, val_encodings, train_labels, val_labels

# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split `df` into train/validation parts and tokenize each part.
train_encodings, val_encodings, train_labels, val_labels = preprocess_data(df, tokenizer)

# Last expression of the cell: shows the first training example's token ids
# together with its label (positional access, since the split shuffles the index).
train_encodings['input_ids'][0], train_labels.iloc[0]


# Create Dataset Class for PyTorch

In [None]:
import torch

class SpamDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            container indexable by example position. The values may be
            tensors or plain nested lists.
        labels: Labels aligned positionally with the encodings. A pandas
            Series/array-like exposing ``.iloc`` is read by position (its
            index labels are ignored); any other sequence is indexed directly.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor() on an existing tensor emits a copy-construct
        # UserWarning; detach().clone() is the documented equivalent.
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field plus ``labels`` for ``idx``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        labels = self.labels
        # Positional lookup: a Series coming out of a shuffled split no longer
        # has a 0..n-1 index, so plain [] would look up by index label instead.
        label = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
        item['labels'] = self._as_tensor(label)
        return item

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

# Wrap each split's encodings and labels in a SpamDataset.
train_dataset = SpamDataset(train_encodings, train_labels)
val_dataset = SpamDataset(val_encodings, val_labels)

# Last expression of the cell: displays the first training item.
train_dataset[0]


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

def load_model():
    """Return a BERT sequence classifier built from 'bert-base-uncased'.

    The classification head is configured for two labels.
    """
    checkpoint = 'bert-base-uncased'
    classifier = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return classifier

def train_model(train_dataset, eval_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``train_dataset`` and return the Trainer used.

    Args:
        train_dataset: Dataset used for training.
        eval_dataset: Dataset registered with the Trainer for evaluation.
        model: Model to fine-tune; it is updated in place.
        tokenizer: Tokenizer handed to the Trainer.

    Returns:
        The ``Trainer`` after ``train()`` has finished, so callers can run
        ``trainer.evaluate()`` (or save the model) without building a second
        Trainer. Earlier versions returned ``None``; callers that ignored the
        return value are unaffected.
    """
    import inspect

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Newer transformers releases accept the tokenizer as `processing_class`
    # and deprecate the `tokenizer` keyword; pick whichever this install has.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()
    return trainer

# Instantiate the classifier.
model = load_model()

# Fine-tune `model` on the training split; the validation split is registered for evaluation.
train_model(train_dataset, val_dataset, model, tokenizer)


# Evaluate the Model

In [None]:
def evaluate_model(trainer):
    """Run ``trainer.evaluate()`` and print whatever it returns."""
    print(trainer.evaluate())

# Build a Trainer around the already fine-tuned `model` so evaluate() can be
# run against the validation split.
# NOTE(review): num_train_epochs=3 here differs from the 2 used in
# train_model. This cell only calls evaluate(), so the value presumably has
# no effect on the reported metrics — confirm and align the two configs.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    ),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Print the evaluation results for the validation split.
evaluate_model(trainer)


# Prediction Function

In [None]:
def predict(model, tokenizer, text):
    """Classify a single message as ``'spam'`` or ``'ham'``.

    Args:
        model: Classifier whose output exposes ``.logits`` with one row per
            input and one column per class (class 1 is treated as spam).
        tokenizer: Callable turning ``text`` into a mapping of tensors.
        text: The message to classify. Only a single message is supported,
            because the prediction is reduced with ``.item()``.

    Returns:
        ``'spam'`` when the highest logit is class 1, otherwise ``'ham'``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # After training the model may live on an accelerator while the tokenizer
    # output is on CPU; move the inputs to wherever the weights are.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Inference must run with dropout disabled and without building an
    # autograd graph; the caller's train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    prediction = torch.argmax(logits, dim=1).item()
    return 'spam' if prediction == 1 else 'ham'

# Smoke test: classify one sample message with the fine-tuned model and print the label.
test_message = "Free money! Claim your prize now."
print(f"Prediction: {predict(model, tokenizer, test_message)}")  