In [None]:
!pip list

In [1]:
pip install transformers datasets sacrebleu


[0m[31mERROR: Could not find a version that satisfies the requirement sacrebleu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sacrebleu[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import os

# List the files in the directory
model_path = "/kaggle/input/pretrained-filess/"
files = os.listdir(model_path)
print(files)


['config.json', 'tokenizer_config.json', 'pytorch_model.bin', 'README (2).md', 'vocab.txt']


In [4]:
from transformers import BertForSequenceClassification, BertTokenizer, pipeline

# Specify the path where the model files are located
model_path = "/kaggle/input/pretrained-filess/"  # Path to the folder containing the model files

# Load the model and tokenizer from the local directory
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Specify the device (use 0 for GPU or -1 for CPU)
device = 0  # Set to 0 for GPU, -1 for CPU

# Create a text classification pipeline with the specified device
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

# Example SMS messages
sms = [
    "Congratulations! You've won a $1,000 Walmart gift card. Go to http://spam.com to claim now.",
    "Hi, are we still on for dinner tomorrow night?"
]

# Label mapping
label_map = {
    "LABEL_0": "spam",
    "LABEL_1": "ham"
}

# Get predictions
predictions = classifier(sms)

# Print predictions with human-readable labels
for i, message in enumerate(sms):
    predicted_label = label_map[predictions[i]["label"]]
    score = predictions[i]["score"]
    print(f"Message: {message}")
    print(f"Prediction: {predicted_label} (confidence: {score:.2f})\n")


Message: Congratulations! You've won a $1,000 Walmart gift card. Go to http://spam.com to claim now.
Prediction: spam (confidence: 0.79)

Message: Hi, are we still on for dinner tomorrow night?
Prediction: spam (confidence: 0.94)



In [4]:
pip install transformers datasets

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Downloading SQLAlchemy-1.4.54-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: banal, sqlalchemy, dataset
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 2.0.30
    Uninstalling SQLAlchemy-2.0.30:
      Successfully uninstalled SQLAlchemy-2.0.

In [5]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# 1. Load Dataset
data_path = "/kaggle/input/cleaned/cleaned_Spam_SMS.csv"
df = pd.read_csv(data_path)

# Preprocess dataset
texts = df["Message"].tolist()
labels = [1 if label == "spam" else 0 for label in df["Class"].tolist()]  # Spam = 1, Ham = 0

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# 2. Tokenize Dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = preprocess_function(train_texts)
val_encodings = preprocess_function(val_texts)

# 3. Create Dataset Class
class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create datasets
train_dataset = SpamDataset(train_encodings, train_labels)
val_dataset = SpamDataset(val_encodings, val_labels)

# 4. Load Pre-trained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# 5. Define Evaluation Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# 6. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",           # Directory to save model checkpoints
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save model after each epoch
    learning_rate=5e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir="./logs",            # Directory to save logs
    logging_steps=10,                # Log every 10 steps
    load_best_model_at_end=True,     # Load best model at the end of training
    metric_for_best_model="f1",      # Metric to determine the best model
    save_total_limit=2,              # Keep only 2 checkpoints
    report_to="none",                # Disable W&B or other integrations
)

# 7. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 8. Train and Save the Model
trainer.train()
trainer.save_model("./spam_classifier_model")
tokenizer.save_pretrained("./spam_classifier_model")

# 9. Evaluate Model
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# 10. Test the Model on New Examples
# 10. Test the Model on New Examples
test_sms = [
    "Congratulations! You've won a $1,000 Walmart gift card. Go to http://spam.com to claim now.",
    "Hi, are we still on for dinner tomorrow night?",
]

# Tokenize the test messages
test_encodings = tokenizer(test_sms, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Move the encodings and model to the same device (GPU or CPU)
device = model.device  # Get the device of the model (GPU or CPU)
test_encodings = {key: val.to(device) for key, val in test_encodings.items()}  # Move inputs to the same device

# Make predictions
output = model(**test_encodings)
predictions = torch.argmax(output.logits, dim=-1).tolist()

# Map predictions to labels
label_map = {0: "ham", 1: "spam"}
for i, sms in enumerate(test_sms):
    print(f"Message: {sms}")
    print(f"Prediction: {label_map[predictions[i]]}")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.048,0.031899,0.994619,0.987421,0.975155,0.98125
2,0.0267,0.01798,0.997309,0.99375,0.987578,0.990654
3,0.0005,0.016966,0.997309,0.99375,0.987578,0.990654


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation Metrics: {'eval_loss': 0.017980312928557396, 'eval_accuracy': 0.9973094170403587, 'eval_precision': 0.99375, 'eval_recall': 0.9875776397515528, 'eval_f1': 0.9906542056074766, 'eval_runtime': 5.8201, 'eval_samples_per_second': 191.578, 'eval_steps_per_second': 6.014, 'epoch': 3.0}
Message: Congratulations! You've won a $1,000 Walmart gift card. Go to http://spam.com to claim now.
Prediction: spam
Message: Hi, are we still on for dinner tomorrow night?
Prediction: ham


In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("./spam_classifier_model")
tokenizer = BertTokenizer.from_pretrained("./spam_classifier_model")

# Function to make a prediction on a custom input
def predict_spam(input_texts):
    # Tokenize the input text
    encodings = tokenizer(input_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    
    # Run the model on the input text
    with torch.no_grad():
        outputs = model(**encodings)
    
    # Get the predictions
    predictions = torch.argmax(outputs.logits, dim=-1).tolist()
    
    # Map predictions to labels
    label_map = {0: "ham", 1: "spam"}
    for i, text in enumerate(input_texts):
        print(f"Message: {text}")
        print(f"Prediction: {label_map[predictions[i]]}")

# Example usage with custom input
custom_texts = [
    "You've won a free vacation! Claim your prize now.",
    "Congratulation on your win today at the match"
]

predict_spam(custom_texts)



Message: You've won a free vacation! Claim your prize now.
Prediction: ham
Message: Congratulation on your win today at the match
Prediction: ham


In [6]:
pip install gradio transformers torch


  pid, fd = os.forkpty()


Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting starlette<1.0,>=0.40.0 (from gradio)
  Downloading starlette-0.42.0-py3-none-any.whl.metadata (6

In [7]:
import gradio as gr
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("./spam_classifier_model")
tokenizer = BertTokenizer.from_pretrained("./spam_classifier_model")

# Define the prediction function
def classify_spam(input_text):
    # Tokenize the input text
    encodings = tokenizer(input_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    
    # Run the model on the input text
    with torch.no_grad():
        outputs = model(**encodings)
    
    # Get the predictions and confidence (softmax output)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    confidence, prediction = torch.max(probabilities, dim=-1)
    
    # Map predictions to labels
    label_map = {0: "Not spam", 1: "Spam"}
    result = label_map[prediction.item()]
    
    # Return result with confidence percentage
    return f"{result} (Confidence: {confidence.item() * 100:.2f}%)"

# Create Gradio interface
iface = gr.Interface(
    fn=classify_spam,  # Function to be called
    inputs=gr.Textbox(label="Enter your message"),  # Input component for user to type message
    outputs=gr.Textbox(label="Prediction Result"),  # Output component to show prediction
    live=True,  # Update output in real-time
    title="Spam Message Classifier",  # Title of the app
    description="Enter a message to check if it's spam or not along with the confidence score.",  # Description
)

# Launch the app
iface.launch()


* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://b7bbcfd941cc4907f8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


