<a href="https://colab.research.google.com/github/sanisammani/Chatbot-/blob/main/Scamdetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
from torch.utils.data import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import wandb

# Start a Weights & Biases run for this script
wandb.init(name="scam-detection-run", project="t5-scam-detection")

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load dataset and check available columns
def load_data(file_path):
    """Read a CSV dataset into a DataFrame and print its column names.

    Args:
        file_path: Location of the CSV file, handed directly to
            ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    frame = pd.read_csv(file_path)
    # Debug aid: show the header so the column names can be checked.
    column_index = frame.columns
    print("Dataset Columns:", column_index)
    return frame

def preprocess_data(df, test_size=0.2, random_state=None):
    """Split the dataset into training and validation texts and labels.

    The text column is ``'input'`` and the label column is ``'output'`` when
    columns with those names exist. A role whose preferred name is missing
    falls back to the first column that is not already taken by the other
    role, so text and label can never resolve to the same column (which the
    plain ``columns[0]`` / ``columns[1]`` fallback allowed, e.g. for columns
    ``['label', 'input']``).

    Args:
        df: DataFrame holding one text column and one label column.
        test_size: Forwarded to ``train_test_split``; defaults to ``0.2``.
        random_state: Forwarded to ``train_test_split``. The default ``None``
            leaves the split unseeded; pass an int to make it reproducible.

    Returns:
        A tuple ``(train_texts, val_texts, train_labels, val_labels)`` of
        lists.

    Raises:
        ValueError: If two distinct columns cannot be found for the text and
            label roles.
    """
    columns = list(df.columns)
    text_col = 'input' if 'input' in columns else None
    label_col = 'output' if 'output' in columns else None

    # Resolve whichever role is still open, skipping the column the other
    # role already claimed.
    if text_col is None:
        text_col = next((col for col in columns if col != label_col), None)
    if label_col is None:
        label_col = next((col for col in columns if col != text_col), None)

    if text_col is None or label_col is None:
        raise ValueError(
            "Expected at least two distinct columns (text and label), "
            f"got: {columns}"
        )

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df[text_col].tolist(),
        df[label_col].tolist(),
        test_size=test_size,
        random_state=random_state,
    )
    return train_texts, val_texts, train_labels, val_labels

# Tokenizer
# Pretrained checkpoint name; the same constant is used further down to load the model.
MODEL_NAME = "t5-small"
# Module-level tokenizer shared by tokenize_function(), predict_scam() and the data collator.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(texts, labels):
    """Encode prompts and their target labels with the module-level tokenizer.

    Each text is prefixed with ``"detect scam: "``. Prompts and labels are
    both padded/truncated to 128 tokens and returned as PyTorch tensors.

    Args:
        texts: Iterable of input messages.
        labels: Target strings, passed to the tokenizer as given.

    Returns:
        A dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``; padding positions in ``"labels"`` are set to ``-100``.
    """
    # Identical tokenizer settings are used for the prompts and the targets.
    shared_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    prompts = [f"detect scam: {text}" for text in texts]
    encoded_inputs = tokenizer(prompts, **shared_kwargs)
    target_ids = tokenizer(labels, **shared_kwargs).input_ids

    # Replace pad ids with -100 so padding is ignored in the loss calculation.
    target_ids[target_ids == tokenizer.pad_token_id] = -100

    return {
        "input_ids": encoded_inputs["input_ids"],
        "attention_mask": encoded_inputs["attention_mask"],
        "labels": target_ids,
    }

# Custom Dataset Class
class ScamDataset(Dataset):
    """Dataset wrapper around a mapping of field name -> indexable values.

    Item ``idx`` is a dict containing, for every field in the mapping, the
    value stored at position ``idx``.
    """

    def __init__(self, encodings):
        # Mapping such as the dict returned by tokenize_function().
        self.encodings = encodings

    def __len__(self):
        # The number of examples is read from the "input_ids" field.
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

# Load data
# Reads the CSV from the current working directory and splits it into
# training and validation texts/labels.
df = load_data("scamdata.csv")  # Updated dataset path
train_texts, val_texts, train_labels, val_labels = preprocess_data(df)

# Tokenize data
# Both splits are encoded up front; tokenize_function pads every example to
# a fixed length, so all tensors in a split share the same shape.
train_encodings = tokenize_function(train_texts, train_labels)
val_encodings = tokenize_function(val_texts, val_labels)

# Convert to Dataset
train_dataset = ScamDataset(train_encodings)
val_dataset = ScamDataset(val_encodings)

# Load model
# Same checkpoint name as the tokenizer (MODEL_NAME).
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)  # Move model to GPU/CPU

# Training arguments
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",  # where the Trainer writes its outputs
    evaluation_strategy="epoch",  # evaluate on val_dataset once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="wandb",  # send training logs to the W&B run started above
)

# The examples are already padded to a fixed length by tokenize_function, so
# the collator mainly has to stack them into batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Train the model
# NOTE(review): the recorded output of this cell shows a training and
# validation loss of 0.0 for every epoch — worth confirming that the dataset
# contains more than one distinct label.
trainer.train()

# Save the fine-tuned model
# Model weights and tokenizer files go into the same directory so they can be
# reloaded together.
model.save_pretrained("./t5_scam_detector")
tokenizer.save_pretrained("./t5_scam_detector")

# Inference Function
def predict_scam(text):
    """Generate the model's label for a single message.

    The message gets the same ``"detect scam: "`` prefix and the same
    128-token truncation that ``tokenize_function`` applies to the training
    data, so inference inputs match what the model was fine-tuned on.

    Args:
        text: The message to classify.

    Returns:
        The decoded generation as a string, with special tokens removed.
    """
    input_text = f"detect scam: {text}"
    # Place the inputs on whatever device the model currently lives on, which
    # may differ from the module-level `device` once the Trainer has run.
    encoded = tokenizer(
        input_text,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    ).to(model.device)

    # Generation must run in eval mode: after training the model can still be
    # in train mode, where dropout makes predictions non-deterministic.
    was_training = model.training
    model.eval()
    try:
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )
    finally:
        # Restore the previous mode so calling this mid-training is harmless.
        if was_training:
            model.train()

    prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return prediction

# Evaluation Metrics
def compute_metrics(texts=None, labels=None):
    """Score ``predict_scam`` against reference labels and print the results.

    Predictions and references are compared as stripped strings, so
    surrounding whitespace or non-string labels (e.g. ints read from the CSV)
    do not turn a correct prediction into a mismatch.

    Args:
        texts: Messages to classify. Defaults to the module-level
            ``val_texts``.
        labels: Reference labels aligned with ``texts``. Defaults to the
            module-level ``val_labels``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``; precision, recall and F1 are weighted averages.
    """
    if texts is None:
        texts = val_texts
    if labels is None:
        labels = val_labels

    # zip() keeps the original pairing semantics (stops at the shorter input).
    pairs = list(zip(texts, labels))
    predictions = [predict_scam(text).strip() for text, _ in pairs]
    references = [str(label).strip() for _, label in pairs]

    accuracy = accuracy_score(references, predictions)
    # zero_division=0 yields the same scores as the default but without a
    # warning for labels that are never predicted (free-form generation can
    # easily produce or miss whole classes).
    precision, recall, f1, _ = precision_recall_fscore_support(
        references, predictions, average='weighted', zero_division=0
    )

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Run Evaluation
# compute_metrics() prints each score and returns them as a dict.
metrics = compute_metrics()
# Record the accuracy/precision/recall/f1 dict on the active W&B run.
wandb.log(metrics)


Dataset Columns: Index(['output', 'input'], dtype='object')




Epoch,Training Loss,Validation Loss
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0


Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
