In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable and fall back to a hidden prompt when it is not set.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed in the file history and should be revoked in the account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [7]:
import inspect

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Load dataset
df = pd.read_csv("suspicious_conversations_dataset.csv")

# Map labels to numeric values (if not already).
# Testing for "not numeric" rather than "dtype == object" also covers pandas'
# dedicated string dtypes, which would otherwise skip the mapping entirely.
label_map = {"Safe": 0, "Suspicious": 1, "Highly Suspicious": 2}
if not pd.api.types.is_numeric_dtype(df["Label"]):
    mapped_labels = df["Label"].map(label_map)
    # Series.map yields NaN for any value missing from label_map (typos, blanks);
    # fail loudly here instead of passing NaN labels on to the split and the model.
    unmapped = df.loc[mapped_labels.isna(), "Label"].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected values in 'Label' column: {unmapped}")
    df["Label"] = mapped_labels.astype("int64")

# Split dataset: 80/20, stratified on the label so both parts keep the class
# proportions; random_state fixes the shuffle so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["Label"], random_state=42)

# Load tokenizer and model.
# The classification head is sized from label_map instead of a separate literal,
# and the class names are stored in the model config so the saved model reports
# readable labels rather than generic LABEL_<n> placeholders.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
id2label = {index: name for name, index in label_map.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

# Tokenization step applied to the datasets
def tokenize(batch):
    """Run the tokenizer over the "Message" entries of ``batch``.

    Padding and truncation are both enabled. Returns the tokenizer's output
    unchanged.
    """
    messages = batch["Message"]
    encoded = tokenizer(messages, truncation=True, padding=True)
    return encoded

def _to_model_dataset(frame):
    """Turn one DataFrame split into a tokenized, torch-formatted dataset."""
    dataset = Dataset.from_pandas(frame).map(tokenize, batched=True)
    # The model expects the target column to be called "labels"
    dataset = dataset.rename_column("Label", "labels")
    # Only expose the model inputs and the target, as torch tensors
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


train_dataset = _to_model_dataset(train_df)
test_dataset = _to_model_dataset(test_df)

# Training arguments
# "evaluation_strategy" was renamed to "eval_strategy" in newer transformers
# releases; pick whichever keyword the installed TrainingArguments accepts so
# the cell runs on both older and newer versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="no",  # no intermediate checkpoints; the model is saved explicitly later
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=False,
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)

# Trainer
# Newer transformers releases accept the tokenizer through "processing_class"
# and emit a deprecation warning for the "tokenizer" keyword; use whichever
# name the installed Trainer supports.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    **{_tokenizer_key: tokenizer},
)

# Train the model
trainer.train()

# Run the trained model over the held-out split
preds = trainer.predict(test_dataset)

# Predicted class = index of the highest score in each row
scores = torch.tensor(preds.predictions)
y_pred = scores.argmax(dim=1)

# Reference labels come straight from the test DataFrame
y_true = test_df["Label"].values

# Build the per-class report, then print it under a heading
class_names = ["Safe", "Suspicious", "Highly Suspicious"]
report = classification_report(y_true, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report)

# Persist the trained model and its tokenizer side by side in one directory
save_dir = "./trained_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Model saved successfully!")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 400/400 [00:00<00:00, 15845.50 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 16736.38 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.1409,0.063891
2,0.0091,0.006214
3,0.006,0.004604



Classification Report:
                   precision    recall  f1-score   support

             Safe       1.00      1.00      1.00        73
       Suspicious       1.00      1.00      1.00        18
Highly Suspicious       1.00      1.00      1.00         9

         accuracy                           1.00       100
        macro avg       1.00      1.00      1.00       100
     weighted avg       1.00      1.00      1.00       100

Model saved successfully!
