In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [5]:
# Step 1: Download dataset
import kagglehub

path = kagglehub.dataset_download("sid321axn/malicious-urls-dataset")
print("Dataset downloaded at:", path)

# Step 2: Load and preprocess dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Load and sample small subset for fast training
df = pd.read_csv(f"{path}/malicious_phish.csv").dropna()
df = df.sample(n=2000, random_state=42)

# Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['type'])

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(df['url'], df['label'], test_size=0.2, random_state=42)
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

# Step 3: Tokenization using BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.rename_column('label', 'labels')
test_dataset = test_dataset.rename_column('label', 'labels')
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 4: Load small BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))

# Step 5: Training arguments (quick train)
training_args = TrainingArguments(
    output_dir="./results_fast_bert",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs_fast_bert",
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Step 6: Train
trainer.train()

# Step 7: Evaluate
preds_output = trainer.predict(test_dataset)
y_pred = torch.tensor(preds_output.predictions).argmax(axis=1)
print(classification_report(test_dataset['labels'], y_pred, target_names=le.classes_))


Dataset downloaded at: /root/.cache/kagglehub/datasets/sid321axn/malicious-urls-dataset/versions/1


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.380417
2,No log,0.305162


              precision    recall  f1-score   support

      benign       0.96      0.95      0.96       249
  defacement       0.81      0.98      0.89        60
     malware       0.93      0.61      0.74        23
    phishing       0.80      0.76      0.78        68

    accuracy                           0.91       400
   macro avg       0.88      0.83      0.84       400
weighted avg       0.91      0.91      0.90       400

