In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
# Step 1: Data Preparation
# Read the labelled URL dataset (columns: URL, Label) from the working directory.
csv_path = 'phishing_site_urls.csv'
data = pd.read_csv(csv_path)


In [4]:
# Count missing entries per column to see whether any rows need to be dropped.
missing_values = data.isna().sum()
print(missing_values)



URL      0
Label    0
dtype: int64


In [5]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')



In [6]:
# Features are the raw URL strings; targets are the labels ("bad" or "good").
X, y = data['URL'], data['Label']



In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [8]:
# Step 2: Ensemble Learning
# Turn the URL strings into sparse TF-IDF features (at most 10,000 terms).
# The vocabulary is learned from the training split only and then reused,
# unchanged, to transform the test split.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



In [9]:
# Fit a seeded 100-tree Random Forest on the TF-IDF features
# (an SVM could be swapped in here if needed).
# The targets passed in are the original string labels, not the encoded ones.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_classifier.fit(X_train_tfidf, y_train)



In [10]:
# Learn the string -> integer label mapping from the training labels, then
# apply that same mapping to both the training and the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)



In [11]:
# Predict labels for the held-out TF-IDF features with the fitted Random Forest.
# The forest was fitted on the string labels, so the predictions are strings too.
rf_predictions = rf_classifier.predict(X_test_tfidf)



In [12]:
# Debugging aid: show the first ten Random Forest predictions.
sample_predictions = rf_predictions[:10]
print("Sample Predicted Labels (Random Forest):", sample_predictions)



Sample Predicted Labels (Random Forest): ['bad' 'bad' 'bad' 'bad' 'good' 'bad' 'good' 'bad' 'bad' 'bad']


In [13]:
# Map the Random Forest's string predictions to integer codes with the encoder
# fitted on y_train, so they can be compared against y_test_encoded.
rf_predictions_encoded = label_encoder.transform(rf_predictions)



In [14]:
# Fraction of test URLs whose encoded prediction matches the encoded true label.
rf_accuracy = accuracy_score(y_true=y_test_encoded, y_pred=rf_predictions_encoded)
print(f"Random Forest Classifier Accuracy: {rf_accuracy:.2f}")


Random Forest Classifier Accuracy: 0.96


In [15]:
# Step 3: Feature Extraction with BERT
# Load the tokenizer and the encoder from the same pre-trained checkpoint so
# that the vocabulary and the weights match.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [16]:
# Maximum number of tokens per URL (adjust as needed). The tokenizer calls
# below pad shorter inputs and truncate longer ones to exactly this length.
max_seq_length = 128



In [17]:
# Ensure X_train is a plain list of strings for the BERT tokenizer.
# A comprehension is used instead of Series.astype(str).tolist() so the cell is
# idempotent: it works on the original Series and also on the list it produces,
# whereas a list has no .astype method and would fail on a second run.
X_train = [str(url) for url in X_train]



In [18]:
# Tokenize all training URLs in a single batched tokenizer call.
# Every sequence is padded/truncated to max_seq_length, so the returned tensors
# are already stacked as (number of URLs, max_seq_length); there is no need to
# tokenize URL by URL and concatenate thousands of one-row tensors.
tokenized_texts = tokenizer(
    list(X_train),  # accept either a list or a pandas Series of strings
    padding='max_length',
    truncation=True,
    max_length=max_seq_length,
    return_tensors='pt',
)
input_ids = tokenized_texts['input_ids']
attention_mask = tokenized_texts['attention_mask']
# Use the encoded labels; CrossEntropyLoss expects int64 class indices.
labels = torch.tensor(y_train_encoded, dtype=torch.long)



In [24]:
# Bundle token ids, attention masks and labels into one dataset and iterate
# over it in shuffled mini-batches. The batch size is 8 (reduced from 16).
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, shuffle=True, batch_size=8)


In [25]:
# Define a simple CNN model for fine-tuning with BERT features
import torch.nn.functional as F


class CNNWithBERT(nn.Module):
    """BERT encoder followed by one convolution, a global max-pool and a linear head.

    Args:
        bert: Encoder module whose forward accepts ``(input_ids, attention_mask=...)``
            and returns an object exposing ``last_hidden_state``; it must also
            expose ``config.hidden_size``. Defaults to the notebook-level
            ``bert_model`` so that ``CNNWithBERT()`` keeps working unchanged.
        num_filters: Number of convolution output channels (default 128).
        kernel_height: Number of consecutive token positions each filter spans
            (default 3).
        num_classes: Size of the output layer (default 2).
    """

    def __init__(self, bert=None, num_filters=128, kernel_height=3, num_classes=2):
        super().__init__()
        self.bert = bert_model if bert is None else bert
        # Read the embedding width from the encoder instead of hard-coding 768,
        # so the filter always spans the full hidden dimension.
        hidden_size = self.bert.config.hidden_size
        self.conv2d = nn.Conv2d(
            in_channels=1,
            out_channels=num_filters,
            kernel_size=(kernel_height, hidden_size),
        )
        self.fc1 = nn.Linear(num_filters, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores, one row per input sequence."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Per-token hidden states (not BERT's pooled [CLS] output).
        token_states = outputs.last_hidden_state
        feature_map = token_states.unsqueeze(1)  # Add a channel dimension
        x = F.relu(self.conv2d(feature_map))
        x = x.squeeze(3)  # The filter covers the whole hidden width, so this dim is 1
        x = F.max_pool1d(x, kernel_size=x.size(2))  # Max over all remaining positions
        x = x.squeeze(2)  # Drop the pooled (length-1) position dimension
        return self.fc1(x)

In [26]:
# Define an instance of the CNNWithBERT model.
# The optimizer is created in the next cell together with the loss function;
# building an identical one here as well would only be overwritten.
cnn_model = CNNWithBERT()

In [27]:
# Loss function and optimizer used by the training loop.
# NOTE(review): cnn_model.parameters() includes the BERT weights, so lr=1e-3 is
# applied to the pre-trained encoder as well - confirm this is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=1e-3)

In [28]:
# Train the CNN model with gradient accumulation
num_epochs = 5
gradient_accumulation_steps = 4  # Accumulate gradients over 4 batches
num_batches = len(dataloader)

for epoch in range(num_epochs):
    cnn_model.train()
    total_loss = 0.0
    # Start each epoch with clean gradients. Inside the batch loop gradients are
    # cleared only right after an optimizer step, so they genuinely accumulate
    # across the batches of a group.
    optimizer.zero_grad()

    for step, batch in enumerate(dataloader, start=1):
        input_ids_batch, attention_mask_batch, labels_batch = batch
        outputs = cnn_model(input_ids_batch, attention_mask_batch)
        loss = criterion(outputs, labels_batch)

        # Log the unscaled loss so the reported average is the real per-batch loss.
        total_loss += loss.item()

        # Scale only what is back-propagated, so the summed gradients of a group
        # equal the gradient of the group's mean loss.
        (loss / gradient_accumulation_steps).backward()

        # Update after every full group, and once more for a trailing partial
        # group so its gradients are not thrown away at the end of the epoch.
        if step % gradient_accumulation_steps == 0 or step == num_batches:
            optimizer.step()
            optimizer.zero_grad()

    # Print the average loss for the epoch
    average_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss:.4f}")

# No learning-rate scheduler is created in this notebook. If one is added, call
# its step() once per epoch inside the epoch loop above.


KeyboardInterrupt: ignored

In [1]:
# Tokenize the held-out URLs with the same settings as the training URLs.
# The comprehension keeps the cell idempotent: it works whether X_test is still
# a pandas Series or already the list produced by an earlier run.
X_test = [str(url) for url in X_test]
# One batched tokenizer call returns tensors already stacked as
# (number of URLs, max_seq_length), so no per-URL torch.cat is needed.
tokenized_test_texts = tokenizer(
    X_test,
    padding='max_length',
    truncation=True,
    max_length=max_seq_length,
    return_tensors='pt',
)
input_ids_test = tokenized_test_texts['input_ids']
attention_mask_test = tokenized_test_texts['attention_mask']



NameError: ignored

In [2]:
# Predict with the CNN model
cnn_model.eval()
# Run inference in mini-batches: a single forward pass over the whole test set
# would have to hold the BERT activations for every URL at once.
test_loader = DataLoader(
    TensorDataset(input_ids_test, attention_mask_test),
    batch_size=64,
    shuffle=False,  # keep predictions in the same order as X_test
)
logit_chunks = []
with torch.no_grad():
    for test_ids_batch, test_mask_batch in test_loader:
        logit_chunks.append(cnn_model(test_ids_batch, test_mask_batch))
# Class scores for every test URL, then the index of the highest score per row.
cnn_outputs = torch.cat(logit_chunks, dim=0)
cnn_predictions = torch.argmax(cnn_outputs, dim=1)



NameError: ignored

In [None]:

# Convert the CNN's predicted class indices from a torch tensor to a NumPy
# array so they can be combined with the scikit-learn predictions below.
cnn_predictions_np = cnn_predictions.numpy()



In [None]:
# Use the trained Random Forest model to predict.
# The test URLs are re-vectorized with the already-fitted TF-IDF vocabulary;
# the resulting predictions are string labels, as the forest was fitted on strings.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
rf_predictions = rf_classifier.predict(X_test_tfidf)



In [None]:
# Combine the CNN and Random Forest predictions into one ensemble prediction.
# The Random Forest emits string labels, so they are first mapped to the same
# integer codes the CNN was trained on. LabelEncoder numbers classes in sorted
# order, so with this dataset's labels 'bad' -> 0 and 'good' -> 1.
rf_votes = label_encoder.transform(rf_predictions)
cnn_votes = np.asarray(cnn_predictions_np)
assert len(cnn_votes) == len(rf_votes), "CNN and Random Forest predicted different numbers of samples"

# With only two voters a disagreement has no majority. The rule here is:
# predict class 1 only when both models predict 1, otherwise predict class 0.
votes_for_class_one = cnn_votes + rf_votes
ensemble_predictions = np.where(votes_for_class_one >= 2, 1, 0).tolist()


In [None]:

# Score the ensemble's integer predictions against the encoded test labels.
ensemble_accuracy = accuracy_score(y_true=y_test_encoded, y_pred=ensemble_predictions)
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}")