In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder



In [5]:
# Step 1: Data Preparation
data = pd.read_csv('phishing_site_urls.csv')
X = data['URL']  # URL text data
y = data['Label']  # Binary labels (bad or good)


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Step 2: Ensemble Learning
# Convert text data into numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [10]:
# Train a Random Forest classifier (you can replace with SVM if needed)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)

In [22]:
# Convert string labels to numerical labels
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
# Convert string labels to numerical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)


In [15]:
# Make predictions using the trained classifier
rf_predictions = rf_classifier.predict(X_test_tfidf)



In [16]:
# Calculate accuracy using numerical labels
rf_accuracy = accuracy_score(y_test_encoded, rf_predictions)
print(f"Random Forest Classifier Accuracy: {rf_accuracy:.2f}")

Random Forest Classifier Accuracy: 0.00


In [17]:
# Step 3: Feature Extraction with BERT
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [18]:
# Define a maximum sequence length (adjust as needed)
max_seq_length = 128


In [19]:
# Ensure X_train is a list of strings
X_train = X_train.astype(str).tolist()

In [None]:
# Tokenize and convert text data to BERT embeddings
tokenized_texts = [tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_length, return_tensors='pt') for text in X_train]
input_ids = torch.cat([text['input_ids'] for text in tokenized_texts], dim=0)
attention_mask = torch.cat([text['attention_mask'] for text in tokenized_texts], dim=0)
labels = torch.tensor(y_train_encoded)  # Use the encoded labels

In [24]:
# Create DataLoader for training
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [25]:
# Define a simple CNN model for fine-tuning with BERT features
class CNNWithBERT(nn.Module):
    def __init__(self):
        super(CNNWithBERT, self).__init__()
        self.bert = bert_model
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=128, kernel_size=3)
        self.fc1 = nn.Linear(128, 2)  # 2 output classes


In [26]:
def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        x = self.conv1(pooled_output.permute(0, 2, 1))
        x = torch.max(x, 2)[0]
        x = self.fc1(x)
        return x

In [27]:
# Fine-tune the CNN model using BERT features
cnn_model = CNNWithBERT()

In [28]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=1e-3)


In [29]:
# Train the CNN model
num_epochs = 5
for epoch in range(num_epochs):
    cnn_model.train()
    for batch in dataloader:
        input_ids_batch, attention_mask_batch, labels_batch = batch
        optimizer.zero_grad()
        outputs = cnn_model(input_ids_batch, attention_mask_batch)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()

NotImplementedError: ignored

In [None]:
# Step 4: Ensemble Integration
# Use the trained Random Forest and CNN models for ensemble
cnn_model.eval()
with torch.no_grad():
    tokenized_texts_test = [tokenizer(text, padding=True, truncation=True, return_tensors='pt') for text in X_test]
    input_ids_test = torch.cat([text['input_ids'] for text in tokenized_texts_test], dim=0)
    attention_mask_test = torch.cat([text['attention_mask'] for text in tokenized_texts_test], dim=0)


In [None]:
# Get CNN model predictions
    cnn_outputs = cnn_model(input_ids_test, attention_mask_test)
    cnn_predictions = torch.argmax(cnn_outputs, dim=1)

In [None]:
# Combine the predictions from Random Forest and CNN
ensemble_predictions = []
for rf_pred, cnn_pred in zip(rf_predictions, cnn_predictions):
    # You can define your own logic for combining the predictions (e.g., voting)
    # Here, we take the CNN prediction if it's "bad" (class 0), otherwise use RF prediction
    ensemble_pred = cnn_pred.item() if cnn_pred == 0 else rf_pred
    ensemble_predictions.append(ensemble_pred)

In [None]:
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
print(f"Ensemble Accuracy: {ensemble_accuracy:.2f}")