In [1]:
import pandas as pd 

In [2]:
# Load the labelled URL dataset from malicious_phish.csv into `main`.
# The rendered output below shows two data columns: `url` and `type`.
main = pd.read_csv("malicious_phish.csv")
# Bare last expression so the notebook renders the frame as the cell output.
main

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [None]:
import pandas as pd

# (csv path, class label) pairs: each file holds URLs of a single class
file_paths = [
    ("FinalDataset/URL/Benign_list_big_final.csv", "benign"),
    ("FinalDataset/URL/DefacementSitesURLFiltered.csv", "defacement"),
    ("FinalDataset/URL/Malware_dataset.csv", "malware"),
    ("FinalDataset/URL/phishing_dataset.csv", "phishing"),
    ("FinalDataset/URL/spam_dataset.csv", "spam")
]

# Build one (url, type) frame per source file. The files have no header row,
# so the single parsed column is named "url" and the class label is attached
# as a constant "type" column.
dfs = [
    pd.read_csv(file_path, header=None)
    .set_axis(["url"], axis=1)
    .assign(type=url_type)
    for file_path, url_type in file_paths
]

# The previously loaded `main` frame goes last in the stack
dfs.append(main)

# Stack everything, drop exact duplicate rows, and renumber the index 0..n-1
merged_df = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Show the leading rows of the merged frame
merged_df.head(100000)

Unnamed: 0,url,type
0,http://1337x.to/torrent/1048648/American-Snipe...,benign
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign
...,...,...
99995,http://www.ccent.com.au/index.php?view=article...,defacement
99996,http://www.ccent.com.au/index.php?option=com_c...,defacement
99997,http://www.ccent.com.au/index.php?option=com_m...,defacement
99998,http://www.ccent.com.au/index.php?view=article...,defacement


Preprocessing

In [None]:
# Report the per-column count of missing entries before any rows are removed
print("Missing values before cleaning:")
print(merged_df.isna().sum())

# Discard every row that contains at least one missing value
merged_df = merged_df.dropna(axis=0, how='any')

# Map the string class names in `type` to integer codes stored in `label`
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(merged_df['type'])
merged_df['label'] = le.transform(merged_df['type'])

# Clean URLs
import re
def clean_url(url):
    """Reduce a URL string to its host part.

    Steps, in order:
      1. strip a leading ``http://`` or ``https://`` (any letter case);
      2. strip a leading ``www.`` / ``www<digits>.`` label (any letter case);
      3. keep only the text before the first ``/``.

    Both patterns are anchored at the start of the string so that a host
    which merely *contains* "www." (e.g. ``awww.example.com``) is left
    intact instead of having those characters cut out of the middle.

    Parameters
    ----------
    url : str
        Raw URL, with or without a scheme.

    Returns
    -------
    str
        The host portion; letter case of the host itself is not changed.
    """
    url = re.sub(r'^https?://', '', url, flags=re.IGNORECASE)  # Remove protocol
    url = re.sub(r'^www\d*\.', '', url, flags=re.IGNORECASE)   # Remove leading www
    return url.split('/', 1)[0]                                # Keep domain

# Element-wise: store the cleaned form of every `url` value in `clean_url`
merged_df['clean_url'] = merged_df['url'].map(clean_url)

Class Distribution & Balancing

In [None]:
# Plot class distribution
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
merged_df['type'].value_counts().plot(kind='bar')
plt.title("Class Distribution Before Balancing")
plt.savefig('class_distribution_before.png')
plt.show()

# Balance using SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

y = merged_df['label']

# Split the raw text FIRST so the TF-IDF vocabulary and IDF weights are
# learned from training rows only; fitting the vectorizer on the whole corpus
# before splitting lets test-set statistics leak into the features.
# A fixed random_state makes the split (and everything below) reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    merged_df['clean_url'], y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF Vectorization (required for SMOTE on text data)
tfidf = TfidfVectorizer(max_features=1000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix in merged_df row order, produced with the vectorizer
# fitted on the training rows. Kept under the name X_tfidf because later
# cells combine it with other features.
# NOTE: a later re-split of features derived from X_tfidf must reuse the same
# split (same random_state and stratify) for the held-out rows to stay unseen
# by the vectorizer.
X_tfidf = tfidf.transform(merged_df['clean_url'])

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Plot balanced distribution
plt.figure(figsize=(10, 6))
pd.Series(y_train_bal).value_counts().plot(kind='bar')
plt.title("Class Distribution After Balancing")
plt.savefig('class_distribution_after.png')
plt.show()

Feature Extraction

In [None]:
# Structural Features
def extract_structural_features(url):
    """Return simple character-level statistics of ``url`` as a dict.

    Keys (in insertion order): ``length`` (number of characters),
    ``num_dots``, ``num_hyphens``, ``num_slash`` (occurrence counts of
    ``.``, ``-`` and ``/``) and ``has_query`` (1 if the string contains
    ``?``, otherwise 0).
    """
    features = {'length': len(url)}
    counted_chars = (('num_dots', '.'), ('num_hyphens', '-'), ('num_slash', '/'))
    for feature_name, char in counted_chars:
        features[feature_name] = url.count(char)
    features['has_query'] = int('?' in url)
    return features

# Compute the structural features on the raw URL, not on `clean_url`:
# `clean_url` keeps only the text before the first "/", so on that column
# `num_slash` is always 0 and `has_query` is almost always 0.
# The scheme is stripped first because the source lists are inconsistent about
# including "http(s)://"; leaving it in would make length / slash counts
# reflect which file a row came from rather than the URL's structure.
url_without_scheme = merged_df['url'].str.replace(r'(?i)^https?://', '', regex=True)
structural_features = url_without_scheme.apply(extract_structural_features)
structural_df = pd.DataFrame(structural_features.tolist())

# Combine with TF-IDF
import numpy as np
from scipy.sparse import hstack

# `.values` drops the pandas index, so rows are matched to X_tfidf purely by
# position. CSR output supports the row indexing that the later split needs.
X_combined = hstack([X_tfidf, structural_df.values], format='csr')

Model Implementation

In [None]:
# Split combined features
# accuracy_score was used below without ever being imported (NameError on a
# fresh kernel), so import it here.
from sklearn.metrics import accuracy_score

# Fixed random_state so the split, and therefore every reported score, is
# reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, stratify=y, random_state=42
)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
# n_jobs=-1 only parallelises tree building; with a fixed seed the fitted
# forest is the same as a single-threaded run.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred)}")

# XGBoost
# accuracy_score is not imported anywhere else in the notebook; import it here
# so this step does not fail with NameError.
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Seeded so repeated runs give the same model and score.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print(f"XGBoost Accuracy: {accuracy_score(y_test, xgb_pred)}")

# LSTM (simplified example)
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
# Embedding / LSTM / Dense were used without being imported (NameError).
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Turn each cleaned URL into a fixed-length (100) sequence of token ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(merged_df['clean_url'])
sequences = tokenizer.texts_to_sequences(merged_df['clean_url'])
X_seq = pad_sequences(sequences, maxlen=100)

# Hold out a stratified, shuffled 20% for validation.
# Keras' `validation_split` takes the LAST 20% of the rows as given, and
# merged_df was built by concatenating the source files one after another, so
# that tail is not a representative sample of the classes.
# Separate variable names are used so the X_train / X_test / y_train / y_test
# from the tree-model split above are not overwritten.
seq_labels = merged_df['label'].to_numpy()
X_seq_train, X_seq_val, y_seq_train, y_seq_val = train_test_split(
    X_seq, seq_labels, test_size=0.2, stratify=seq_labels, random_state=42
)

# One output unit per class present in the data, instead of a hard-coded 5
num_classes = merged_df['label'].nunique()

# Train LSTM (use GPU if available)
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64),
    LSTM(64),
    Dense(num_classes, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_seq_train, y_seq_train, epochs=5, validation_data=(X_seq_val, y_seq_val))

Visualization & Report

In [None]:
# Confusion Matrix
from sklearn.metrics import ConfusionMatrixDisplay
# from_estimator predicts on X_test with `rf` and draws the matrix on a new
# figure; the returned display object exposes that figure and its axes.
cm_display = ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test)
cm_display.ax_.set_title("Random Forest Confusion Matrix")
cm_display.figure_.savefig('confusion_matrix_rf.png')

# Feature Importance
# One bar per feature column, positioned by column index
importances = rf.feature_importances_
fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
ax_imp.bar(range(len(importances)), importances)
ax_imp.set_title("Feature Importance (Random Forest)")
fig_imp.savefig('feature_importance.png')

In [None]:
# LLM-Based Model (BERT) Integration


LLM-Based Model (BERT) Integration


In [None]:
# BERT Fine-Tuning (LLM)
# accuracy_score was used at the end of this cell without being imported.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Tokenize URLs
# NumPy output (not TF tensors): train_test_split selects rows with integer
# index arrays, which NumPy arrays support and tf.Tensor objects do not.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(
    merged_df['clean_url'].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='np'
)

# Prepare labels
labels = merged_df['label'].to_numpy()

# Split into train-test
# input_ids and attention_mask are split together so each row keeps its own
# mask; without the mask the model attends to the padding tokens.
(ids_train, ids_test,
 mask_train, mask_test,
 y_train, y_test) = train_test_split(
    inputs['input_ids'], inputs['attention_mask'], labels,
    test_size=0.2, stratify=labels, random_state=42
)
X_train = {'input_ids': ids_train, 'attention_mask': mask_train}
X_test = {'input_ids': ids_test, 'attention_mask': mask_test}

# Load BERT model
# One output logit per class present in the data, instead of a hard-coded 5
num_labels = merged_df['label'].nunique()
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Compile and train
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
# The training rows were shuffled by train_test_split, so taking the last 10%
# as validation here is a random sample.
model.fit(X_train, y_train, epochs=2, batch_size=16, validation_split=0.1)

# Evaluate
bert_pred = model.predict(X_test).logits.argmax(axis=1)
print(f"BERT Accuracy: {accuracy_score(y_test, bert_pred)}")