# Preprocessing

In [1]:
#Importing the necessary packages:
import pandas as pd
import re

In [3]:
df = pd.read_csv("malicious_phish.csv")


In [3]:
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [4]:
df = df[(df["type"] == "benign") | (df["type"] == "phishing")]

In [36]:
print(f'Number of benign URLs: {len(df[df["type"] == "benign"])}')
print(f'Number of phishing URLs: {len(df[df["type"] == "phishing"])}')


Number of benign URLs: 428103
Number of phishing URLs: 94111


In [5]:
def url_split(url_name):
    
    UrlLength = len(url_name)
    NumDots = url_name.count(".")
    NumDash = url_name.count("-")    
    NumAtSymbol = url_name.count("@")
    NumTildeSymbol = url_name.count("~")
    NumUnderscore = url_name.count("_")
    NumPercent = url_name.count("%")    
    NumAmpersand = url_name.count("&")
    NumHash = url_name.count("#")
    NumNumericChars = len(re.findall(r'\d', url_name))


    if url_name.startswith("http://"):
        NoHttps = 1
        url_name = url_name[7:]
    elif url_name.startswith("https://"):
        NoHttps = 0
        url_name = url_name[8:]
    else:
        NoHttps = 0

    url_parts = url_name.split("/")
    
    subdomain_count = url_parts[0].count(".")
    SubdomainLevels = subdomain_count
    
    path_component = "/".join(url_parts[1:])
    path_segments = path_component.split("/")
    PathLevel = len(path_segments)
    
    NumDashInHostname = url_parts[0].count("-")

    query_part = url_parts[-1]
    query_segments = query_part.split("?")
    query_params = query_segments[1] if len(query_segments) > 1 else ""
    NumQueryComponents = len(query_params.split("&"))

    return {
        "UrlLength": UrlLength,
        "NumDots": NumDots,
        "NumDash": NumDash,
        "NumAtSymbol": NumAtSymbol,
        "NumTildeSymbol": NumTildeSymbol,
        "NumUnderscore": NumUnderscore,
        "NumPercent": NumPercent,
        "NumAmpersand": NumAmpersand,
        "NumHash": NumHash,
        "NumNumericChars": NumNumericChars,
        "NoHttps": NoHttps,
        "PathLevel": PathLevel,
        "SubdomainLevels": SubdomainLevels,
        "NumDashInHostname": NumDashInHostname,
        "NumQueryComponents": NumQueryComponents
    }

# Example usage
url_http = "http://br-icloud.com.br/subdomain/page?param1=value1&param2=value2"
url_https = "https://example.com/path/to/resource"
url_no_http_https = "example.com/resource"
result_http = url_split(url_http)
result_https = url_split(url_https)
result_no_http_https = url_split(url_no_http_https)
print(result_http)
print(result_https)
print(result_no_http_https)




{'UrlLength': 66, 'NumDots': 2, 'NumDash': 1, 'NumAtSymbol': 0, 'NumTildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumAmpersand': 1, 'NumHash': 0, 'NumNumericChars': 4, 'NoHttps': 1, 'PathLevel': 2, 'SubdomainLevels': 2, 'NumDashInHostname': 1, 'NumQueryComponents': 2}
{'UrlLength': 36, 'NumDots': 1, 'NumDash': 0, 'NumAtSymbol': 0, 'NumTildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'PathLevel': 3, 'SubdomainLevels': 1, 'NumDashInHostname': 0, 'NumQueryComponents': 1}
{'UrlLength': 20, 'NumDots': 1, 'NumDash': 0, 'NumAtSymbol': 0, 'NumTildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'PathLevel': 1, 'SubdomainLevels': 1, 'NumDashInHostname': 0, 'NumQueryComponents': 1}


In [6]:
df[["UrlLength", "NumDots", "NumDash", "NumAtSymbol", "NumTildeSymbol", "NumUnderscore", "NumPercent",
    "NumAmpersand", "NumHash", "NumNumericChars", "NoHttps", "PathLevel", "SubdomainLevels",
    "NumDashInHostname", "NumQueryComponents"]] = df["url"].apply(url_split).apply(pd.Series)

In [7]:
df.head()

Unnamed: 0,url,type,UrlLength,NumDots,NumDash,NumAtSymbol,NumTildeSymbol,NumUnderscore,NumPercent,NumAmpersand,NumHash,NumNumericChars,NoHttps,PathLevel,SubdomainLevels,NumDashInHostname,NumQueryComponents
0,br-icloud.com.br,phishing,16,2,1,0,0,0,0,0,0,0,0,1,2,1,1
1,mp3raid.com/music/krizz_kaliko.html,benign,35,2,0,0,0,1,0,0,0,1,0,2,1,0,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,2,0,0,0,0,0,0,0,1,0,3,1,0,1
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign,118,2,16,0,0,0,0,0,0,1,1,3,1,0,1
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign,45,2,1,0,0,1,0,0,0,4,0,6,2,0,1


In [2]:
#df.to_csv("data_preprocessed.csv", index = False)
df = pd.read_csv("data_preprocessed.csv")


# Building the models

## Random Forest

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

### Baseline RF Model

In [4]:
X = df.drop(columns=["type", "url"])
y = df["type"]

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf_baseline = RandomForestClassifier(n_estimators=100, random_state=42)

In [5]:
clf_baseline.fit(X_train, y_train)
y_pred_baseline = clf_baseline.predict(X_test)

In [6]:
accuracy = accuracy_score(y_test, y_pred_baseline)
classification_report_str = classification_report(y_test, y_pred_baseline)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_str)

Accuracy: 0.8977719904636979
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94     85611
           1       0.79      0.60      0.68     18832

    accuracy                           0.90    104443
   macro avg       0.85      0.78      0.81    104443
weighted avg       0.89      0.90      0.89    104443



In [7]:
confusion_matrix(y_test, y_pred_baseline)

array([[82553,  3058],
       [ 7619, 11213]], dtype=int64)

### SMOTE Sampling RF Model

In [8]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

clf_smote = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

In [9]:
clf_smote.fit(X_train_resampled, y_train_resampled)
y_pred_smote = clf_smote.predict(X_test)

In [10]:
accuracy = accuracy_score(y_test, y_pred_smote)
classification_report_str = classification_report(y_test, y_pred_smote)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_str)

Accuracy: 0.8388786227894641
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.84      0.89     85611
           1       0.53      0.86      0.66     18832

    accuracy                           0.84    104443
   macro avg       0.75      0.85      0.78    104443
weighted avg       0.89      0.84      0.85    104443



In [11]:
confusion_matrix(y_test, y_pred_smote)

array([[71501, 14110],
       [ 2718, 16114]], dtype=int64)

### Hyperparameter-Tuned RF Model

In [14]:
param_grid = {
    'n_estimators': [100],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
scoring = 'accuracy'

In [13]:
#Takes long to run, will run on final code
#grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
#                           param_grid=param_grid,
#                           scoring=scoring,
#                           cv=5,
#                           n_jobs=-1)

#grid_search.fit(X_train, y_train)


KeyboardInterrupt: 

In [None]:
#best_model = grid_search.best_estimator_
#best_params = grid_search.best_params_

#y_val_pred = best_model.predict(X_test)
#validation_accuracy = accuracy_score(y_test, y_val_pred)

In [15]:
from sklearn.model_selection import RandomizedSearchCV

randomized_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=5,
    scoring=scoring,
    cv=5,
    n_jobs=-1
)

randomized_search.fit(X_train, y_train)

best_model = randomized_search.best_estimator_
best_params = randomized_search.best_params_

y_pred_best = best_model.predict(X_test)
validation_accuracy = accuracy_score(y_test, y_pred_best)

In [17]:
print(validation_accuracy)
confusion_matrix(y_test, y_pred_best)

0.8990166885286711


array([[83004,  2607],
       [ 7940, 10892]], dtype=int64)

## Neural Network

### Baseline NN model

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras import layers

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2964c176590>

In [19]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.892745316028595


In [20]:
y_nn_pred = (model.predict(X_test) > 0.5).astype(int)
confusion_matrix(y_test, y_nn_pred)



array([[82761,  2850],
       [ 8352, 10480]], dtype=int64)

### SMOTE NN Model

In [20]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Create a neural network model
model_smote = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model_smote.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_smote.fit(X_train_smote, y_train_smote, epochs=10, batch_size=32, validation_split=0.2)

test_loss_smote, test_accuracy_smote = model_smote.evaluate(X_test, y_test)
print(f"Test Accuracy (SMOTE Model): {test_accuracy_smote}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy (SMOTE Model): 0.8522256016731262
