In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import mannwhitneyu
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import sklearn.metrics as metrics
from scipy.stats import randint, uniform
import pickle
from xgboost import XGBClassifier

Loading Dataset


In [None]:
# Load the phishing dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='dataset_phishing.csv')

Data cleaning and Preprocessing

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(5)

In [None]:
# Print the per-column dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Show every column label in the loaded frame.
df.columns


In [None]:
# Class balance of the target column as loaded.
df.loc[:, 'status'].value_counts()

In [None]:
# Encode the target as integers: legitimate -> 0, phishing -> 1.
mapping = {'legitimate':0, 'phishing':1}

# Series.map turns every value that is not a key of `mapping` into NaN, so
# re-running this cell once the column already holds 0/1 would wipe the
# target. Only apply the mapping while the string labels are still present,
# which makes the cell safe to execute more than once.
if df['status'].isin(list(mapping)).any():
    df['status'] = df['status'].map(mapping)

In [None]:
# Re-check the class balance after the label mapping has been applied.
df.loc[:, 'status'].value_counts()

In [None]:
# Pairwise Pearson correlation over the numeric columns only
# (non-numeric columns are excluded by numeric_only=True).
corr_matrix = df.corr(method='pearson', numeric_only=True)
corr_matrix

In [None]:
# Correlation of every numeric column with the encoded target.
target_corr = corr_matrix.loc[:, 'status']
target_corr

In [None]:
# Keep the columns whose absolute correlation with the target exceeds the cut-off.
# 'status' itself always passes (its self-correlation is 1), so it is in the list.
threshold = 0.1
is_relevant = (target_corr.abs() > threshold).to_numpy()
relevant_features = target_corr.index[is_relevant].tolist()
relevant_features

In [None]:
# Feature matrix: the selected columns minus the target itself.
X = df[relevant_features].drop(columns='status')
# Target vector: the encoded label column.
y = df['status']

In [None]:
# Preview the first five rows of the feature matrix.
X.head(5)

In [None]:
# Preview the first five target values.
y.head(5)

In [None]:
# Annotated heatmap of the correlations between all numeric columns.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Same heatmap, restricted to the columns that passed the correlation cut-off.
correlation_matrix = df[relevant_features].corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Step 1: Normality Analysis
# `relevant_features` still contains the target ('status' correlates perfectly
# with itself), but a binary label is not a feature whose distribution we want
# to test, so it is skipped in both loops below.
feature_columns = [column for column in relevant_features if column != 'status']

# Plot a histogram and a Q-Q plot side by side for each feature
for column in feature_columns:
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(10, 4))

    # Histogram
    sns.histplot(df[column], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {column}')

    # Q-Q plot against a normal distribution
    stats.probplot(df[column].dropna(), dist="norm", plot=qq_ax)
    qq_ax.set_title(f'Q-Q Plot of {column}')

    fig.tight_layout()
    plt.show()
    # Release the figure so a long feature list does not pile up open figures.
    plt.close(fig)

# Statistical test for normality using Shapiro-Wilk for each feature
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be inaccurate
# for N > 5000 - confirm the row count before relying on these p-values.
for column in feature_columns:
    stat, p = stats.shapiro(df[column].dropna())
    print(f'{column}: Statistics={stat:.3f}, p={p:.3f}')
    if p > 0.05:
        print(f'{column} appears Gaussian (fail to reject H0)')
    else:
        print(f'{column} does NOT appear Gaussian (reject H0)')

In [None]:
# Step 2: Correlation Analysis
# Correlations among the selected columns only, drawn as an annotated heatmap.
correlation_matrix = df[relevant_features].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix of Relevant Features")
plt.show()

In [None]:
# Step 3: Statistical Testing - Comparing Phishing vs. Legitimate Sites
# The class masks are identical for every feature, so build them once.
is_phishing = df['status'] == 1
is_legitimate = df['status'] == 0

for column in relevant_features:
    # `relevant_features` includes the target; comparing 'status' between the
    # two 'status' groups is trivially significant and carries no information.
    if column == 'status':
        continue

    phishing_data = df.loc[is_phishing, column].dropna()
    legitimate_data = df.loc[is_legitimate, column].dropna()

    # Mann-Whitney U test (non-parametric) given possible non-normality.
    # The alternative is spelled out so the result does not depend on the
    # installed SciPy version's default.
    stat, p = mannwhitneyu(phishing_data, legitimate_data, alternative='two-sided')
    print(f'{column}: Mann-Whitney U Test Statistics={stat:.3f}, p={p:.3f}')
    if p < 0.05:
        print(f"Significant difference in {column} between phishing and legitimate (reject H0)")
    else:
        print(f"No significant difference in {column} (fail to reject H0)")

In [None]:
# Step 4: Hypothesis Testing Example (for specific feature comparisons)
# Example Hypothesis: Feature 'length_hostname' has a higher median in phishing sites
feature = 'length_hostname'

# Build the two samples for THIS feature. Previously the test reused
# `phishing_data` / `legitimate_data` left over from the last iteration of the
# Step 3 loop, so it was run on a different column than the one reported.
phishing_data = df.loc[df['status'] == 1, feature].dropna()
legitimate_data = df.loc[df['status'] == 0, feature].dropna()

# Calculate medians in phishing vs. legitimate
phishing_median = phishing_data.median()
legitimate_median = legitimate_data.median()

print(f'Median of {feature} in phishing sites: {phishing_median}')
print(f'Median of {feature} in legitimate sites: {legitimate_median}')

# One-sided Mann-Whitney U test: are phishing values stochastically greater?
stat, p = mannwhitneyu(phishing_data, legitimate_data, alternative='greater')
print(f'Mann-Whitney U Test for {feature}: Statistics={stat:.3f}, p={p:.3f}')
if p < 0.05:
    print(f"Hypothesis supported: {feature} median is significantly higher in phishing sites.")
else:
    print(f"Hypothesis not supported: No significant median difference for {feature}.")

Model Development

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise features to zero mean and unit variance (the estimator's defaults).
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn the scaling statistics from the training split only.
X_train_scaled = scaler.fit_transform(X_train)
# Apply the SAME statistics to the test split. Calling fit_transform here would
# re-fit the scaler on the test data, so train and test would be scaled
# differently and information from the held-out set would leak into
# preprocessing.
X_test_scaled = scaler.transform(X_test)

Random Forest

In [None]:
# Train a Random Forest on the scaled training data.
# The seed is fixed (matching the split's random_state) so that Restart & Run
# All reproduces the same forest, the same metrics and the same saved model.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled,y_train)

In [None]:
# Predict class labels for the scaled test split with the fitted Random Forest.
rf_predict = rf.predict(X_test_scaled)

In [None]:
# Test-set accuracy of the Random Forest, shown as a percentage with one decimal.
accuracy = accuracy_score(y_test,rf_predict)
# The precision argument belongs to round(); it used to be passed to format(),
# which silently ignored it and printed a whole-number percentage.
print("Accuracy:{}%".format(round(accuracy * 100, 1)))

In [None]:
# Test-set metrics for the Random Forest.
rf_Accuracy_Score = accuracy_score(y_test,rf_predict)
rf_JaccardIndex = jaccard_score(y_test,rf_predict)
rf_F1_Score = f1_score(y_test,rf_predict)
# Log loss scores predicted probabilities, not hard labels: feeding it 0/1
# predictions just yields a clipped, rescaled error rate. Use predict_proba.
rf_Log_Loss = log_loss(y_test,rf.predict_proba(X_test_scaled))

In [None]:
# Report the Random Forest test-set metrics, one per line.
rf_metrics = (
    ("Accuracy", rf_Accuracy_Score),
    ("Jaccard Index", rf_JaccardIndex),
    ("F1 Score", rf_F1_Score),
    ("Log Loss", rf_Log_Loss),
)
for metric_name, metric_value in rf_metrics:
    print(f"{metric_name}: {metric_value}")

In [None]:
# Confusion matrix for the Random Forest (rows: true labels, columns: predictions).
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_predict)
rf_conf_matrix

In [None]:
# Draw the Random Forest confusion matrix with integer cell annotations.
ax = sns.heatmap(rf_conf_matrix, annot=True, fmt='d', cmap='Greens')
ax.set(xlabel='Predicted Label', ylabel='True Label', title='Confusion Matrix')
plt.show()

In [None]:
# Per-class precision, recall and F1 for the Random Forest predictions.
rf_report = classification_report(y_true=y_test, y_pred=rf_predict)
print(rf_report)

SVM

In [None]:
# Train a support vector classifier with default settings on the scaled training data.
svm = SVC().fit(X_train_scaled, y_train)
svm

In [None]:
# Predict class labels for the scaled test split with the fitted SVM.
svm_predict = svm.predict(X_test_scaled)

In [None]:
# Test-set accuracy of the SVM, shown as a percentage with one decimal.
accuracy = accuracy_score(y_test,svm_predict)
# The precision argument belongs to round(); it used to be passed to format(),
# which silently ignored it and printed a whole-number percentage.
print("Accuracy:{}%".format(round(accuracy * 100, 1)))

In [None]:
# Test-set metrics for the SVM.
svm_Accuracy_Score = accuracy_score(y_true=y_test, y_pred=svm_predict)
svm_JaccardIndex = jaccard_score(y_true=y_test, y_pred=svm_predict)
svm_F1_Score = f1_score(y_true=y_test, y_pred=svm_predict)
# NOTE(review): log loss is computed on hard 0/1 predictions here, not on
# probabilities, so the value is not a meaningful log loss. Getting
# probabilities requires fitting SVC(probability=True) - confirm and revisit.
svm_Log_Loss = log_loss(y_true=y_test, y_pred=svm_predict)


In [None]:
# Report the SVM test-set metrics, one per line.
svm_metrics = (
    ("Accuracy", svm_Accuracy_Score),
    ("Jaccard Index", svm_JaccardIndex),
    ("F1 Score", svm_F1_Score),
    ("Log Loss", svm_Log_Loss),
)
for metric_name, metric_value in svm_metrics:
    print(f"{metric_name}: {metric_value}")


In [None]:
# Confusion matrix for the SVM (rows: true labels, columns: predictions).
svm_conf_matrix = confusion_matrix(y_true=y_test, y_pred=svm_predict)
svm_conf_matrix

In [None]:
# Draw the SVM confusion matrix with integer cell annotations.
ax = sns.heatmap(svm_conf_matrix, annot=True, fmt='d', cmap='Greens')
ax.set(xlabel='Predicted Label', ylabel='True Label', title='Confusion Matrix')
plt.show()

XGBoost

In [None]:
# Hyperparameters passed to the XGBoost classifier below.
params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    gamma=0,
    objective='binary:logistic',
)

In [None]:
# Train the XGBoost classifier with the hyperparameters above on the scaled training data.
xgb = XGBClassifier(**params).fit(X_train_scaled, y_train)
xgb

In [None]:
# Predict class labels for the scaled test split with the fitted XGBoost model.
xgb_predict = xgb.predict(X_test_scaled)

In [None]:
# Test-set accuracy of XGBoost, shown as a percentage with one decimal.
accuracy = accuracy_score(y_test,xgb_predict)
# The precision argument belongs to round(); it used to be passed to format(),
# which silently ignored it and printed a whole-number percentage.
print("Accuracy:{}%".format(round(accuracy * 100, 1)))

In [None]:
# Test-set metrics for XGBoost.
xgb_Accuracy_Score = accuracy_score(y_test,xgb_predict)
xgb_JaccardIndex = jaccard_score(y_test,xgb_predict)
xgb_F1_Score = f1_score(y_test,xgb_predict)
# Log loss scores predicted probabilities, not hard labels: feeding it 0/1
# predictions just yields a clipped, rescaled error rate. Use predict_proba.
xgb_Log_Loss = log_loss(y_test,xgb.predict_proba(X_test_scaled))

In [None]:
# Report the XGBoost test-set metrics, one per line.
xgb_metrics = (
    ("Accuracy", xgb_Accuracy_Score),
    ("Jaccard Index", xgb_JaccardIndex),
    ("F1 Score", xgb_F1_Score),
    ("Log Loss", xgb_Log_Loss),
)
for metric_name, metric_value in xgb_metrics:
    print(f"{metric_name}: {metric_value}")

In [None]:
# Confusion matrix for XGBoost (rows: true labels, columns: predictions).
xgb_conf_matrix = confusion_matrix(y_true=y_test, y_pred=xgb_predict)
xgb_conf_matrix

In [None]:
# Draw the XGBoost confusion matrix with integer cell annotations.
ax = sns.heatmap(xgb_conf_matrix, annot=True, fmt='d', cmap='Greens')
ax.set(xlabel='Predicted Label', ylabel='True Label', title='Confusion Matrix')
plt.show()

Model Accuracy

In [None]:
# Test-set accuracy of each model, rounded to two decimals, keyed by model name.
model_predictions = {
    'Random Forest': rf_predict,
    'SVM': svm_predict,
    'XGBoost': xgb_predict,
}
accuracy_scores = {
    model_name: round(accuracy_score(y_test, predictions), 2)
    for model_name, predictions in model_predictions.items()
}

In [None]:
# Tabulate the per-model accuracies for side-by-side comparison.
accuracy_df = pd.DataFrame({
    'Model': list(accuracy_scores.keys()),
    'Accuracy Score': list(accuracy_scores.values()),
})
accuracy_df

Model Deployment

In [None]:
# Persist the trained Random Forest to disk.
# NOTE(review): `rf` is hard-coded as the "best performing" model - confirm
# against the accuracy table before shipping.
# NOTE(review): `rf` was trained on StandardScaler output, so whoever loads
# this file also needs the fitted `scaler`; it is not saved here.

filename = 'phishing_detection_model.sav'
# The context manager guarantees the file is flushed and closed; the bare
# open() used before left the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)