In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Binary Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

: 

In [None]:
# Load the dataset from 'dataset_phishing.csv' (path is relative to the working directory).
df = pd.read_csv('dataset_phishing.csv')
# Preview the first rows as a quick check of what was loaded.
df.head()

: 

In [None]:
# Encode the target label as an integer: 'legitimate' -> 0, 'phishing' -> 1.
class_map = {'legitimate':0, 'phishing':1}
# Map only while the column still holds the raw (non-numeric) labels.
# Series.map turns every value that is not a key of class_map into NaN, so
# re-running this cell on the already-encoded 0/1 column would otherwise
# replace the whole target with NaN.
if not pd.api.types.is_numeric_dtype(df['status']):
    df['status'] = df['status'].map(class_map)
df.head()

: 

In [None]:
def feature_selector_correlation(cmatrix, threshold):
    """Pick the features whose correlation magnitude exceeds ``threshold``.

    Parameters
    ----------
    cmatrix : pandas.Series (or any mapping exposing ``.items()``)
        Correlation scores keyed by feature name, for example one column of
        ``DataFrame.corr()``.
    threshold : float
        Cut-off compared against ``abs(score)``. The comparison is strict, so a
        score whose magnitude equals the threshold is not selected.

    Returns
    -------
    list of tuple
        ``(feature_name, [formatted_score])`` pairs in input order. The second
        element is a one-element list holding the score rendered with three
        decimal places. NaN scores never satisfy the comparison and are
        therefore left out.
    """
    return [
        # '{:.3f}' = three decimals; the earlier '{:3f}' only set a minimum
        # field width and therefore printed the default six decimals.
        (name, ['{:.3f}'.format(score)])
        for name, score in cmatrix.items()
        if abs(score) > threshold
    ]

: 

In [None]:
# Remove the 'url' column before computing correlations. errors='ignore' keeps
# this cell re-runnable: df is reassigned here, so on a second run 'url' is
# already gone and a plain drop would raise KeyError.
df = df.drop('url', axis=1, errors='ignore')
corr_matrix = df.corr()
# Correlation of every column with the 'status' target.
status_corr = corr_matrix['status']

# Keep the columns whose absolute correlation with 'status' is above 0.20.
features_selected = feature_selector_correlation(status_corr, 0.20)
print(features_selected)
# Leave the target itself out of the list of feature names.
selected_features = [name for (name, _) in features_selected if name != 'status']
selected_features

: 

In [None]:
# Columns listed here (named as requiring additional data) are removed from
# the selection made above.
features_requiring_additional_data = [
    'ip', 'phish_hints', 'nb_hyperlinks', 'empty_title',
    'domain_age', 'google_index', 'page_rank',
]
selected_features = [
    name
    for name in selected_features
    if name not in features_requiring_additional_data
]
selected_features

: 

In [None]:
# Feature matrix: only the columns named in selected_features.
X_selected = df[selected_features]
X_selected

: 

In [None]:
# Target vector: the 'status' column of df.
y = df['status']
y

: 

In [None]:
# Hold out 20% of the rows for testing; rows are shuffled before splitting and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

: 

In [None]:
# Random forest with 350 estimators and a fixed random_state, fitted on the
# training split.
model_random_forest = RandomForestClassifier(n_estimators=350, random_state=42)
model_random_forest.fit(X_train, y_train)

: 

In [None]:
# Score the fitted forest on the held-out split.
y_pred = model_random_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

# One summary row: overall accuracy followed by precision / recall / F1 for
# class '0' and then class '1' (the report dict is indexed with those string keys).
results = [{
    "Model": type(model_random_forest).__name__,
    "Accuracy": accuracy,
    **{
        "{} ({})".format(title, label): report[label][metric]
        for label in ('0', '1')
        for title, metric in (
            ("Precision", "precision"),
            ("Recall", "recall"),
            ("F1-score", "f1-score"),
        )
    },
}]

# Draw the summary row as an annotated heatmap without a colour bar.
sns.set(font_scale=1.2)
plt.figure(figsize=(10, 6))
table = sns.heatmap(
    pd.DataFrame(results).set_index('Model'),
    annot=True,
    cmap="plasma",
    fmt=".2f",
    linewidths=.5,
    cbar=False,
)
plt.title("Classification Report")
plt.show()

: 

In [None]:
# Serialize the fitted random forest to 'best_model.joblib' using joblib.
from joblib import dump
dump(model_random_forest, 'best_model.joblib')

: 