In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

# Columns kept for modelling: the 60 features the author selected as the top
# contributors.  The list is defined once so that the training and testing
# frames are always reduced to exactly the same columns, in the same order.
FEATURE_COLUMNS = [
    'google_index', 'page_rank', 'nb_hyperlinks', 'web_traffic', 'domain_age',
    'nb_www', 'safe_anchor', 'length_url', 'ratio_digits_url',
    'shortest_word_host', 'domain_registration_length', 'longest_words_raw',
    'phish_hints', 'length_hostname', 'char_repeat', 'shortest_word_path',
    'nb_slash', 'domain_in_title', 'shortest_words_raw', 'nb_dots',
    'ratio_digits_host', 'longest_word_host', 'nb_hyphens', 'ip', 'nb_qm',
    'nb_subdomains', 'domain_with_copyright', 'nb_redirection',
    'domain_in_brand', 'https_token', 'nb_underscore', 'prefix_suffix',
    'shortening_service', 'nb_and', 'nb_com', 'nb_percent', 'suspecious_tld',
    'whois_registered_domain', 'random_domain', 'statistical_report',
    'tld_in_subdomain', 'dns_record', 'nb_space', 'tld_in_path', 'nb_at',
    'nb_colon', 'http_in_path', 'nb_semicolumn', 'nb_dslash', 'nb_tilde',
    'brand_in_path', 'port', 'brand_in_subdomain', 'nb_comma',
    'nb_external_redirection', 'path_extension', 'punycode', 'nb_dollar',
    'nb_star', 'nb_or',
]

# Name of the label column and its numeric encoding (phishing = 1, legitimate = 0).
LABEL_COLUMN = 'status'
LABEL_ENCODING = {'phishing': 1, 'legitimate': 0}


def _prepare(raw_frame):
    """Return the modelling frame: the selected features followed by the label.

    Missing feature values are replaced with 0.  The label column is encoded
    with LABEL_ENCODING and placed last, so the result has
    ``len(FEATURE_COLUMNS) + 1`` columns.

    Args:
        raw_frame: DataFrame as read from the CSV file; it must contain every
            column in FEATURE_COLUMNS plus LABEL_COLUMN.

    Returns:
        A new DataFrame; ``raw_frame`` itself is not modified.

    Raises:
        ValueError: if the label column holds a value (including a missing
            value) that is not a key of LABEL_ENCODING.  Previously such rows
            became NaN and were then silently turned into class 0 by the
            blanket ``fillna(0)``.
        KeyError: if a required column is absent (raised by pandas).
    """
    encoded = raw_frame[LABEL_COLUMN].map(LABEL_ENCODING)
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = raw_frame.loc[unmapped, LABEL_COLUMN].unique().tolist()
        raise ValueError(
            "Unexpected values in '{}' column: {!r}".format(LABEL_COLUMN, bad_values)
        )

    # Only the feature columns are zero-filled; the label has been validated above.
    prepared = raw_frame[FEATURE_COLUMNS].fillna(0)
    prepared[LABEL_COLUMN] = encoded.astype(int)
    return prepared


# Training data
training_raw = pd.read_csv("training.csv")
print("Before Processing:", training_raw.shape)

# Keep the 60 selected features plus the label; every other column is ignored
training_data = _prepare(training_raw)
print("After Processing:", training_data.shape)

# Split the data into x = features and y = label
x_train = training_data[FEATURE_COLUMNS]
y_train = training_data[LABEL_COLUMN]

# Model (fixed random_state so repeated runs build the same forest)
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# Model training
clf.fit(x_train, y_train)

# Testing data goes through exactly the same preparation as the training data
testing_data = _prepare(pd.read_csv("testing.csv"))
x_test = testing_data[FEATURE_COLUMNS]
y_test = testing_data[LABEL_COLUMN]

# Predict on the held-out data and compare with the actual labels
y_pred = clf.predict(x_test)

# Final Report
print()
print("Final Report:")
print("Confusion_matrix: ")
print(confusion_matrix(y_test, y_pred))

print()
print("Classification_report: ")
print(classification_report(y_test, y_pred))

# Each score is reported as a percentage rounded to 8 decimal places
accuracy = 100 * accuracy_score(y_test, y_pred)
print("Accuracy: ", round(accuracy, 8), "%")

precision = 100 * precision_score(y_test, y_pred)
print("Precision: ", round(precision, 8), "%")

recall = 100 * recall_score(y_test, y_pred)
print("Recall: ", round(recall, 8), "%")

f1 = 100 * f1_score(y_test, y_pred)
print("F1 Score: ", round(f1, 8), "%")




# My Observations on the given dataset
# Features with high correlation are more linearly dependent and hence have almost the same effect on the dependent variable. So, when two features have high correlation, we can drop one of the two features
# Features that have low correlation with the dependent variable carry little linear information about it and hence have almost no effect on it. So, features with low correlation to the target are candidates to be dropped

# If the length of the hostname is more than 21, it is more likely that the URL is in the phishing class.
# nb_semicolon, nb_eq_bins, nb_underscore, nb_percent, nb_ratio_digits_hosts do not contribute much to the classification, as their phishing and legitimate contributions are almost equal.
# If nb_dots > 3, nb_slash > 4, ratio_digits_url > 0.0221, longest_words_raw > 12, longest_words_path > 14, avg_word_host > 9, avg_word_path > 6, then there is a high probability that the URL is in the phishing class.
# nb_semicolon, nb_eq_bins, nb_underscore, nb_percent, nb_ratio_digits_hosts do not contribute much, as their phishing and legitimate contributions are almost equal.

# By taking the correlation between the features:
# avg_words_raw and longest_words_raw are highly correlated with each other, so we can drop one of them
# abnormal_subdomain and ratio_digits_host are positively correlated with each other, so we can drop one of them
# avg_words_path is correlated with longest_words_host, longest_word_path and avg_words_raw, so we can drop one of them
# nb_eq is highly correlated with nb_qm, nb_and and length_words_raw, so we can drop nb_eq
# avg_words_host, shortest_word_host and longest_word_host are highly correlated, so we can drop avg_words_host

# When google_index is 0, the legitimate class is more common, whereas the phishing class is more common when google_index is 1.
# When dns_record has the value 1, it mostly corresponds to the phishing class.
# When domain_age is high, the URL is more likely to be in the legitimate class than in the phishing class.
# If nb_hyperlinks is high, that is probably a factor for suspecting the phishing class.
# ratio_extredirection and the ext_error ratio are positively correlated with phishing.

# After some trials, I found that the features domain_with_copyright, nb_hyperlinks and safe_anchor are useful for getting high accuracy.
# Using the correlations, I have removed the features which do not contribute much to the classification.
# By all the above methods, I have reduced the number of columns from 89 to 61 (60 features plus the status label).
# So only these 60 features contribute to the classification of the dataset for high accuracy.

# Finally, only these 60 features (plus the status label) are used for training the model and the rest of the features are ignored.

Before Processing: (7134, 89)
After Processing: (7134, 61)

Final Report:
Confusion_matrix: 
[[937  41]
 [ 35 996]]

Classification_report: 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       978
           1       0.96      0.97      0.96      1031

    accuracy                           0.96      2009
   macro avg       0.96      0.96      0.96      2009
weighted avg       0.96      0.96      0.96      2009

Accuracy:  96.21702339 %
Precision:  96.04628737 %
Recall:  96.60523763 %
F1 Score:  96.32495164 %
