In [62]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [63]:
data = pd.read_csv(r"datav5.csv")
data.head()

Unnamed: 0,url,label,url_length,domain_length,path_length,is_@,is_?,num-,num=,num.,...,is_port,digits_count,alpha_count,special_chars_count,is_host,num_params,num_fragments,num_sub_domains,is_.ai,num/
0,http://124.235.169.36:47690/Mozi.m,1,34,20,6,0,0,0,0,4,...,1,16,9,9,1,0,0,1,0,3
1,https://www.reggaekiwami.com/rockdesire/,0,40,20,11,0,0,0,0,2,...,0,0,33,7,1,0,0,2,0,4
2,http://www.kf25zx.com/images/?http://us.battle...,1,59,14,37,0,1,0,0,4,...,0,2,42,15,1,0,0,2,0,8
3,http://185.216.70.79/arc,1,24,13,3,0,0,0,0,3,...,0,10,7,7,1,0,0,1,0,3
4,https://www.wc.rootsweb.ancestry.com/cgi-bin/i...,0,87,28,50,0,1,1,3,5,...,0,5,63,19,1,2,0,2,0,4


In [3]:
numerical_cols = data.select_dtypes(include=["number"]).columns
X = data[numerical_cols]
y = X["label"]
X = X.drop(columns="label")

In [4]:
def get_numerical_non_binary_columns(X):
    numerical_cols = X.select_dtypes(include=["number"]).columns
    non_binary_numerical_cols = [col for col in numerical_cols if X[col].nunique() > 2]
    return non_binary_numerical_cols


non_binary_numerical_columns = get_numerical_non_binary_columns(X)
print(non_binary_numerical_columns)

['url_length', 'domain_length', 'path_length', 'num-', 'num=', 'num.', 'num%', 'num//', 'num_', 'num_subdomains', 'entropy', 'count_num_domain_extension', 'digits_count', 'alpha_count', 'special_chars_count', 'num_params', 'num_fragments', 'num_sub_domains', 'num/']


In [5]:
scaler = StandardScaler()
X.loc[:, non_binary_numerical_columns] = scaler.fit_transform(X[non_binary_numerical_columns])
X.head()

Unnamed: 0,url_length,domain_length,path_length,is_@,is_?,num-,num=,num.,is_#,num%,...,is_port,digits_count,alpha_count,special_chars_count,is_host,num_params,num_fragments,num_sub_domains,is_.ai,num/
0,-0.425428,0.341955,-0.374005,0,0,-0.455319,-0.310348,1.164841,0,-0.074956,...,1,0.878848,-0.768835,-0.065346,1,-0.229098,-0.011886,-0.727021,0,-0.267287
1,-0.308,0.341955,-0.273409,0,0,-0.455319,-0.310348,-0.351969,0,-0.074956,...,0,-0.547404,-0.177023,-0.378423,1,-0.229098,-0.011886,-0.071804,0,0.303914
2,0.063856,-0.157791,0.249691,0,1,-0.455319,-0.310348,1.164841,0,-0.074956,...,0,-0.369122,0.044907,0.873886,1,-0.229098,-0.011886,-0.071804,0,2.58872
3,-0.621142,-0.241082,-0.434363,0,0,-0.455319,-0.310348,0.406436,0,-0.074956,...,0,0.344003,-0.818153,-0.378423,1,-0.229098,-0.011886,-0.727021,0,-0.267287
4,0.611853,1.008282,0.511241,0,1,-0.066802,2.173423,1.923246,0,-0.074956,...,0,-0.1017,0.562742,1.50004,1,1.824971,-0.011886,-0.071804,0,0.303914


In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
feature_importances = model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importance_df.head(43)

Unnamed: 0,Feature,Importance
28,count_num_domain_extension,0.308976
16,is_www.,0.16244
13,is_https,0.138768
34,is_host,0.131848
11,num//,0.068352
39,num/,0.028304
5,num-,0.018626
2,path_length,0.012604
0,url_length,0.011054
9,num%,0.010732


In [17]:
corr_matrix = data.drop(columns="url").corr()
target_corr = corr_matrix["label"].drop("label")
sorted_corr = target_corr.abs().sort_values(ascending=False)

top_features = sorted_corr.index
print(sorted_corr)

count_num_domain_extension    0.541272
is_port                       0.465484
is_.com                       0.426005
is_host                       0.416560
is_http                       0.415185
num//                         0.414479
is_pre_domain                 0.383356
contains_IPv4                 0.382505
IP_exist                      0.382011
is_https                      0.330086
is_www.                       0.274485
num_sub_domains               0.233720
num-                          0.214403
num.                          0.177736
digits_count                  0.158015
domain_length                 0.144193
num_                          0.125122
is_.org                       0.123621
num_params                    0.123569
is_+                          0.117629
entropy                       0.113739
alpha_count                   0.105002
is_&                          0.098855
num=                          0.095835
num_subdomains                0.095500
contains_hexadecimal     

In [58]:
model = DecisionTreeClassifier()
rfe = RFE(model, n_features_to_select=35)
fit = rfe.fit(X, y)

selected_features = X.columns[fit.support_]
pd.DataFrame(selected_features, columns=['Selected Features']).head(35)

Unnamed: 0,Selected Features
0,url_length
1,domain_length
2,path_length
3,is_@
4,is_?
5,num-
6,num=
7,num.
8,is_#
9,num%


In [31]:
columns_in_df1_not_df2 = list(set(X.columns) - set(df_reduced.columns))
print(columns_in_df1_not_df2)

['path_length', 'IP_exist', 'is_http', 'is_host', 'alpha_count', 'num_params', 'num/', 'is_port', 'is_pre_domain', 'is_&']


## **VarianceThreshold**
Heigh variance means more information, so it remove lower variance features.

In [36]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.1)
df_high_variance = X.loc[:, selector.fit(X).get_support()]

print(df_high_variance.columns)

Index(['url_length', 'domain_length', 'path_length', 'is_?', 'num-', 'num=',
       'num.', 'num%', 'num//', 'is_http', 'is_https', 'is_.com', 'is_www.',
       'num_', 'num_subdomains', 'entropy', 'count_num_domain_extension',
       'is_port', 'digits_count', 'alpha_count', 'special_chars_count',
       'is_host', 'num_params', 'num_fragments', 'num_sub_domains', 'num/'],
      dtype='object')


In [37]:
excluded_features = X.columns.difference(df_high_variance.columns)
print(excluded_features)

Index(['IP_exist', 'contains_IPv4', 'contains_IPv6', 'contains_hexadecimal',
       'is_#', 'is_&', 'is_+', 'is_.ai', 'is_.org', 'is_;', 'is_@', 'is_ftp/',
       'is_pre_domain', 'is_~'],
      dtype='object')


## **Univariate**
ANOVA F-value between label/feature.

In [51]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=35)
X_new = selector.fit_transform(X, y)

univariate = pd.DataFrame(X_new, columns=X.columns[selector.get_support()])

In [52]:
excluded_features = X.columns.difference(univariate.columns)
print(excluded_features)

Index(['is_.ai', 'is_?', 'is_ftp/', 'is_~', 'special_chars_count'], dtype='object')


In [66]:
data.columns

Index(['url', 'label', 'url_length', 'domain_length', 'path_length', 'is_@',
       'is_?', 'num-', 'num=', 'num.', 'is_#', 'num%', 'is_+', 'num//',
       'is_http', 'is_https', 'is_ftp/', 'is_.com', 'is_www.', 'is_.org',
       'is_&', 'is_;', 'num_', 'is_~', 'contains_IPv4', 'contains_IPv6',
       'IP_exist', 'num_subdomains', 'contains_hexadecimal', 'entropy',
       'count_num_domain_extension', 'is_pre_domain', 'is_port',
       'digits_count', 'alpha_count', 'special_chars_count', 'is_host',
       'num_params', 'num_fragments', 'num_sub_domains', 'is_.ai', 'num/'],
      dtype='object')

In [67]:
# Drop specified columns
data = data.drop(columns=["is_pre_domain", "is_ftp/", "num_fragments", "is_.ai", "contains_IPv6"])

# Save the modified DataFrame to a CSV file
data.to_csv("data6.csv", index=False)