# Lexical Filter-based Feature Selection

In [1]:
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import filter_feature_selection
import scipy.stats as stats

The URL dataset, together with its generated lexical features, is read from a CSV file. The first few rows are then displayed for inspection.

In [2]:
# CSV holding the URL samples together with their generated lexical features.
dataset_csv_path = 'final_unbalanced_with_lexical.csv'
dataset = pd.read_csv(dataset_csv_path)

# Preview the first rows to sanity-check the columns that were loaded.
dataset.head()

Unnamed: 0,url_type,url_length,url_ip_in_domain,url_domain_entropy,url_is_digits_in_domain,url_query_length,url_number_of_parameters,url_number_of_digits,url_string_entropy,url_is_https,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,2,16,0,3.169925,0,0,0,0,3.375,0,...,0,0,0,0,0,0,0,0,0,0
1,0,35,0,2.807355,1,0,0,1,4.079143,0,...,0,0,0,0,0,0,0,0,0,0
2,0,31,0,2.921928,0,0,0,1,3.708093,0,...,0,0,0,0,0,0,0,0,0,0
3,1,88,0,2.896292,0,49,4,7,4.660343,0,...,0,0,1,0,0,0,0,0,0,27
4,1,235,0,3.405822,0,194,3,22,4.980518,0,...,0,0,1,0,0,0,0,0,0,27


In [3]:
# Column 0 is the class label (url_type); every remaining column is a feature.
label_column_position = 0
url_type = dataset.iloc[:, label_column_position]
features = dataset.iloc[:, label_column_position + 1:]

In [4]:
# Preview the feature columns (every column of `dataset` except the first).
features.head()

Unnamed: 0,url_length,url_ip_in_domain,url_domain_entropy,url_is_digits_in_domain,url_query_length,url_number_of_parameters,url_number_of_digits,url_string_entropy,url_is_https,url_path_length,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,16,0,3.169925,0,0,0,0,3.375,0,16,...,0,0,0,0,0,0,0,0,0,0
1,35,0,2.807355,1,0,0,1,4.079143,0,35,...,0,0,0,0,0,0,0,0,0,0
2,31,0,2.921928,0,0,0,1,3.708093,0,31,...,0,0,0,0,0,0,0,0,0,0
3,88,0,2.896292,0,49,4,7,4.660343,0,10,...,0,0,1,0,0,0,0,0,0,27
4,235,0,3.405822,0,194,3,22,4.980518,0,10,...,0,0,1,0,0,0,0,0,0,27


In [5]:
# Preview the label column (the first column of `dataset`).
url_type.head()

0    2
1    0
2    0
3    1
4    1
Name: url_type, dtype: int64

The F-score of each feature is then calculated using ANOVA. The higher the F-score, the better the feature separates the URL classes.

In [6]:
# Run the project's ANOVA helper on the dataset CSV. It returns three values,
# unpacked here as feature names, F-scores and p-values.
# NOTE(review): the run emits a warning at `f = msb / msw` and the returned
# values contain nan — presumably features with zero within-group variance;
# confirm inside filter_feature_selection.anova_feature_selection.
anova_features, anova_f_scores, anova_p = filter_feature_selection.anova_feature_selection('final_unbalanced_with_lexical.csv')

  f = msb / msw


In [7]:
# Inspect the p-values returned by the ANOVA helper (the output contains nan entries).
anova_p

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 nan,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.3363556808913688e-246,
 1.8308690110324366e-270,
 2.686065802099673e-269,
 0.0,
 0.0,
 0.0,
 2.693317015904387e-204,
 0.0,
 0.0,
 0.0,
 7.951631019114126e-14,
 0.0,
 4.975514650286889e-32,
 1.7316945019133305e-96,
 0.0,
 5.881839034303132e-131,
 2.0145948867605835e-12,
 4.463205248055142e-24,
 0.0,
 2.973837576321348e-226,
 0.0,
 0.0,
 7.4257812646223305e-140,
 0.0,
 2.353444594474617e-36,
 0.5788005781611546,
 0.0,
 nan,
 9.387204251055264e-114,
 0.0,
 7.573190044270151e-64,
 nan,
 6.727119043616304e-160,
 1.1506980460404001e-08,
 1.2063942516833e-83,
 5.113098970247169e-34,
 6.775038073800074e-129,
 2.2661288205864433e-186,
 9.188109551333743e-09,
 3.0783908755593823e-288,
 2.4890838507350864e-62,
 2.0629466737838045e-06,
 2.8070546125808786e-219,
 0.0,
 8.280974037281753e-44,
 0.0,
 1.9476165974719253e-256,
 1.4702930108098557e-85,
 2.782

In [8]:
# Drop features whose ANOVA F-score is NaN.
#
# The feature names are filtered with the SAME mask as the scores so that
# anova_features[i] keeps describing anova_f_scores[i]; later cells index both
# sequences in parallel. Filtering only the scores would shift every score that
# follows a NaN onto the wrong feature name unless all NaNs sit at the tail.
#
# NOTE(review): assumes anova_features and anova_f_scores are parallel
# sequences of equal length — confirm against
# filter_feature_selection.anova_feature_selection.
anova_f_scores = np.asarray(anova_f_scores, dtype=float)
valid_score_mask = ~np.isnan(anova_f_scores)

anova_features = [name for name, keep in zip(anova_features, valid_score_mask) if keep]
anova_f_scores = anova_f_scores[valid_score_mask]

anova_f_scores

array([1.60765524e+05, 7.47897767e+04, 7.42824343e+04, 7.15210125e+04,
       6.75615286e+04, 6.71710644e+04, 6.63563647e+04, 6.63172775e+04,
       6.18316675e+04, 4.52817205e+04, 3.34201990e+04, 2.90991994e+04,
       2.08984704e+04, 1.61934945e+04, 1.57898157e+04, 1.06584877e+04,
       9.87602233e+03, 9.44600272e+03, 8.46484058e+03, 8.27513275e+03,
       8.07292369e+03, 7.45079177e+03, 6.81927446e+03, 3.74696210e+03,
       2.70266250e+03, 1.94717742e+03, 1.62341576e+03, 1.59797068e+03,
       1.54213039e+03, 1.42014433e+03, 1.08295453e+03, 1.03326941e+03,
       9.70359275e+02, 8.35195654e+02, 7.97792043e+02, 7.89538690e+02,
       7.43313742e+02, 5.97053012e+02, 5.74368125e+02, 4.44025916e+02,
       4.16668656e+02, 4.14873373e+02, 3.95080626e+02, 3.79942508e+02,
       3.54829634e+02, 3.48629593e+02, 3.37896155e+02, 3.14841459e+02,
       2.87258755e+02, 2.68239577e+02, 2.46523493e+02, 2.15680883e+02,
       2.01986459e+02, 1.98814056e+02, 1.75510318e+02, 1.48932766e+02,
      

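The F-scores are standardized into Z-scores, and each Z-score is mapped through the standard normal CDF. Features whose resulting value exceeds 0.50 (equivalently, whose F-score lies above the mean F-score) are selected. Note that this is not a rank-based percentile: because a few features have very large F-scores, fewer than half of the features pass this cut.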

In [9]:
# Standardize the F-scores: subtract their mean and divide by their standard deviation.
anova_z_scores = stats.zscore(anova_f_scores)

In [10]:
# Map every z-score through the standard normal CDF, keeping the input order.
percentile = [stats.norm.cdf(z_score) for z_score in anova_z_scores]

In [11]:
# Report the CDF value of every feature and collect those above the 0.50 cut.
print('Feature Percentile: ')
feature_percentile_dict = {}
for position, cdf_value in enumerate(percentile):
    feature_name = anova_features[position]
    print(feature_name, ': ', cdf_value)
    # `not >` (rather than `<=`) so a NaN value is skipped instead of selected.
    if not cdf_value > 0.50:
        continue
    feature_percentile_dict[feature_name] = cdf_value

Feature Percentile: 
url_host_length :  0.9999999154054002
url_is_https :  0.9851449114414272
url_ip_in_domain :  0.984454144298416
has_php_in_string :  0.9801919264023486
url_number_of_parameters :  0.9723791520118684
has_exe_in_string :  0.9714855635190668
url_has_port :  0.969543544206523
url_num_equal :  0.9694476816886433
url_num_ampersand :  0.9566690361059697
url_is_digits_in_domain :  0.8696903322040741
url_path_length :  0.7590513353039839
url_num_question_mark :  0.7087181178585258
url_query_length :  0.6018388999372104
url_string_entropy :  0.5361937075903936
url_num_periods :  0.5304886673539473
url_num_forward_slash :  0.45782323516996204
url_domain_len :  0.44680580887724913
url_number_of_subdirectories :  0.440767844819599
has_linkeq_in_string :  0.4270448461906734
url_num_of_hyphens :  0.42440112619347753
url_number_of_letters :  0.4215869349576995
url_domain_entropy :  0.4129542634720332
url_length :  0.40423422463569514
url_num_underscore :  0.36256077818091026
has_jp

Shown below are the features whose normal-CDF value exceeds 0.50, i.e. whose F-score lies above the mean F-score.

In [12]:
# Features that passed the > 0.50 cut, mapped to their CDF value.
feature_percentile_dict

{'url_host_length': 0.9999999154054002,
 'url_is_https': 0.9851449114414272,
 'url_ip_in_domain': 0.984454144298416,
 'has_php_in_string': 0.9801919264023486,
 'url_number_of_parameters': 0.9723791520118684,
 'has_exe_in_string': 0.9714855635190668,
 'url_has_port': 0.969543544206523,
 'url_num_equal': 0.9694476816886433,
 'url_num_ampersand': 0.9566690361059697,
 'url_is_digits_in_domain': 0.8696903322040741,
 'url_path_length': 0.7590513353039839,
 'url_num_question_mark': 0.7087181178585258,
 'url_query_length': 0.6018388999372104,
 'url_string_entropy': 0.5361937075903936,
 'url_num_periods': 0.5304886673539473}

The Chi2 test is used to determine the significance of the categorical features. The higher the Chi2 statistic, the more strongly a feature is associated with the URL class, and the lower its p-value, the more significant that association is. Based on the results, both categorical features are retained.

In [13]:
# Run the project's chi-squared helper on the dataset CSV. The result is not
# stored; the categorical features are added by hand when final_features is built.
filter_feature_selection.chi2_feature_selection('final_unbalanced_with_lexical.csv')

Chi2 Statistics: 
[88439618.85652827  6900658.46790684]
Chi2 P Values: 
[0. 0.]
get_tld       8.843962e+07
url_scheme    6.900658e+06
get_tld       0.0
url_scheme    0.0
dtype: float64


Features with high correlation are to be weeded out. This is to eliminate redundancy in the featureset.

In [14]:
# Ask the project's correlation helper for groups of highly correlated features
# (a list of lists of column names, as the output shows).
# NOTE(review): the correlation threshold is chosen inside the helper — confirm there.
features_high_correlation = filter_feature_selection.correlation_feature_selection('final_unbalanced_with_lexical.csv')

features_high_correlation

[['url_num_equal', 'url_num_ampersand', 'url_number_of_parameters'],
 ['url_num_forward_slash', 'url_number_of_subdirectories'],
 ['url_num_close_parenthesis', 'url_num_open_parenthesis'],
 ['url_domain_len', 'url_domain_entropy'],
 ['url_number_of_letters', 'url_length']]

In [15]:
# Within every group of highly correlated features, keep only the selected
# feature with the highest percentile and mark the other selected members of
# that group for removal.
#
# Why this is not written as nested loops that call group.remove(...):
# removing items from a list while iterating over it makes the iterator skip
# the element that slides into the vacated slot, and removing the same feature
# twice raises ValueError. That approach only works when the dict happens to be
# ordered from highest to lowest percentile. Choosing the winner per group up
# front is order-independent and leaves features_high_correlation untouched, so
# the cell can be re-run safely.
to_remove = []

for high_corr_group in features_high_correlation:
    # Only features that passed the percentile cut compete. Walking the dict
    # (not the group) means a tie is won by the feature that appears first in
    # feature_percentile_dict.
    candidates = [name for name in feature_percentile_dict if name in high_corr_group]
    if len(candidates) < 2:
        continue  # nothing redundant among the selected features

    best_feature = max(candidates, key=feature_percentile_dict.get)
    for feature in candidates:
        # A feature can belong to several groups; record it only once.
        if feature != best_feature and feature not in to_remove:
            print('Removed: ', feature)
            to_remove.append(feature)

print(to_remove)

Removed:  url_num_equal
Removed:  url_num_ampersand
['url_num_equal', 'url_num_ampersand']


In [16]:
# Discard the redundant features from the selection; names that are already
# absent are ignored.
for redundant_feature in to_remove:
    if redundant_feature in feature_percentile_dict:
        del feature_percentile_dict[redundant_feature]

In [17]:
# Names of the ANOVA-selected features, in the dict's insertion order.
final_anova_features = list(feature_percentile_dict)

final_anova_features

['url_host_length',
 'url_is_https',
 'url_ip_in_domain',
 'has_php_in_string',
 'url_number_of_parameters',
 'has_exe_in_string',
 'url_has_port',
 'url_is_digits_in_domain',
 'url_path_length',
 'url_num_question_mark',
 'url_query_length',
 'url_string_entropy',
 'url_num_periods']

After combining the results of ANOVA and Chi2 and removing the highly correlated features, the final featureset is seen below.

In [18]:
# Categorical features are appended by hand: add one only if its Chi2 score is
# good and its p-value is significant.
# TODO: automate this selection later.
categorical_features = ['get_tld', 'url_scheme']
final_features = [*final_anova_features, *categorical_features]

final_features

['url_host_length',
 'url_is_https',
 'url_ip_in_domain',
 'has_php_in_string',
 'url_number_of_parameters',
 'has_exe_in_string',
 'url_has_port',
 'url_is_digits_in_domain',
 'url_path_length',
 'url_num_question_mark',
 'url_query_length',
 'url_string_entropy',
 'url_num_periods',
 'get_tld',
 'url_scheme']