# Content-based Filter-based Feature Selection

In [1]:
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import filter_feature_selection
import scipy.stats as stats

The URL dataset, with its generated lexical and content-based features, is read from a CSV file. The `url_type` labels are remapped (types 2 and 3 become 1) so that the classification is binary. The first rows are then displayed for inspection.

In [2]:
# Load the feature table and collapse the label to two classes in one chain:
# url_type values 2 and 3 are folded into 1, every other value is kept as is.
dataset = (
    pd.read_csv('unbalanced_with_content.csv')
    .assign(url_type=lambda frame: frame['url_type'].replace({2: 1, 3: 1}))
)

# Preview the first rows of the binarised table.
dataset.head()

Unnamed: 0,url_type,blank_lines_count,blank_spaces_count,word_count,average_word_len,webpage_size,webpage_entropy,js_count,sus_js_count,js_eval_count,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,1,14,36,0,0.0,1186,5.269303,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,415,3952,11450,4.68393,26155,4.565537,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,5715,88450,420,5.833333,339327,4.656704,13,10,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1461,6292,3049,4.515907,89249,5.14958,11,2,0,...,0,0,0,0,0,0,0,0,0,0
4,0,43,273,0,0.0,1530,4.754726,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Split the frame into the label column and the predictor columns.
# Columns are selected by name rather than by position, consistent with how
# the label is addressed when it is binarised, so the split stays correct
# even if the column order of the CSV changes.
url_type = dataset['url_type']
features = dataset.drop(columns='url_type')

In [4]:
# Preview the first rows of the predictor columns.
features.head()

Unnamed: 0,blank_lines_count,blank_spaces_count,word_count,average_word_len,webpage_size,webpage_entropy,js_count,sus_js_count,js_eval_count,js_escape_count,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,14,36,0,0.0,1186,5.269303,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,415,3952,11450,4.68393,26155,4.565537,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5715,88450,420,5.833333,339327,4.656704,13,10,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1461,6292,3049,4.515907,89249,5.14958,11,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,43,273,0,0.0,1530,4.754726,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Preview the first rows of the label column.
url_type.head()

0    1
1    0
2    0
3    0
4    0
Name: url_type, dtype: int64

In [6]:
# Persist the binarised dataset (without the index column); the
# filter_feature_selection helpers called later read it back by file name.
dataset.to_csv(
    path_or_buf='binary_unbalanced_with_content.csv',
    index=False,
    encoding='utf-8',
)

The F-score of each feature is then calculated using ANOVA. The higher the F-score, the more useful the feature is considered to be.

In [7]:
# Run the project's ANOVA helper on the binarised CSV. Its result is unpacked
# into feature names, F-scores and p-values.
# NOTE(review): judging by the displayed values, the F-scores appear to come
# back sorted in descending order while the p-values do not — confirm against
# filter_feature_selection before pairing the lists by index.
anova_features, anova_f_scores, anova_p = filter_feature_selection.anova_feature_selection('binary_unbalanced_with_content.csv')

  f = msb / msw


In [8]:
# Inspect the ANOVA p-values. NaN entries can appear in this list; the ANOVA
# cell's output shows `f = msb / msw`, which looks like a division warning —
# presumably from features with zero variance; verify.
anova_p

[5.969831635586884e-246,
 3.849399758281125e-224,
 0.273337725175008,
 2.8445856553034404e-10,
 0.0,
 0.0,
 0.0,
 0.0,
 2.6502125178307038e-21,
 6.7546149922751656e-195,
 1.8391303999337112e-38,
 0.0,
 1.0391106712903857e-160,
 0.0,
 0.0,
 nan,
 0.0,
 0.0,
 0.0,
 2.8728835121005203e-25,
 8.679039881613183e-14,
 0.0,
 0.0,
 8.250280900076229e-50,
 0.0,
 0.16515384658925691,
 5.659785990496566e-11,
 1.8866454190019523e-168,
 0.06021577282767681,
 0.0,
 4.0290547303044796e-07,
 0.0,
 6.065849218749003e-148,
 0.0,
 2.773195871558155e-19,
 2.383707275090063e-139,
 4.74935065504685e-63,
 2.7331386510427133e-116,
 4.602244412529603e-210,
 0.0,
 2.7761508023311643e-140,
 0.0,
 0.0,
 1.2047258233044411e-45,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.1635164812018333e-213,
 0.0,
 5.718104334992572e-102,
 nan,
 2.366589431986595e-102,
 3.979468347875503e-97,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.7237846137697123e-22,
 3.338695266911339e-177,
 1.2188257800496201e-129,
 1.742752053941192e-128,
 7.718853947273845e

In [9]:
# Drop features whose F-score is NaN. The feature names are filtered with the
# same mask so that position i in `anova_features` still refers to position i
# in `anova_f_scores`; the percentile cell pairs the two by index.
anova_f_scores = np.asarray(anova_f_scores, dtype=float)
valid_mask = ~np.isnan(anova_f_scores)

# Only touch the names while they are still one-to-one with the scores. This
# also keeps the cell safe to re-run once the NaN entries are already gone.
if len(anova_features) == len(valid_mask):
    anova_features = [name for name, keep in zip(anova_features, valid_mask) if keep]
anova_f_scores = anova_f_scores[valid_mask]

anova_f_scores

array([6.84994551e+04, 5.91321309e+04, 1.84218835e+04, 1.45004228e+04,
       1.38948981e+04, 8.89482594e+03, 8.28685582e+03, 8.06087635e+03,
       7.24015880e+03, 6.75957143e+03, 6.59028765e+03, 6.17976826e+03,
       6.15082909e+03, 6.13820282e+03, 6.10119861e+03, 5.96726646e+03,
       4.72398533e+03, 4.33634778e+03, 4.29647961e+03, 4.26848243e+03,
       3.06316497e+03, 2.70787939e+03, 2.54747359e+03, 2.52955198e+03,
       2.47691935e+03, 2.25135143e+03, 2.08997727e+03, 2.04338740e+03,
       2.04273851e+03, 1.86246606e+03, 1.78357328e+03, 1.50737973e+03,
       1.12528210e+03, 1.02434797e+03, 9.74621877e+02, 9.59231397e+02,
       8.89108369e+02, 8.86769769e+02, 8.07341940e+02, 7.66914290e+02,
       7.31167021e+02, 6.79722850e+02, 6.72234001e+02, 6.36883124e+02,
       6.32574574e+02, 5.87788043e+02, 5.82459727e+02, 5.26228202e+02,
       4.61999607e+02, 4.60234583e+02, 4.40795522e+02, 4.37928269e+02,
       3.29075914e+02, 2.93440478e+02, 2.81130205e+02, 2.38395632e+02,
      

Using the computed F-scores, Z-scores are obtained and mapped through the standard normal CDF to give each feature a percentile. Features whose percentile exceeds 0.50 (that is, whose F-score lies above the mean F-score) are selected.

In [10]:
# Standardise the F-scores: each value becomes its distance from the mean
# F-score, measured in standard deviations.
anova_z_scores = stats.zscore(anova_f_scores)

In [11]:
# Map every z-score through the standard normal CDF, giving a value between
# 0 and 1 that the following cells treat as the feature's percentile.
percentile = [stats.norm.cdf(z_value) for z_value in anova_z_scores]

In [12]:
# Report the percentile of every feature and keep the ones above 0.50.
print('Feature Percentile: ')
feature_percentile_dict = {}
for position, feature_percentile in enumerate(percentile):
    # Names and percentiles are paired purely by position.
    feature_name = anova_features[position]
    print(feature_name, ': ', feature_percentile)
    if feature_percentile > 0.50:
        feature_percentile_dict[feature_name] = feature_percentile

Feature Percentile: 
url_host_length :  0.9999999999997691
url_is_https :  0.999999999721014
url_num_periods :  0.9561023673836623
url_path_length :  0.8987010463117999
js_count :  0.8863475174954547
has_log_in_html :  0.7438575589628358
meta_tag_count :  0.7217873819884608
js_search_count :  0.7133539307748362
url_num_forward_slash :  0.6817493320580099
sus_js_count :  0.6625876350118195
js_link_count :  0.6557330216305041
webpage_entropy :  0.6389003081819309
url_num_ampersand :  0.6377030304878419
url_num_subdomain :  0.6371802303608436
webpage_size :  0.6356465733645736
has_free_in_html :  0.630077677271735
has_php_in_string :  0.5772218449296407
url_number_of_parameters :  0.5604038414109868
url_num_equal :  0.5586673889741649
url_num_of_hyphens :  0.5574472993360304
url_query_length :  0.5045574371172142
url_domain_entropy :  0.4889099564172396
url_number_of_subdirectories :  0.4818495145812649
title_tag_presence :  0.4810609877392944
url_string_entropy :  0.4787456613571104
hype

Shown below are the features whose F-score percentile exceeds 0.50.

In [13]:
# Features that passed the 0.50 percentile cut, mapped to their percentile.
feature_percentile_dict

{'url_host_length': 0.9999999999997691,
 'url_is_https': 0.999999999721014,
 'url_num_periods': 0.9561023673836623,
 'url_path_length': 0.8987010463117999,
 'js_count': 0.8863475174954547,
 'has_log_in_html': 0.7438575589628358,
 'meta_tag_count': 0.7217873819884608,
 'js_search_count': 0.7133539307748362,
 'url_num_forward_slash': 0.6817493320580099,
 'sus_js_count': 0.6625876350118195,
 'js_link_count': 0.6557330216305041,
 'webpage_entropy': 0.6389003081819309,
 'url_num_ampersand': 0.6377030304878419,
 'url_num_subdomain': 0.6371802303608436,
 'webpage_size': 0.6356465733645736,
 'has_free_in_html': 0.630077677271735,
 'has_php_in_string': 0.5772218449296407,
 'url_number_of_parameters': 0.5604038414109868,
 'url_num_equal': 0.5586673889741649,
 'url_num_of_hyphens': 0.5574472993360304,
 'url_query_length': 0.5045574371172142}

Chi2 is used to determine the significance of the categorical features. The higher the Chi2 statistic, the more informative the feature; the lower its p-value, the more significant the observation. Based on the results, both categorical features are retained.

In [14]:
# Run the project's chi-squared helper on the binarised CSV. The return value
# is not stored; the cell is used for the statistics and p-values it shows.
filter_feature_selection.chi2_feature_selection('binary_unbalanced_with_content.csv')

Chi2 Statistics: 
[13181359.33613444  1116845.9066905 ]
Chi2 P Values: 
[0. 0.]
get_tld       1.318136e+07
url_scheme    1.116846e+06
get_tld       0.0
url_scheme    0.0
dtype: float64


Highly correlated features are weeded out to eliminate redundancy in the feature set: within each group of highly correlated features, only the selected feature with the highest percentile is kept.

In [15]:
# Ask the project's correlation helper for the groups of highly correlated
# features; the result is a list of lists of feature names.
# NOTE(review): the correlation threshold is decided inside the helper —
# confirm its value in filter_feature_selection.
features_high_correlation = filter_feature_selection.correlation_feature_selection('binary_unbalanced_with_content.csv')

features_high_correlation

[['url_number_of_letters', 'url_number_of_digits', 'url_length'],
 ['url_num_ampersand', 'url_num_equal', 'url_number_of_parameters'],
 ['url_domain_len', 'url_domain_entropy'],
 ['url_num_close_parenthesis', 'url_num_open_parenthesis']]

In [16]:
# Within every group of highly correlated features, keep only the selected
# feature with the highest percentile and mark the others for removal.
#
# The groups are read but never modified: removing items from a list while
# looping over it makes the loop skip elements, and it would also make this
# cell give a different answer when it is re-run.
to_remove = []

for high_corr_group in features_high_correlation:
    # Only features that survived the percentile cut can compete.
    selected_in_group = [name for name in high_corr_group if name in feature_percentile_dict]
    if len(selected_in_group) < 2:
        continue  # nothing redundant to drop in this group

    # max() returns the first maximum, so ties go to the earlier group member.
    best_feature = max(selected_in_group, key=feature_percentile_dict.get)

    for redundant_feature in selected_in_group:
        # A feature may sit in several groups; record it only once.
        if redundant_feature != best_feature and redundant_feature not in to_remove:
            print('Removed: ', redundant_feature)
            to_remove.append(redundant_feature)

print(to_remove)

Removed:  url_num_equal
Removed:  url_number_of_parameters
['url_num_equal', 'url_number_of_parameters']


In [17]:
# Discard the redundant features from the selection; names that are not
# present in the dictionary are skipped silently.
for redundant_name in to_remove:
    if redundant_name in feature_percentile_dict:
        del feature_percentile_dict[redundant_name]

In [18]:
# Names of the surviving ANOVA-selected features, in dictionary order.
final_anova_features = list(feature_percentile_dict)

final_anova_features

['url_host_length',
 'url_is_https',
 'url_num_periods',
 'url_path_length',
 'js_count',
 'has_log_in_html',
 'meta_tag_count',
 'js_search_count',
 'url_num_forward_slash',
 'sus_js_count',
 'js_link_count',
 'webpage_entropy',
 'url_num_ampersand',
 'url_num_subdomain',
 'webpage_size',
 'has_free_in_html',
 'has_php_in_string',
 'url_num_of_hyphens',
 'url_query_length']

After combining the results of ANOVA and Chi2 and removing the highly correlated features, the final feature set is shown below.

In [19]:
# Append the two categorical features to the ANOVA selection.
# TODO: automate this — add a categorical feature only when its Chi2 score is
# good and its p-value is significant.
final_features = [*final_anova_features, 'get_tld', 'url_scheme']

final_features

['url_host_length',
 'url_is_https',
 'url_num_periods',
 'url_path_length',
 'js_count',
 'has_log_in_html',
 'meta_tag_count',
 'js_search_count',
 'url_num_forward_slash',
 'sus_js_count',
 'js_link_count',
 'webpage_entropy',
 'url_num_ampersand',
 'url_num_subdomain',
 'webpage_size',
 'has_free_in_html',
 'has_php_in_string',
 'url_num_of_hyphens',
 'url_query_length',
 'get_tld',
 'url_scheme']