In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
class UrlDatasetLoader():
    """Load the ISCX-URL2016 malicious URL dataset and turn it into model inputs."""

    def __init__(self):
        print('init Loader notebook')

    def load_data(self, url="https://raw.githubusercontent.com/quickheaven/scs-3253-machine-learning/master/datasets/ISCX-URL2016_All.csv"):
        """
        (string) --> DataFrame

        Read the malicious URL dataset into a DataFrame.

        Parameters
        ----------
        url : location of the CSV file, handed unchanged to ``pd.read_csv``.
            By default the data is fetched from GitHub; a local path or another
            URL can be provided instead (a local copy avoids the download).

        Returns
        -------
        DataFrame holding the content of the CSV file.
        """
        return pd.read_csv(url)

    def prepare_data(self, data, fill_na=True, feature_selection=True, corr_threshold=0.9):
        """
        (DataFrame, boolean, boolean, float) --> X and y of the dataframe.

        Build the scaled feature matrix X and the encoded target y from the
        malicious URL dataframe. The input frame is copied, never modified.

        Parameters
        ----------
        data : DataFrame containing the target column 'URL_Type_obf_Type', the
            column 'argPathRatio' and, when fill_na is True, the columns listed
            in ``mean_filled_columns`` below.

        fill_na : True to fill the missing values of the listed columns with the
            column mean, otherwise drop every row that has a missing value.

        feature_selection : True to drop every feature whose correlation with an
            earlier feature (in column order) is at least corr_threshold,
            otherwise keep all features. Only positive correlation is tested.

        corr_threshold : correlation at or above which the later feature of a
            pair is dropped. Defaults to 0.9.

        Returns
        -------
        X : DataFrame of the features scaled with MinMaxScaler, labelled with the
            feature names and carrying the same index as y.
        y : Series with the target encoded as integers by LabelEncoder.
        """
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import LabelEncoder

        y_feature = 'URL_Type_obf_Type'
        mean_filled_columns = [
            'avgpathtokenlen',
            'NumberRate_DirectoryName',
            'NumberRate_FileName',
            'NumberRate_Extension',
            'NumberRate_AfterPath',
            'Entropy_DirectoryName',
            'Entropy_Filename',
            'Entropy_Extension',
            'Entropy_Afterpath',
        ]

        data = data.copy()

        if fill_na:
            for column in mean_filled_columns:
                data[column] = data[column].fillna(data[column].mean())
        else:
            data = data.dropna(axis='index')

        data = data.drop("argPathRatio", axis=1) # TODO simply drop this for now

        data[y_feature] = LabelEncoder().fit_transform(data[y_feature])

        # Keep the target out of the feature frame from here on: it must neither be
        # removed by the correlation filter nor cause features to be removed.
        features = data.drop(columns=y_feature)

        if feature_selection:
            # Selecting features based on correlation:
            # compare the correlation between features and remove one or more features
            # that have a correlation of corr_threshold or higher with an earlier feature.
            # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
            corr = features.corr()
            # Strict upper triangle = pairs (i, j) with i < j; column j is dropped when
            # any earlier column i reaches the threshold. NaN correlations compare False.
            too_correlated = np.triu(corr.ge(corr_threshold).to_numpy(), k=1).any(axis=0)
            features = features.drop(columns=corr.columns[too_correlated])

        scaler = MinMaxScaler()

        # Reuse the feature frame's own labels and index so that X stays aligned with y
        # even after rows were dropped, and wherever the target column was located.
        X = pd.DataFrame(scaler.fit_transform(features), columns=features.columns, index=features.index)

        y = data[y_feature]

        return X, y


In [3]:
# nbimporter is imported before loader_nb; presumably it is what lets the
# loader_nb notebook be imported like a regular module on the next line
# (NOTE(review): assumes loader_nb.ipynb is on the import path — confirm).
import nbimporter
import loader_nb

# Create the loader; the cell output below shows the message printed on construction.
loader = loader_nb.UrlDatasetLoader()

init Loader notebook


In [4]:
# No url argument is passed, so the loader's default location is used.
# The returned DataFrame is only displayed here; it is not assigned to a variable.
loader.load_data()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.500000,14,4.400000,4,8,3,0,...,1,0,-1,0.726298,0.784493,0.894886,0.850608,,-1.000000,Defacement
1,0,4,5,5.500000,14,6.000000,4,12,4,0,...,0,0,-1,0.688635,0.784493,0.814725,0.859793,0.000000,-1.000000,Defacement
2,0,4,5,5.500000,14,5.800000,4,12,5,0,...,0,0,-1,0.695049,0.784493,0.814725,0.801880,0.000000,-1.000000,Defacement
3,0,4,12,5.500000,14,5.500000,4,32,16,0,...,0,0,-1,0.640130,0.784493,0.814725,0.663210,0.000000,-1.000000,Defacement
4,0,4,6,5.500000,14,7.333334,4,18,11,0,...,0,0,-1,0.681307,0.784493,0.814725,0.804526,0.000000,-1.000000,Defacement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36702,29,4,14,5.750000,12,3.666667,4,20,24,3,...,3,2,7,0.690555,0.791265,0.777498,0.690227,0.656684,0.796205,spam
36703,0,4,13,3.750000,8,8.461538,4,24,23,0,...,16,15,-1,0.665492,0.820010,0.879588,0.674400,0.674671,-1.000000,spam
36704,58,3,27,6.666666,16,3.375000,3,41,34,20,...,8,7,9,0.656807,0.801139,0.684777,0.713622,0.717187,0.705245,spam
36705,35,3,13,4.333334,9,3.600000,3,15,13,7,...,9,8,3,0.725963,0.897617,0.871049,0.745932,0.758824,0.790772,spam
