In [None]:
%load_ext autoreload
%autoreload 2

In [91]:
import pandas as pd
import numpy as np
import re

In [92]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline, FeatureUnion

In [93]:
from mlxtend.feature_selection import ColumnSelector

In [94]:
df = pd.read_csv('dataset.csv').fillna('None')

In [95]:
df.drop(['CONTENT_LENGTH', 'URL', 'WHOIS_UPDATED_DATE', 'WHOIS_REGDATE'], axis = 1, inplace = True);

In [96]:
def is_in_list(x, listed):
    if x in listed:
        return x
    else:
        return 'other'

In [97]:
def server(string):
    string = string.lower()
    regex_exp = r'(apache|nginx|microsoft)([^\/]+|)(\/|)(\d+\.|\d+\b|)+'
    found = re.search(regex_exp,
                      string)
    if found:
        server = found.group(0).replace(found.group(2),'')
    else:
        server = 'none'
    return server

In [98]:
df['SERVER'] = df['SERVER'].apply(server)
df['DNS_QUERY_TIMES'] = df['DNS_QUERY_TIMES'].replace('None','0').astype(float)

In [99]:
WHOIS_COUNTRY_keep  = ['none','es', 'us', 'cz', 'ru', 'gb', 'cn']
WHOIS_STATEPRO_keep = ['none',
                       'barcelona',
                       'utah',
                       'california',
                       'praha',
                       'wc1n',
                       'krasnoyarsk',
                       'ca',
                       'pa',
                       'washington',
                       'wa',
                       'panama',
                       'arizona',
                       'on',
                       'montevideo',
                       'beijingshi']

In [100]:
df['WHOIS_COUNTRY'] = df['WHOIS_COUNTRY'].str.lower().apply(is_in_list, args = (WHOIS_COUNTRY_keep,))
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].str.lower().apply(is_in_list, args = (WHOIS_STATEPRO_keep,))

In [101]:
df.describe(include=object)

Unnamed: 0,CHARSET,SERVER,WHOIS_COUNTRY,WHOIS_STATEPRO
count,1781,1781,1781,1781
unique,9,77,8,17
top,UTF-8,none,us,other
freq,676,504,1106,664


In [102]:
cat = df.select_dtypes(object).columns.tolist()
num = df.drop('Type', axis = 1).select_dtypes(exclude = object).columns.tolist()
transform = FeatureUnion([
                        ('encoder',   Pipeline([
                            ('select_columns', ColumnSelector(cat)),
                            ('compress_feature', OneHotEncoder(handle_unknown='ignore', sparse = False))
                        ])),
                        ('normalize', Pipeline([
                            ('select_columns', ColumnSelector(num)),
                            ('count_features', StandardScaler())
                        ]))
            ])

In [103]:
matrix = transform.fit_transform(df)

In [104]:
x_train, x_test, y_train, y_test = train_test_split(matrix, df['Type'].values, test_size=0.3, random_state = 8)

In [105]:
lr = LogisticRegressionCV(cv = 12, max_iter = 2000)
rf = RandomForestClassifier(n_estimators = 30, max_depth = 5)
nn = MLPClassifier(hidden_layer_sizes = (123, 31, 8,), verbose = False)

In [106]:
lr.fit(x_train, y_train);
rf.fit(x_train, y_train);
nn.fit(x_train, y_train);

In [107]:
print(lr.score(x_test,y_test))
print(rf.score(x_test,y_test))
print(nn.score(x_test,y_test))

0.9607476635514018
0.9327102803738317
0.9757009345794393


In [114]:
with open('y_test.txt', 'w') as f:
    for v in y_test:
        f.write(str(v)+'\n')

y_prob = lr.predict_proba(x_test)[:,1]
with open('y_prob.txt', 'w') as f:
    for v in y_prob:
        f.write(str(v)+'\n')
        
y_pred = lr.predict(x_test)
with open('y_pred.txt', 'w') as f:
    for v in y_prob:
        f.write(str(v)+'\n')