## Table of Contents
* [Feature Exploration](#1)
* [Target vs Features](#2)
* [Fit Model](#3)
* [Evaluate Model](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning/H2O
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# files
# list the contents of the data set's input directory (IPython shell escape, not Python)
!ls -l ../input/internet-firewall-data-set/

In [None]:
# read the firewall log into a pandas data frame and preview the first rows
csv_path = '../input/internet-firewall-data-set/log2.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# structure of data frame: print the summary produced by DataFrame.info()
df.info()

### Target Distribution

In [None]:
# target distribution: absolute counts, relative frequencies and a bar chart
action_counts = df.Action.value_counts()
# normalize=True yields fractions of the total rather than counts
action_shares = df.Action.value_counts(normalize=True)

print('Absolute Frequencies:')
print(action_counts)
print()

print('Percentages:')
print(action_shares)

action_counts.plot(kind='bar')
plt.title('Target (Action)')
plt.grid()
plt.show()

#### Action "reset-both" is very rare, so it will be extremely hard to predict.

<a id='1'></a>
# Feature Exploration

### Numerical Features

In [None]:
# columns of the data frame that are treated as numerical features
features_num = [
    'Bytes',
    'Bytes Sent',
    'Bytes Received',
    'Packets',
    'Elapsed Time (sec)',
    'pkts_sent',
    'pkts_received',
]

In [None]:
# log transformation used for the numerical features
def num_trafo(x):
    """Return log10(1 + x).

    Works on scalars and, element-wise, on numpy arrays / pandas Series.
    Shifting by one maps x = 0 to 0 instead of -inf.
    """
    shifted = x + 1
    return np.log10(shifted)

In [None]:
# per numerical feature: histogram (top) and horizontal boxplot (bottom)
# of the log-transformed values, sharing the x axis
for col in features_num:
    values_log = num_trafo(df[col])
    panel_title = 'Feature: ' + col + ' - trafo [log_10(1+x)]'

    fig, (ax_hist, ax_box) = plt.subplots(2, 1, figsize=(11,7), sharex=True)

    ax_hist.hist(values_log, bins=20)
    ax_hist.grid()
    ax_hist.set_title(panel_title)

    ax_box.boxplot(values_log, vert=False)
    ax_box.grid()
    ax_box.set_title(panel_title)

    plt.show()

### Categorical Features

In [None]:
# port columns, treated as categorical features
features_cat = [
    'Source Port',
    'Destination Port',
    'NAT Source Port',
    'NAT Destination Port',
]

#### Remark: NAT = Network Address Translation

In [None]:
# show only top 10 levels for each feature
for f in features_cat:
    # value_counts() sorts by frequency (descending), so the first 10 rows are
    # the 10 most frequent levels; compute them once and reuse the result for
    # both the printout and the bar chart. head(10) selects by position
    # explicitly, which matters because the index holds integer port numbers.
    top10_counts = df[f].value_counts().head(10)
    print('Feature:', f)
    print(top10_counts)
    print()
    top10_counts.plot(kind='bar')
    plt.title(f)
    plt.grid()
    plt.show()

In [None]:
# scatter plot of source port against destination port (all rows)
x_col, y_col = 'Source Port', 'Destination Port'
plt.figure(figsize=(7,7))
plt.scatter(df[x_col], df[y_col], alpha=0.05)
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

In [None]:
# same scatter plot for the NAT (Network Address Translation) ports
x_col, y_col = 'NAT Source Port', 'NAT Destination Port'
plt.figure(figsize=(7,7))
plt.scatter(df[x_col], df[y_col], alpha=0.05)
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

<a id='2'></a>
# Target vs Features

### Numerical Features

In [None]:
# add a log-transformed copy of every numerical feature as column '<name>_trafo'
features_num_trafo = [name + '_trafo' for name in features_num]

for name, name_trafo in zip(features_num, features_num_trafo):
    df[name_trafo] = num_trafo(df[name])

In [None]:
# plot features distribution by target level
for f in features_num_trafo: # use transformed features for plot
    plt.figure(figsize=(10,6))
    sns.violinplot(x=f, y='Action', data=df)
    my_title = 'Distribution by Action for ' + f
    plt.title(my_title)
    plt.grid()
    # render each figure explicitly, like the other plotting loops in this
    # file, instead of relying on an implicit flush at the end of the cell
    plt.show()

### Categorical Features

In [None]:
# heatmap of the Action x feature contingency table, limited to the
# 10 most frequent levels of each categorical feature
for col in features_cat:
    top_levels = df[col].value_counts().head(10).index.tolist()
    df_top = df.loc[df[col].isin(top_levels)]
    cross_counts = pd.crosstab(df_top.Action, df_top[col])

    print('Feature:' + col + ' - Top 10 levels only')

    plt.figure(figsize=(12,5))
    sns.heatmap(
        cross_counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        linecolor='black',
        linewidths=0.1,
    )
    plt.show()

### Source/Destination plots split by target

In [None]:
# source/destination plot by Action
xx = 'Source Port'
yy = 'Destination Port'

fig, axs = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10,10))

# one panel per Action level, filled row by row; each level has its own marker opacity
panels = [('allow', 0.05), ('deny', 0.05), ('drop', 0.5), ('reset-both', 0.5)]

for panel, (action_level, point_alpha) in zip(axs.flat, panels):
    df_temp = df[df.Action == action_level]
    panel.scatter(df_temp[xx], df_temp[yy], alpha=point_alpha)
    panel.set_title('Action = ' + action_level)
    panel.set_xlabel(xx)
    panel.set_ylabel(yy)
    panel.grid()

plt.show()

In [None]:
# source/destination plot by Action - NAT (Network Address Translation) version
xx = 'NAT Source Port'
yy = 'NAT Destination Port'

fig, axs = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10,10))

# one panel per Action level, filled row by row; each level has its own marker opacity
panels = [('allow', 0.05), ('deny', 0.5), ('drop', 0.5), ('reset-both', 0.5)]

for panel, (action_level, point_alpha) in zip(axs.flat, panels):
    df_temp = df[df.Action == action_level]
    panel.scatter(df_temp[xx], df_temp[yy], alpha=point_alpha)
    panel.set_title('Action = ' + action_level)
    panel.set_xlabel(xx)
    panel.set_ylabel(yy)
    panel.grid()

plt.show()

<a id='3'></a>
# Fit Model

In [None]:
# init H2O
# memory limit ('12G') and thread count (4) are hard-coded here; adjust to the host
h2o.init(max_mem_size='12G', nthreads=4)

In [None]:
# upload data frame in H2O environment
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments (unlike time.time())
t1 = time.perf_counter()
df_hex = h2o.H2OFrame(df)
t2 = time.perf_counter()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# modelling setup: response column and predictor columns
target = 'Action'
features = [*features_num_trafo, *features_cat]
print('Features used:', features)

# convert the response to a factor (categorical) => classification problem
df_hex[target] = df_hex[target].asfactor()

### Train/Test Split

In [None]:
# split the H2O frame into a training part (ratio train_perc) and a test part;
# the seed is fixed so the split is repeatable
train_perc = 0.7
split_parts = df_hex.split_frame(ratios=[train_perc], seed=999)
train_hex, test_hex = split_parts

In [None]:
# check target distribution in train set
# (the target column is converted to a pandas data frame, then its levels are counted)
train_hex[target].as_data_frame().value_counts()

In [None]:
# check target distribution in test set
# (the target column is converted to a pandas data frame, then its levels are counted)
test_hex[target].as_data_frame().value_counts()

In [None]:
# define (distributed) random forest model
# n_cv is the number of cross-validation folds passed to the estimator
n_cv = 5
# NOTE(review): with ntrees=5 and stopping_rounds=5, early stopping presumably
# can never trigger before all trees are built - confirm this is intended
fit_DRF = H2ORandomForestEstimator(ntrees=5,
                                   max_depth=20,
                                   min_rows=5,
                                   nfolds=n_cv,
                                   score_each_iteration=True,
                                   stopping_metric='logloss',
                                   stopping_rounds=5,
                                   stopping_tolerance=1e-4,
                                   seed=999)

# train model
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments (unlike time.time())
t1 = time.perf_counter()
fit_DRF.train(x=features,
              y=target,
              training_frame=train_hex)
t2 = time.perf_counter()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# cross validation metrics: display the summary reported by the fitted model
fit_DRF.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# fetch the per-fold models once instead of once per loop iteration
cv_models = fit_DRF.cross_validation_models()
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() is the current API name; score_history() is only a
    # deprecated alias for it
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [logloss]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_logloss, 
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_logloss, 
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('logloss')
    # fixed y range so the plots of all folds are directly comparable
    plt.ylim(0,0.1)
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# variable importance: plot produced by the fitted model
fit_DRF.varimp_plot()

<a id='4'></a>
# Evaluate Model

### Performance on Training Set

In [None]:
# score the training frame and keep the actual label next to the predictions
pred_train_hex = fit_DRF.predict(train_hex)
pred_train_hex['target'] = train_hex[target]

# convert the H2O result to a pandas data frame and preview it
pred_train = pred_train_hex.as_data_frame()
pred_train.head()

In [None]:
# confusion matrix on the training set: rows = actual label, columns = predicted label
conf_train = pd.crosstab(index=pred_train['target'], columns=pred_train['predict'])

# annotated heatmap of the counts
heatmap_style = dict(cmap='Blues', annot=True, cbar=False, fmt='d',
                     linecolor='black', linewidths=0.1)
sns.heatmap(conf_train, **heatmap_style)
plt.show()

### Performance on Test Set

In [None]:
# score the test frame and keep the actual label next to the predictions
pred_test_hex = fit_DRF.predict(test_hex)
pred_test_hex['target'] = test_hex[target]

# convert the H2O result to a pandas data frame and preview it
pred_test = pred_test_hex.as_data_frame()
pred_test.head()

In [None]:
# confusion matrix on the test set: rows = actual label, columns = predicted label
conf_test = pd.crosstab(index=pred_test['target'], columns=pred_test['predict'])

# annotated heatmap of the counts
heatmap_style = dict(cmap='Blues', annot=True, cbar=False, fmt='d',
                     linecolor='black', linewidths=0.1)
sns.heatmap(conf_test, **heatmap_style)
plt.show()