In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.externals import joblib
from xgboost import XGBClassifier 

In [2]:
mldf=pd.DataFrame(pd.read_csv('./Data/trainML.csv'))

In [3]:
Y=mldf['attack_code']
X=mldf.drop(columns=['attack_code'])

#### Building function to automate K-Fold cross validation

In [4]:
def k_fold_modelscore(x, y, model, shuffle=False, splits=3):
    kf=KFold(shuffle=shuffle, n_splits=splits)
    k=0
    df=pd.DataFrame(columns=['k', 'score'])
    for train, test in kf.split(x,y):
        X_train=X.loc[train, :]
        X_test=X.loc[test, :]
        y_train=y[train]
        y_test=y[test]
        model.fit(X_train, y_train)
        k+=1
        m_score=model.score(X_test, y_test)
        
        df.loc[len(df)]=[int(k), m_score]
    return df

## Logistic Regression

In [5]:
lrl2=LogisticRegression() # Logistic regression with L2 regularization
lrl1=LogisticRegression(penalty='l1') # Logistic regression with L1 regularization

In [6]:
print(k_fold_modelscore(X, Y, lrl1))

     k     score
0  1.0  0.998318
1  2.0  1.000000
2  3.0  0.995365


In [8]:
print(k_fold_modelscore(X, Y, lrl2))

     k     score
0  1.0  0.998297
1  2.0  1.000000
2  3.0  0.995514


#### We don't see any advantage of using L1 regularization so we will use L2 regularization

## Random Forest 

In [9]:
rf_params={'max_depth':[10, 20, 30]}
rf=RandomForestClassifier()
rf_gridsearch = GridSearchCV(rf, rf_params, cv=5, verbose=1, n_jobs=2)

In [10]:
X_train, X_test, y_train, y_test=train_test_split(X, Y)
rf_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 21.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'max_depth': [10, 20, 30]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=1)

In [11]:
rf_gridsearch.score(X_test, y_test)

0.9999884682357554

In [12]:
joblib.dump(rf_gridsearch, 'rf.pkl')

['rf.pkl']

## XGBoost

In [18]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [19]:
print(k_fold_modelscore(X, Y, model))

  if diff:
  if diff:
  if diff:


     k     score
0  1.0  0.998627
1  2.0  1.000000
2  3.0  0.939312
