# Read Directories

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier,RidgeClassifier
from sklearn.metrics import (precision_score, recall_score,f1_score)
from sklearn.metrics import average_precision_score

In [None]:
######## Base
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option('display.max_columns', None)

######### Warning ##############
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


########## Sklearn #############
# Pre-processing
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
# Models
from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.naive_bayes import GaussianNB              # Naive Bayes
from sklearn.neighbors import KNeighborsClassifier      # KNN 
from sklearn.svm import SVC                             # SVC 
from sklearn import tree                                # CART - Classification and Regression Trees
from sklearn.tree import DecisionTreeClassifier         # CART - Classification and Regression Trees
from sklearn.ensemble import BaggingClassifier          # Bagging
from sklearn.ensemble import VotingClassifier           # Voting 
from sklearn.ensemble import RandomForestClassifier     # Random Forest
from sklearn.ensemble import AdaBoostClassifier         # Ada Boost
from sklearn.ensemble import GradientBoostingClassifier # GBM - Gradient Boosting Machine
from xgboost import XGBClassifier                       # XGBoost | !pip install xgboost
from lightgbm import LGBMClassifier                     # LightGBM | !conda install -c conda-forge lightgbm
from catboost import CatBoostClassifier                 # CatBoost | !pip install catboost
!pip install --upgrade nboost                           # NGBoost
!pip install --upgrade git+https://github.com/stanfordmlgroup/ngboost.git
from ngboost import NGBClassifier
from ngboost.distns import k_categorical, Bernoulli

# 2. Loading Data

In [None]:
# Load the UNSW-NB15 training and testing partitions from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv',
        '/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv',
    )
)


# 3. Data Visualization

In [None]:
# Report the size of each split
print(f"Train data length: {len(train)}")
print(f"Test data length: {len(test)}")

# 2x2 grid: top row = binary label counts, bottom row = attack-category counts;
# left column = training split, right column = testing split
f, axes = plt.subplots(2, 2, figsize=(12, 10))

panels = [
    (train, "Training data distribution"),
    (test, "Testing data distribution"),
]
for col, (frame, panel_title) in enumerate(panels):
    label_ax = axes[0, col]
    category_ax = axes[1, col]

    sns.countplot(x="label", data=frame, ax=label_ax)
    # Attack categories are ordered by descending frequency within this split
    sns.countplot(x="attack_cat", data=frame, ax=category_ax,
                  order=frame['attack_cat'].value_counts().index)

    label_ax.set_title(panel_title)
    category_ax.set_title(panel_title)

    # Category names are long, so tilt them; relabel the binary ticks for readability
    category_ax.tick_params('x', labelrotation=45)
    label_ax.set_xticklabels(["Normal", "Attack"])

    # The titles already say what is plotted, so drop the x-axis labels
    label_ax.set_xlabel("")
    category_ax.set_xlabel("")

# Leave horizontal room between the columns for the y-axis labels
plt.subplots_adjust(wspace=0.25)

# 4. Dataset Observation

In [None]:
train.head()

In [None]:
train.shape,test.shape

In [None]:
train.info()

# 3. Data Preprocessing

## Null Value check

In [None]:
train.isnull().sum()

## Categorical variables

In [None]:
# Boolean mask over the columns whose dtype is `object` (the string-valued columns).
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24
# and now raises AttributeError.
mask = (train.dtypes == object)
print(train.loc[:,mask].head())
list_cat = train.loc[:,mask].columns.tolist()
print(list_cat)
print(train.loc[:,mask].values)

## Numeric variables

In [None]:
# Boolean mask over the columns whose dtype is NOT `object` (the numeric columns).
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24
# and now raises AttributeError.
mask = (train.dtypes != object)
print(train.loc[:,mask].head())
# NOTE: despite its name, `list_cat` holds the numeric column names from here on
# (the variable name is kept so existing references keep working).
list_cat = train.loc[:,mask].columns.tolist()
print(list_cat)
train.loc[:,mask].describe()


## Evaluation of the training dataset

In [None]:
# Sanity check: the binary label and the attack category must agree.
is_attack_label = (train.label == 1)
is_attack_category = (train.attack_cat != 'Normal')
flagged_by_both = is_attack_label & is_attack_category

# True when every row with a non-'Normal' category also carries label 1
print(all(flagged_by_both == is_attack_category))
# True when every row with label 1 also has a non-'Normal' category
print(all(flagged_by_both == is_attack_label))

## number of occurrences for each attack category

In [None]:
train.attack_cat.value_counts()

## Which protocols and services appear in the positively labelled entries?

In [None]:
# Frequency of each service and protocol among rows labelled 1
mask = (train.label == 1)
positive_rows = train.loc[mask, :]
print(positive_rows.service.value_counts())
print(positive_rows.proto.value_counts())

## In the negatively labelled ones?

In [None]:
# Frequency of each service and protocol among rows labelled 0
mask = (train.label == 0)
negative_rows = train.loc[mask, :]
print(negative_rows.service.value_counts())
print(negative_rows.proto.value_counts())

# Data cleaning

In [None]:
print(train.columns.values)
print(test.columns.values)

* Scale the data before doing anomaly detection.
* Anomaly detection methods tend to work better with scaled data, although scaling is not strictly required.
* Scale only continuous data.

In [None]:
# Stack train and test so one-hot encoding yields the same dummy columns for both.
# ignore_index=True gives a fresh 0..N-1 index with the training rows first.
df = pd.concat([train, test], ignore_index=True)

# Remove unwanted columns
df.drop(['id', 'attack_cat'], inplace=True, axis=1)

# One-hot encode the categorical columns, join the dummies back, then drop the originals
categorical_cols = ["proto", "state", "service"]
one_hot = pd.get_dummies(df[categorical_cols])
df = df.join(one_hot)
df.drop(categorical_cols, inplace=True, axis=1)

# Re-split into train / test using the actual training-set length rather than a
# hard-coded row count, so the split stays correct if the input files change.
# .copy() makes the in-place drops below act on independent frames, not on views of df.
n_train = len(train)
train_data = df.iloc[:n_train].copy()
test_data = df.iloc[n_train:].copy()

# Separate the target from the features
y_train = np.array(train_data["label"])
train_data.drop(['label'], inplace=True, axis=1)

y_test = np.array(test_data["label"])
test_data.drop(['label'], inplace=True, axis=1)

# Use min-max scaler to scale the features to 0-1 range
# Only fit the scaler on the train data, then reuse its statistics on the test data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_data)
X_test = scaler.transform(test_data)

# Ensure our dataset splits are still correct
print(f"Train data shape: {X_train.shape} Train label shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape} Test label shape: {y_test.shape}")

# 5. Machine Learning Analysis

In [None]:
# Logistic Regression: fit on the scaled training data, then score on the test data
log = LogisticRegression(solver="liblinear")
y_pred_log_fit = log.fit(X_train, y_train)  # keeps the value returned by fit()
y_pred_log = y_pred_log_fit.predict(X_test)
log_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_log)

In [None]:
log_accuracy

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall swap places, so pass the ground truth first.
print ("Accuracy: " + str(accuracy_score(y_test, y_pred_log)))
print ("Precision: " + str(precision_score(y_test, y_pred_log)))
print ("Recall: " + str(recall_score(y_test, y_pred_log)))
print ("F1: " + str(f1_score(y_test, y_pred_log)))

In [None]:
confusion_matrix(y_test, y_pred_log)

In [None]:
print(classification_report(y_test, y_pred_log))

## AUROC Score

In [None]:
# AUROC is a ranking metric: feed it the continuous probability of class 1 rather than
# thresholded 0/1 predictions, which would collapse the curve to a single operating point.
log_roc_auc_score = roc_auc_score(y_test, log.predict_proba(X_test)[:, 1])

In [None]:
log_roc_auc_score

In [None]:
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, 
                             classification_report, f1_score, average_precision_score, precision_recall_fscore_support)

In [None]:
# ROC and precision-recall curves sweep a decision threshold, so they need continuous
# scores (probability of class 1), not the already-thresholded 0/1 predictions.
y_score_log = log.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_score_log)
roc_auc_lr = auc(fpr_lr, tpr_lr)
precision_lr, recall_lr, th_lr = precision_recall_curve(y_test, y_score_log)

## AUPRC Score

In [None]:
# Average precision summarises the whole precision-recall curve, so it needs continuous
# scores (probability of class 1) rather than thresholded 0/1 predictions.
log_auprc_score=average_precision_score(y_test, log.predict_proba(X_test)[:, 1])

In [None]:
log_auprc_score

In [None]:
# Ridge Classifier: fit on the scaled training data, then score on the test data
rc = RidgeClassifier()
rc_fit = rc.fit(X_train, y_train)  # keeps the value returned by fit()
y_pred_rc = rc_fit.predict(X_test)
rc_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rc)

In [None]:
rc_accuracy

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall swap places, so pass the ground truth first.
print ("Accuracy: " + str(accuracy_score(y_test, y_pred_rc)))
print ("Precision: " + str(precision_score(y_test, y_pred_rc)))
print ("Recall: " + str(recall_score(y_test, y_pred_rc)))
print ("F1: " + str(f1_score(y_test, y_pred_rc)))

In [None]:
confusion_matrix(y_test, y_pred_rc)

In [None]:
print(classification_report(y_test, y_pred_rc))

## AUROC Score

In [None]:
# AUROC is a ranking metric: feed it the continuous decision values rather than
# thresholded 0/1 predictions, which would collapse the curve to a single operating point.
rc_roc_auc_score = roc_auc_score(y_test, rc.decision_function(X_test))

In [None]:
rc_roc_auc_score

In [None]:
# ROC and precision-recall curves sweep a decision threshold, so they need the
# continuous decision values, not the already-thresholded 0/1 predictions.
y_score_rc = rc.decision_function(X_test)
fpr_rc, tpr_rc, thresholds_rc = roc_curve(y_test, y_score_rc)
roc_auc_rc = auc(fpr_rc, tpr_rc)
precision_rc, recall_rc, th_rc = precision_recall_curve(y_test, y_score_rc)

## AUPRC Score

In [None]:
# Average precision summarises the whole precision-recall curve, so it needs the
# continuous decision values rather than thresholded 0/1 predictions.
rc_auprc_score = average_precision_score(y_test, rc.decision_function(X_test))

In [None]:
rc_auprc_score

In [None]:
# SGD Classifier: fit on the scaled training data, then score on the test data
sgd = SGDClassifier()
sgd_fit = sgd.fit(X_train, y_train)  # keeps the value returned by fit()
y_pred_sgd = sgd_fit.predict(X_test)
sgd_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_sgd)

In [None]:
sgd_accuracy

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall swap places, so pass the ground truth first.
print ("Accuracy: " + str(accuracy_score(y_test, y_pred_sgd)))
print ("Precision: " + str(precision_score(y_test, y_pred_sgd)))
print ("Recall: " + str(recall_score(y_test, y_pred_sgd)))
print ("F1: " + str(f1_score(y_test, y_pred_sgd)))

In [None]:
confusion_matrix(y_test, y_pred_sgd)

In [None]:
print(classification_report(y_test, y_pred_sgd))

## AUROC Score

In [None]:
# AUROC is a ranking metric: feed it the continuous decision values rather than
# thresholded 0/1 predictions, which would collapse the curve to a single operating point.
sgd_roc_auc_score = roc_auc_score(y_test, sgd.decision_function(X_test))

In [None]:
sgd_roc_auc_score

In [None]:
# ROC and precision-recall curves sweep a decision threshold, so they need the
# continuous decision values, not the already-thresholded 0/1 predictions.
y_score_sgd = sgd.decision_function(X_test)
fpr_sgd, tpr_sgd, thresholds_sgd = roc_curve(y_test, y_score_sgd)
roc_auc_sgd = auc(fpr_sgd, tpr_sgd)
precision_sgd, recall_sgd, th_sgd = precision_recall_curve(y_test, y_score_sgd)

## AUPRC Score

In [None]:
# Average precision summarises the whole precision-recall curve, so it needs the
# continuous decision values rather than thresholded 0/1 predictions.
sgd_auprc_score = average_precision_score(y_test, sgd.decision_function(X_test))

In [None]:
sgd_auprc_score

## Ensemble Learning

In [None]:
# Ensemble learning: per-sample majority vote over the three classifiers.
import statistics  # no longer used by this cell; kept in case other cells rely on it

# Vectorized replacement for calling statistics.mode() row by row and growing the
# result with np.append (which re-copies the whole array on every iteration):
#   - if Ridge and SGD agree, their shared label holds at least two of the three votes;
#   - otherwise Logistic Regression either sides with one of them or is the
#     first-listed value in a three-way tie, which is what statistics.mode() returns.
# The float cast matches the dtype of the array the original loop produced.
final_pred = np.where(y_pred_rc == y_pred_sgd, y_pred_rc, y_pred_log).astype(float)

In [None]:
import seaborn
import matplotlib.pyplot as plt
 
def plot_confusion_matrix(cm,
                          target_names,
                          title,
                          cmap=None,
                          normalize=True):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Confusion matrix data (list of lists or 2-D array). Rows are
            labelled as true classes and columns as predicted classes.
        target_names (list): Tick labels drawn along both axes.
        title (str): Title of the figure.
        cmap: Colormap forwarded to ``seaborn.heatmap``. ``None`` selects
            "YlGnBu".
        normalize (bool): Accepted for call compatibility but not used; the
            cells always show the values of ``cm`` exactly as given.

    The x-axis label also reports the accuracy (trace / total) and the
    misclassification rate derived from ``cm``. The figure is shown and then
    closed; nothing is returned.
    """
    if cmap is None:
        cmap = "YlGnBu"

    seaborn.set(color_codes=True)
    plt.figure(1, figsize=(9, 6))

    plt.title(title)

    seaborn.set(font_scale=1.4)
    ax = seaborn.heatmap(cm, annot=True, cmap=cmap, cbar_kws={'label': 'Scale'}, fmt=".5g")
    ax.set_xticklabels(target_names)
    ax.set_yticklabels(target_names)

    # Correct predictions sit on the diagonal, so their share of the total is the accuracy
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    ax.set(ylabel="True Label", xlabel='Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
    plt.close()

In [None]:
# Share of test samples on which the majority vote matches the true label
ensamble_accuracy = accuracy_score(y_pred=y_test, y_true=final_pred)

In [None]:
ensamble_accuracy

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall swap places, so pass the ground truth first.
print ("Accuracy: " + str(accuracy_score(y_test, final_pred)))
print ("Precision: " + str(precision_score(y_test, final_pred)))
print ("Recall: " + str(recall_score(y_test, final_pred)))
print ("F1: " + str(f1_score(y_test, final_pred)))

In [None]:
# Curves for the voted predictions. final_pred holds class votes rather than
# continuous scores, so the ensemble curves are built from those votes directly.
precision_en, recall_en, th_en = precision_recall_curve(y_test, final_pred)
fpr_en, tpr_en, thresholds_en = roc_curve(y_test, final_pred)
roc_auc_en = auc(x=fpr_en, y=tpr_en)

In [None]:
roc_auc_en

In [None]:
plot_confusion_matrix(cm = confusion_matrix(y_test, final_pred, labels=[0,1]), 
                      normalize    = False,
                      target_names = [0,1],
                      title        = "Binary Classification")

In [None]:
# Plot the ROC curve of every model on one set of axes
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference

roc_series = [
    (fpr_lr, tpr_lr, 'Log Reg (area = %0.3f)' % roc_auc_lr),
    (fpr_rc, tpr_rc, 'Ridge Classifier (area = %0.3f)' % roc_auc_rc),
    (fpr_sgd, tpr_sgd, 'SGD (area = %0.3f)' % roc_auc_sgd),
    (fpr_en, tpr_en, 'Ensemble (area = %0.3f)' % roc_auc_en),
]
for false_pos_rate, true_pos_rate, legend_text in roc_series:
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves from the investigated models')
plt.legend(loc='best')
plt.show()

In [None]:
# Plot the precision-recall curve of every model on one set of axes
plt.plot([1, 0], [0, 1], 'k--')  # dashed anti-diagonal as a visual reference

pr_series = [
    (recall_lr, precision_lr, 'Log Reg'),
    (recall_rc, precision_rc, 'Ridge Classifier'),
    (recall_sgd, precision_sgd, 'SGD'),
    (recall_en, precision_en, 'Ensemble'),
]
for recall_values, precision_values, legend_text in pr_series:
    plt.plot(recall_values, precision_values, label=legend_text)

plt.title('Precision vs. Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')
plt.show()

In [None]:
import numpy as np

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, scoring=None, obj_line=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy; forwarded
        unchanged to ``sklearn.model_selection.learning_curve``.
        Possible inputs for cv are:
          - None, to use scikit-learn's default splitting
            (5-fold in current releases),
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation)
        or a scorer callable object / function with signature
        scorer(estimator, X, y). See
        http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        For example, Log Loss is specified as 'neg_log_loss'.

    obj_line : numeric or None (default: None)
        y-value at which to draw a horizontal reference line. Any number,
        including 0, draws the line; None draws nothing.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``
        (default: five fractions evenly spaced from 0.1 to 1.0).

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the curve drawn on a new figure;
    call ``plt.show()`` to display it.

    Citation
    --------
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    Usage
    -----
        plot_learning_curve(estimator = best_estimator,
                            title     = best_estimator_title,
                            X         = X_train,
                            y         = y_train,
                            ylim      = (-1.1, 0.1), # neg_log_loss is negative
                            cv        = StratifiedKFold(), # CV generator
                            scoring   = scoring,     # eg., 'neg_log_loss'
                            obj_line  = obj_line,    # horizontal line
                            n_jobs    = n_jobs)      # how many CPUs

         plt.show()
    """
    # Imported locally so the function does not depend on the surrounding cell's imports
    from sklearn.model_selection import learning_curve
    from matplotlib import pyplot as plt

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate across folds (axis 1): one mean and one spread per training-set size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands show +/- one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    # Compare against None explicitly so that a reference line at 0 is still drawn
    if obj_line is not None:
        plt.axhline(y=obj_line, color='blue')

    plt.legend(loc="best")
    return plt

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold

In [None]:
# Learning curve for Logistic Regression on the scaled training data
X = X_train
y = y_train

estimator = LogisticRegression(solver="liblinear")
curve_options = dict(
    ylim=(0.5, 1.1),
    cv=StratifiedKFold(),
    scoring='accuracy',
    obj_line=0.90,   # reference line at 90% accuracy
    n_jobs=-1,       # use all available cores
)
plot_learning_curve(estimator, "Learning Curves (Log Regression)", X, y, **curve_options)
plt.show()


In [None]:
# Learning curve for the SGD classifier (reuses X and y set in an earlier cell)
estimator = SGDClassifier()
curve_options = dict(
    ylim=(0.5, 1.1),
    cv=StratifiedKFold(),
    scoring='accuracy',
    obj_line=0.90,   # reference line at 90% accuracy
    n_jobs=-1,       # use all available cores
)
plot_learning_curve(estimator, "Learning Curves (SGD)", X, y, **curve_options)
plt.show()


In [None]:
# Learning curve for the Ridge classifier (reuses X and y set in an earlier cell)
estimator = RidgeClassifier()
curve_options = dict(
    ylim=(0.5, 1.1),
    cv=StratifiedKFold(),
    scoring='accuracy',
    obj_line=0.90,   # reference line at 90% accuracy
    n_jobs=-1,       # use all available cores
)
plot_learning_curve(estimator, "Learning Curves (Ridge classifier)", X, y, **curve_options)
plt.show()