In [None]:
from time import time

import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split and the test split, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'/kaggle/input/network-intrusion-detection/Train_data.csv',
        r'/kaggle/input/network-intrusion-detection/Test_data.csv',
    )
)

In [None]:
# Report (rows, columns) for both frames.
for label, frame in (
    ("Train dataset shape - ", train_df),
    ("Test dataset shape - ", test_df),
):
    print(label, frame.shape)

In [None]:
# Remove the cap on displayed columns so wide frames render in full.
pd.options.display.max_columns = None
train_df.head(n=5)

In [None]:
# Preview the first rows of the test frame.
test_df.head()

Notice that the test dataset has no class column, because that is the value we need to predict for the test dataset.

In [None]:
# Print the column-level summary of the training frame.
train_df.info()

Notice that only protocol_type, service, flag and class are of object type; the remaining columns are numeric.

Exploratory analysis

In [None]:
# Count rows per protocol_type and show how long the aggregation took.
t0 = time()
protocol_counts = train_df.groupby('protocol_type')['protocol_type'].count()
print(protocol_counts)
time() - t0

In [None]:
# Count rows per class and show how long the aggregation took.
t0 = time()
# Lift the row display cap before printing the counts.
pd.set_option('display.max_row', None)
class_counts = train_df.groupby('class')['class'].count()
print(class_counts)
time() - t0

In [None]:
# Count rows per flag value and show how long the aggregation took.
t0 = time()
flag_counts = train_df.groupby('flag')['flag'].count()
print(flag_counts)
time() - t0

# Statistical based feature selection

In [None]:
# Column labels of the training frame, used below to pick the columns to correlate.
col_names = train_df.columns

In [None]:
# Inspect the type of the object returned by `.columns`.
type(col_names)

In [None]:
# Select the columns to correlate by dtype rather than by name.
# Dropping only protocol_type/flag/service left the string-typed `class`
# column in the selection, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 (older versions silently skipped them).
num_cols = train_df.select_dtypes(include='number').columns

In [None]:
# Pairwise correlation matrix of the selected columns, drawn as a heatmap.
corr_df = train_df.loc[:, num_cols].corr()
sns.heatmap(data=corr_df)

From the heatmap, notice that the correlation of num_outbound_cmds with every other feature is constant.

In [None]:
# List the distinct values present in num_outbound_cmds.
pd.unique(train_df['num_outbound_cmds'])

The num_outbound_cmds column is 0 for every record, so it carries no information and we can drop it.

In [None]:
# Remove the constant column from the training frame (mutates train_df in place).
train_df.drop(columns=['num_outbound_cmds'], inplace=True)

Find the features that are highly correlated with each other, and drop one feature from each highly correlated pair.

In [None]:
# Boolean mask of feature pairs whose absolute correlation lies strictly
# between 0.9 and 1.0 (the upper bound excludes entries equal to exactly 1).
abs_corr = corr_df.abs()
highly_correlated_df = abs_corr.gt(0.9) & abs_corr.lt(1.0)

In [None]:
# Per-column flag: does the feature have at least one highly correlated partner?
corr_var_index = highly_correlated_df.any()
# Labels of the features that do.
corr_var_names = corr_var_index[corr_var_index].index

de_duplicate = []
corr_pairs = []

# Visit every feature once; a feature already visited is skipped as a partner,
# so each pair is reported a single time and never paired with itself.
for first in corr_var_index.index:
    de_duplicate.append(first)
    flagged = highly_correlated_df[first]
    for second in corr_var_names:
        if second in de_duplicate or not flagged[second]:
            continue
        print(first, second, ": ", corr_df.loc[first, second])
        corr_pairs.append((first, second))



In [None]:
# Features removed from the training frame as the redundant half of a highly
# correlated pair (mutates train_df in place).
redundant_features = [
    'num_root',
    'srv_serror_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]
train_df.drop(columns=redundant_features, inplace=True)

Perform Chi-Square test to check whether categorical features depend on the output value or not. Before that we need to encode the categorical features

In [None]:
# Re-check the column summary now that columns have been dropped.
train_df.info()

In [None]:
# The three categorical input features (the target `class` is handled separately).
categorical_columns = ['protocol_type', 'service', 'flag']

In [None]:
# Preview the categorical features before encoding.
train_df[categorical_columns].head()

In [None]:
# Replace each string column with integer codes, in place.
# The single encoder is refit for every column, so once this cell has run it
# only retains the mapping learnt from the last column, `class`.
label_encoder = preprocessing.LabelEncoder()
for column in ('protocol_type', 'service', 'flag', 'class'):
    train_df[column] = label_encoder.fit_transform(train_df[column])

In [None]:
# Preview the same categorical features after encoding.
train_df[categorical_columns].head()

To compute the chi-square test we need a contingency table. We first consider protocol_type and class. Let the significance level be 0.05.

In [None]:
# Chi-square test of independence between protocol_type and class.
# This cell previously tested `service`, which is tested again further down,
# so protocol_type was never actually checked.
# Independence is rejected when the reported p-value is below the 0.05
# significance level.
chi2_contingency(pd.crosstab(train_df['protocol_type'], train_df['class']))

In [None]:
# Chi-square test of independence between flag and class.
# The original run reported a p-value of 0.0, below the significance level,
# so flag and class are not independent.
flag_vs_class = pd.crosstab(train_df['flag'], train_df['class'])
chi2_contingency(flag_vs_class)

In [None]:
# Chi-square test of independence between service and class.
# The original run reported a p-value of 0.0, below the significance level,
# so service and class are not independent.
service_vs_class = pd.crosstab(train_df['service'], train_df['class'])
chi2_contingency(service_vs_class)

# Split data for training and validation of model

Split the data into 70:30.

In [None]:
# Detach the target: pop() returns the `class` column and removes it from
# train_df in place, leaving only the input features.
Y = train_df.pop('class')

In [None]:

# Hold out 30% of the rows for validation. The fixed seed makes the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_valid, Y_train, Y_valid = model_selection.train_test_split(
    train_df, Y, test_size=0.3, random_state=42
)

# Training the model

In [None]:
# Standardise the features inside the model itself. No earlier step scales the
# inputs, and the default solver is sensitive to feature scale, so fitting on
# the raw columns tends to stop at the iteration cap without converging.
# The pipeline exposes the same fit/predict interface as the bare estimator and
# applies the scaling learnt on X_train to anything passed to predict().
model = make_pipeline(
    preprocessing.StandardScaler(),
    LogisticRegression(max_iter=1000),
)
model = model.fit(X_train, Y_train)

In [None]:
# Predicted class labels for the held-out validation rows.
pred = model.predict(X_valid)

In [None]:
# Confusion matrix on the validation split (rows: true labels, columns: predictions).
metrics.confusion_matrix(y_true=Y_valid, y_pred=pred)

In [None]:
# F1 score on the validation split, using scikit-learn's default positive label.
metrics.f1_score(y_true=Y_valid, y_pred=pred)

# Predicting for test data

Test data should pass through the same preprocessing steps as training data before prediction

In [None]:
# Drop the same columns that were removed from the training frame
# (mutates test_df in place).
test_df.drop(
    columns=[
        'num_outbound_cmds',
        'num_root',
        'srv_serror_rate',
        'dst_host_serror_rate',
        'dst_host_srv_serror_rate',
        'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate',
    ],
    inplace=True,
)

# Encode the categorical features with the TRAINING vocabulary.
# Calling fit_transform on the test frame would derive the integer codes from
# the test set's own labels, so the same string could be mapped to a different
# integer than the one the model was trained on. The training frame has already
# been encoded in place, so its original labels are re-read from the CSV.
train_categories = pd.read_csv(
    r'/kaggle/input/network-intrusion-detection/Train_data.csv',
    usecols=['protocol_type', 'service', 'flag'],
)
for column in ('protocol_type', 'service', 'flag'):
    column_encoder = preprocessing.LabelEncoder().fit(train_categories[column])
    # transform() raises ValueError for a label never seen in training, which
    # is preferable to silently feeding the model an arbitrary code.
    test_df[column] = column_encoder.transform(test_df[column])



In [None]:
# Preview the test frame after preprocessing.
test_df.head()

In [None]:
# Predict the encoded class label for every row of the preprocessed test frame.
model.predict(test_df)

We have reached the end: the model has predicted the network intrusion class for the test data.