In [None]:
# importing libraries
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
import os

In [None]:
# Print the full path of every file found under the Kaggle input mount.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Reference timestamp: the timing cell near the end measures from here.
t1 = time.time()

In [None]:
# Relative location of the labelled flow CSV.
path = '../input/ip-network-traffic-flows-labeled-with-87-apps/Dataset-Unicauca-Version2-87Atts.csv'

# Read the whole CSV into a DataFrame.
dataset = pd.read_csv(path)

# Last expression of the cell: shows the frame as the cell output.
dataset

In [None]:
# (rows, columns) of the raw dataset.
dataset.shape

In [None]:
# Line plot of the 30 most frequent Source.IP values.
# value_counts() sorts by descending frequency, so head(30) is the top 30.
top_sources = dataset['Source.IP'].value_counts().head(30)
plt.figure(figsize=(20,10))
plt.plot(top_sources.index, top_sources.values)
plt.xticks(rotation=90)
plt.xlabel('Source.IP', {'fontsize':15})
plt.ylabel('Counts', {'fontsize':15})
plt.title('Top 30 Counts in Source.IP\n', {'fontsize':20})
plt.grid()
plt.savefig('hist Source.IP.png')
# Keep just the addresses. They are read straight from the index because
# reset_index()['index'] raises KeyError on pandas >= 2.0, where the index of
# a value_counts() result is named after the column instead of being unnamed.
Sour_feat = top_sources.index.values

In [None]:
# Line plot of the 30 most frequent Destination.IP values.
# value_counts() sorts by descending frequency, so head(30) is the top 30.
top_destinations = dataset['Destination.IP'].value_counts().head(30)
plt.figure(figsize=(20,10))
plt.plot(top_destinations.index, top_destinations.values)
plt.xticks(rotation=90)
plt.xlabel('Destination.IP', {'fontsize':15})
plt.ylabel('Counts', {'fontsize':15})
plt.title('Top 30 Counts in Destination.IP\n', {'fontsize':20})
plt.grid()
plt.savefig('hist Destination.IP.png')
# Keep just the addresses. They are read straight from the index because
# reset_index()['index'] raises KeyError on pandas >= 2.0, where the index of
# a value_counts() result is named after the column instead of being unnamed.
Dest_feat = top_destinations.index.values

In [None]:
# Show the retained Destination.IP addresses.
Dest_feat

In [None]:
# Show the retained Source.IP addresses.
Sour_feat

In [None]:
# Keep only the flows whose destination AND source address both belong to the
# retained (most frequent) address lists, then renumber the rows from 0.
dest_is_frequent = dataset['Destination.IP'].isin(Dest_feat)
src_is_frequent = dataset['Source.IP'].isin(Sour_feat)
f_dataset = dataset[dest_is_frequent & src_is_frequent].reset_index(drop=True)

In [None]:
# One-hot encode the two address columns (used as features) and the
# application name (used as the classification target).
dum_s, dum_d, label = (
    pd.get_dummies(f_dataset[column])
    for column in ('Source.IP', 'Destination.IP', 'ProtocolName')
)

In [None]:
# (rows, distinct source addresses) of the one-hot source frame.
dum_s.shape

In [None]:
# (rows, distinct destination addresses) of the one-hot destination frame.
dum_d.shape

In [None]:
# (rows, distinct ProtocolName values) of the one-hot target frame.
label.shape

In [None]:
# Column labels of the filtered dataset.
f_dataset.columns

In [None]:
# Discard every object-dtype column, then the port and protocol-id columns,
# and show which columns are left.
object_columns = f_dataset.select_dtypes(include=['object']).columns
f_dataset = (
    f_dataset
    .drop(columns=object_columns)
    .drop(columns=['Source.Port', 'Destination.Port', 'L7Protocol', 'Protocol'])
)
f_dataset.columns

In [None]:
# (rows, columns) of the filtered dataset.
f_dataset.shape

In [None]:
# Feature table: the remaining flow columns followed by the one-hot
# source and destination address columns, joined side by side.
p_dataset = pd.concat(objs=[f_dataset, dum_s, dum_d], axis='columns')

In [None]:
# Min-max scale every feature column.
# Note: the scaler is fitted on every row of p_dataset.
scaler = MinMaxScaler()
n_dataset = scaler.fit(p_dataset).transform(p_dataset)

In [None]:
# (rows, features) of the scaled feature matrix.
n_dataset.shape

In [None]:
# splitting the dataset
# X is kept as the fully scaled matrix because the model definition reads
# X.shape[1] for its input width.
X = n_dataset
y = label

# Split the *unscaled* features so the scaling statistics can be learned from
# the training rows alone. Fitting the scaler on all rows before splitting
# lets the min/max of the held-out rows leak into the training features.
# train_size and random_state are unchanged, so the row partition is the same
# one a split of n_dataset would produce.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(p_dataset, y, train_size = 0.4, random_state = 101)

# Fit on training rows only, then apply the same transform to both parts.
# Test values may fall slightly outside [0, 1]; that is expected.
train_scaler = MinMaxScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

print(X_train.shape,'\n',y_train.shape,'\n',X_test.shape,'\n',y_test.shape,'\n')

In [None]:
# defining the model
# Seed TensorFlow's global generator so weight initialisation and batch
# shuffling are repeatable between runs (as far as the backend allows).
# The value matches the random_state used for the train/test split.
tf.random.set_seed(101)

# Fully connected classifier: one input layer without activation (linear),
# three tanh hidden layers, and a softmax output with one unit per class.
model = tf.keras.Sequential(
    layers=[tf.keras.layers.Dense(100, input_shape=[X.shape[1]]),
            tf.keras.layers.Dense(100, activation='tanh'),
            tf.keras.layers.Dense(100, activation='tanh'),
            tf.keras.layers.Dense(100, activation='tanh'),
            tf.keras.layers.Dense(y.shape[1], activation='softmax')])

# The metrics get explicit names. Unnamed Precision()/Recall() objects are
# auto-numbered ('precision_1', 'recall_1', ...) when this cell is executed
# again in the same kernel, which breaks the history.history['precision'] and
# history.history['recall'] lookups in the plotting cell.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy',
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

model.summary()

In [None]:
# Fit the network silently for 100 epochs, holding out the last 20% of the
# training rows for validation; t2 and t3 bracket the fit for the timing cell.
t2 = time.time()
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)
t3 = time.time()

In [None]:
# Training curves: one stacked panel per tracked quantity, each showing the
# training series and its validation counterpart over the epochs.
plt.figure(figsize=(10,10))

for panel, quantity in enumerate(['loss', 'accuracy', 'precision', 'recall'], start=1):
    val_quantity = 'val_' + quantity
    plt.subplot(4, 1, panel)
    plt.plot(history.history[quantity])
    plt.plot(history.history[val_quantity])
    plt.legend([quantity, val_quantity])
    plt.xlabel('epochs')
    plt.ylabel(quantity)
    plt.grid()

plt.savefig('training.png')

In [None]:
# Score the trained model on the held-out rows. evaluate() returns the loss
# followed by the compiled metrics in the order they were declared.
test_scores = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy, precision, recall = test_scores
print('loss:      {}\naccuracy:  {}\nprecision: {}\nrecall:    {}\n'.format(*test_scores))

In [None]:
# Wall-clock summary, truncated to whole seconds:
#   t1 -> t4 : everything since the timer was started
#   t2 -> t3 : model.fit only
#   t1 -> t2 : everything that ran before model.fit
t4 = time.time()

total_seconds = int(t4 - t1)
training_seconds = int(t3 - t2)
preprocessing_seconds = int(t2 - t1)

print('run time            = {} sec'.format(total_seconds))
print('training time       = {} sec'.format(training_seconds))
print('pre-processing time = {} sec'.format(preprocessing_seconds))

In [None]:
# True class name for each test row: the label of the column holding the
# largest value in the one-hot target row.
# reset_index(drop=True) discards the shuffled row labels, and to_frame names
# the single column directly. The previous rename(columns={'0': ...}) used the
# string '0', which never matched the integer label 0 produced by
# reset_index(), so the column was silently left unnamed.
y_true = (
    y_test.idxmax(axis='columns')
    .reset_index(drop=True)
    .to_frame(name='ProtocolName')
)

In [None]:
# Predicted class name for each test row: label the model outputs with the
# target column names, then take the highest-scoring column per row.
class_scores = pd.DataFrame(model.predict(X_test), columns=y_test.columns)
y_pred = class_scores.idxmax(axis='columns').to_frame(name='ProtocolName')

In [None]:
# Confusion counts as a labelled frame: rows are the true class and columns
# the predicted class, both ordered like the one-hot target columns.
class_names = label.columns
confusion_counts = confusion_matrix(y_true, y_pred, labels=class_names)
conf_mat = pd.DataFrame(confusion_counts, index=class_names, columns=class_names)

In [None]:
# Annotated grey-scale heatmap of the confusion matrix, saved as a PNG.
plt.figure(figsize=(24,20))

sns.heatmap(conf_mat, annot=True, cmap='gray', linecolor='white', linewidths=0.01)

plt.title("confusion matrix", fontdict={'fontsize':35})
plt.xlabel('y_pred', fontdict={'fontsize':20})
plt.ylabel('y_true', fontdict={'fontsize':20})
plt.savefig('confusion matrix.png')

In [None]:
# Text report of the per-class scores on the held-out rows.
report_text = classification_report(y_true, y_pred)
print(report_text)