In [None]:
from google.colab import drive
import os
import sys

import numpy as np
import pandas as pd
from argparse import ArgumentParser
import math
import pickle

# Keras imports
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

# Sklearn imports
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import recall_score, accuracy_score, precision_score, confusion_matrix

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/gdrive/')

# Root of the mounted Drive and the project folder inside it.
# NOTE: `customized_path` reflects the author's Drive layout; change it to the
# folder that holds the homework data in your own Drive.
prefix = '/content/gdrive/My Drive/'
customized_path = 'Network Security/'

# Absolute project directory. It is used as the base for every dataset/pickle
# path below and is also appended to the module search path.
sys_path = ''.join([prefix, customized_path])
sys.path.append(sys_path)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Load the raw per-device traffic captures from their .csv files.

# Device selection. `mirai` controls whether the mirai_attacks/ captures are
# loaded; the commented-out Samsung webcam option is paired with mirai = False.
mirai = True
# device_name = 'Ecobee_Thermostat/'
# device_name = 'SimpleHome_XCS7_1003_WHT_Security_Camera/'
# device_name = 'SimpleHome_XCS7_1002_WHT_Security_Camera/'
# device_name = 'Danmini_Doorbell/'
device_name = 'Philips_B120N10_Baby_Monitor/'

# mirai = False
# device_name = 'Samsung_SNH_1011_N_Webcam/'

# Directory of the selected device and its gafgyt sub-directory.
path_to_data = sys_path + 'datasets/baiot/' + device_name
gafgyt_dir = path_to_data + 'gafgyt_attacks/'

# Benign traffic.
benign_data = pd.read_csv(path_to_data + 'benign_traffic.csv')

# Gafgyt attack captures, one frame per attack type.
gafgyt_u_data = pd.read_csv(gafgyt_dir + 'udp.csv')
gafgyt_t_data = pd.read_csv(gafgyt_dir + 'tcp.csv')
gafgyt_s_data = pd.read_csv(gafgyt_dir + 'scan.csv')
gafgyt_j_data = pd.read_csv(gafgyt_dir + 'junk.csv')
gafgyt_c_data = pd.read_csv(gafgyt_dir + 'combo.csv')

# Mirai attack captures; the mirai_* frames only exist when `mirai` is True.
if mirai:
  mirai_dir = path_to_data + 'mirai_attacks/'
  mirai_up_data = pd.read_csv(mirai_dir + 'udpplain.csv')
  mirai_u_data = pd.read_csv(mirai_dir + 'udp.csv')
  mirai_sy_data = pd.read_csv(mirai_dir + 'syn.csv')
  mirai_sc_data = pd.read_csv(mirai_dir + 'scan.csv')
  mirai_a_data = pd.read_csv(mirai_dir + 'ack.csv')



In [None]:
# Tag every frame with its traffic class in a new 'target' column.
# The column is added in place, so it becomes the last column of each frame.

labelled_frames = [
    (benign_data, 'benign'),
    (gafgyt_u_data, 'gafgyt_udp'),
    (gafgyt_t_data, 'gafgyt_tcp'),
    (gafgyt_s_data, 'gafgyt_scan'),
    (gafgyt_j_data, 'gafgyt_junk'),
    (gafgyt_c_data, 'gafgyt_combo'),
]

# The mirai_* frames are only defined when Mirai captures were loaded.
if mirai:
  labelled_frames += [
      (mirai_up_data, 'mirai_udpplain'),
      (mirai_u_data, 'mirai_udp'),
      (mirai_sy_data, 'mirai_syn'),
      (mirai_sc_data, 'mirai_scan'),
      (mirai_a_data, 'mirai_ack'),
  ]

for frame, label in labelled_frames:
  frame['target'] = label

In [None]:
# Benign-only training pool: the first two thirds of the benign rows (in file
# order). The remaining third is kept for the test set.
x = (len(benign_data) * 2) // 3

benign_train, benign_test = (
    benign_data.iloc[:x],
    benign_data.iloc[x:].reset_index(drop=True),
)

# All gafgyt attack rows stacked into a single frame.
gafgyt_data = pd.concat(
    [gafgyt_u_data, gafgyt_t_data, gafgyt_s_data, gafgyt_j_data, gafgyt_c_data],
    axis=0,
    ignore_index=True,
)

# Test set = held-back benign rows + gafgyt rows (+ mirai rows when available).
test_parts = [benign_test, gafgyt_data]
if mirai:
  mirai_data = pd.concat(
      [mirai_up_data, mirai_u_data, mirai_sy_data, mirai_sc_data, mirai_a_data],
      axis=0,
      ignore_index=True,
  )
  test_parts.append(mirai_data)

test_data = pd.concat(test_parts, axis=0, ignore_index=True)

# Show the last rows of the assembled test set.
test_data.tail()


Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_covariance,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,target
981846,126.500331,419.602848,52644.832789,210.361594,418.283351,52924.416073,624.106232,405.922724,55374.288743,6321.382459,...,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,mirai_ack
981847,126.562977,416.761552,53242.540266,210.424949,416.580685,53280.041711,624.178566,405.368519,55476.978155,6321.442225,...,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,mirai_ack
981848,127.336871,413.959838,53816.111144,211.199313,414.892324,53626.954327,624.955386,404.81589,55578.763941,6322.216161,...,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,mirai_ack
981849,127.199003,411.177113,54370.254502,211.064929,413.210887,53966.77809,624.834466,404.264038,55679.797015,6322.081292,...,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,mirai_ack
981850,128.198057,408.43778,54900.630083,212.063987,411.545301,54297.823688,625.833537,403.71395,55779.901199,6323.080352,...,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,mirai_ack


In [None]:
# Split the benign training pool into train and validation parts.
# Every column except the last one ('target') is treated as a feature.
feature_columns = benign_train.columns[:-1]
train_features = benign_train[feature_columns]
train_target = benign_train['target']

# 80/20 shuffled split with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_target,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Test features/labels come from the mixed benign + attack frame.
X_test = test_data[test_data.columns[:-1]]
y_test = test_data['target']

# NOTE: no feature scaling is applied in this cell; the splits are left as
# raw values.

In [None]:
# Persist the benign-trained splits for the selected device as pickles under
# <project>/pkl_files/<device_name>/.
pkl_dir = sys_path + 'pkl_files/' + device_name

# Create the per-device folder on first use; without this the writes below
# fail when switching to a device whose folder does not exist yet.
os.makedirs(pkl_dir, exist_ok=True)

splits_to_save = (
    ('X_train', X_train),
    ('X_val', X_val),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_val', y_val),
    ('y_test', y_test),
)

for split_name, split in splits_to_save:
  split.to_pickle(pkl_dir + split_name + '.pkl')

In [None]:
# For the hybrid network: binary benign-vs-malicious dataset for this device.

# Stack every attack frame (mirai_data only exists when `mirai` is True) and
# collapse the fine-grained attack labels into a single 'malicious' class.
malicious_parts = [gafgyt_data, mirai_data] if mirai else [gafgyt_data]
malicious_df = pd.concat(malicious_parts, axis=0, ignore_index=True)
malicious_df['target'] = 'malicious'

print(benign_data.shape, malicious_df.shape)

# Every column except the last one ('target') is a feature.
all_data = pd.concat([benign_data, malicious_df], axis=0, ignore_index=True)
train_features = all_data[all_data.columns[:-1]]
train_target = all_data['target']

# 80/20 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(train_features,
                                                    train_target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=0)

# NOTE: the held-out split above IS the hybrid test set. It used to be
# overwritten with `test_data`, which (a) contains rows that are also in
# X_train (all attack rows and the last third of the benign rows), leaking
# training data into the evaluation, and (b) carries per-attack labels such as
# 'mirai_ack' instead of the 'benign'/'malicious' labels used for training.
# The MedBIoT hybrid export already keeps the split's own test set.

# Create the per-device output folder if needed, then write the four splits.
hybrid_dir = sys_path + 'pkl_files/hybrid/' + device_name
os.makedirs(hybrid_dir, exist_ok=True)

X_train.to_pickle(hybrid_dir + 'X_train.pkl')
X_test.to_pickle(hybrid_dir + 'X_test.pkl')

y_train.to_pickle(hybrid_dir + 'y_train.pkl')
y_test.to_pickle(hybrid_dir + 'y_test.pkl')

(175240, 116) (923437, 116)


This is to combine all the devices into one

# MedBIoT

In [None]:
# MedBIoT (Mirai subset): build benign-only train/val splits and a mixed
# benign + malicious test set, then pickle them.
med_path = '/content/gdrive/MyDrive/Network Security/datasets/medbiot/'

# NOTE(review): unlike the N-BaIoT cells, no 'target' column is added here, so
# this assumes the MedBIoT CSVs already end with a 'target' column - confirm.
benign_data = pd.read_csv(med_path + 'mirai_leg.csv')
malicious_data = pd.concat(
    [pd.read_csv(med_path + 'mirai_mal_CC_all.csv'),
     pd.read_csv(med_path + 'mirai_mal_spread_all.csv')],
    axis=0,
    ignore_index=True)

# First two thirds of the benign rows (file order) form the training pool;
# the remaining third goes into the test set.
x = len(benign_data)*2//3

benign_train = benign_data.iloc[:x]
benign_test = benign_data.iloc[x:].reset_index(drop=True)

test_data = pd.concat([benign_test, malicious_data], axis=0, ignore_index=True)

# Every column except the last one ('target') is a feature.
train_features = benign_train[benign_train.columns[:-1]]
train_target = benign_train['target']

# 80/20 shuffled split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(train_features,
                                                    train_target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)

X_test = test_data[test_data.columns[:-1]]
y_test = test_data['target']

# Output folder. The trailing slash matters: without it the files were written
# as '.../medbiot/miraiX_train.pkl' instead of '.../medbiot/mirai/X_train.pkl'.
# Any loader that relied on the old 'mirai'-prefixed file names must be updated.
pkl_path = '/content/gdrive/MyDrive/Network Security/pkl_files/medbiot/mirai/'
os.makedirs(pkl_path, exist_ok=True)

X_train.to_pickle(pkl_path + 'X_train.pkl')
X_val.to_pickle(pkl_path + 'X_val.pkl')
X_test.to_pickle(pkl_path + 'X_test.pkl')

y_train.to_pickle(pkl_path + 'y_train.pkl')
y_val.to_pickle(pkl_path + 'y_val.pkl')
y_test.to_pickle(pkl_path + 'y_test.pkl')



In [None]:
# For the hybrid network: MedBIoT benign + malicious rows in one dataset,
# split 80/20 into train and test.
# NOTE(review): relies on `benign_data` / `malicious_data` as loaded by the
# MedBIoT cell and assumes their last column is 'target' - confirm.

all_data = pd.concat([benign_data, malicious_data], axis=0, ignore_index=True)
train_features = all_data[all_data.columns[:-1]]
train_target = all_data['target']

# Shuffled split with a fixed seed; the held-out 20% is the test set.
X_train, X_test, y_train, y_test = train_test_split(train_features,
                                                    train_target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=41)

# Create the output folder on first use so the writes below cannot fail on a
# missing directory.
pkl_path = '/content/gdrive/MyDrive/Network Security/pkl_files/medbiot/mirai/hybrid/'
os.makedirs(pkl_path, exist_ok=True)

X_train.to_pickle(pkl_path + 'X_train.pkl')
X_test.to_pickle(pkl_path + 'X_test.pkl')

y_train.to_pickle(pkl_path + 'y_train.pkl')
y_test.to_pickle(pkl_path + 'y_test.pkl')