In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r"UNSW_2018_IoT_Botnet_V6.csv", sep=",", low_memory=False)

### Dataset Information

In [3]:
df.sample(10)

Unnamed: 0,pkts,bytes,dur,spkts,dpkts,sbytes,dbytes,rate,srate,drate,category
817510,8,480,24.863705,8,0,480,0,0.281535,0.281535,0.0,DoS-UDP
1464091,7,890,14.632784,6,1,830,60,0.410038,0.341698,0.0,DDoS-TCP
38318,7,420,25.991417,7,0,420,0,0.230845,0.230845,0.0,DoS-UDP
2900085,10,600,13.547964,10,0,600,0,0.664306,0.664306,0.0,DDoS-UDP
684485,16,960,32.477962,16,0,960,0,0.461852,0.461852,0.0,DoS-UDP
220398,11,660,23.753277,11,0,660,0,0.420995,0.420995,0.0,DoS-UDP
3507441,9,822,49.660049,6,3,642,180,0.161095,0.100685,0.040274,DoS-TCP
2493404,5,300,12.163426,5,0,300,0,0.328855,0.328855,0.0,DDoS-UDP
1539909,6,736,11.918358,5,1,676,60,0.419521,0.335617,0.0,DDoS-TCP
1728931,6,736,10.625074,5,1,676,60,0.470585,0.376468,0.0,DDoS-TCP


In [4]:
df.shape

(3577361, 11)

In [5]:
X = df.iloc[:,:-1]
y = df.iloc[:, -1]
print(Counter(y))

Counter({'DoS-UDP': 1032975, 'DDoS-TCP': 977380, 'DDoS-UDP': 948255, 'DoS-TCP': 615800, 'DoS-HTTP': 1485, 'DDoS-HTTP': 989, 'Normal': 477})


### RandomOverSampler 

In [6]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
print(Counter(y_over))

Counter({'DoS-UDP': 1032975, 'Normal': 1032975, 'DDoS-TCP': 977380, 'DDoS-UDP': 948255, 'DoS-TCP': 615800, 'DoS-HTTP': 1485, 'DDoS-HTTP': 989})


### Spliting 

In [7]:
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over, test_size=0.3, random_state=0)
print(Counter(y_train_over))
print(Counter(y_test_over))

Counter({'DoS-UDP': 723670, 'Normal': 722441, 'DDoS-TCP': 684217, 'DDoS-UDP': 663743, 'DoS-TCP': 431100, 'DoS-HTTP': 1036, 'DDoS-HTTP': 694})
Counter({'Normal': 310534, 'DoS-UDP': 309305, 'DDoS-TCP': 293163, 'DDoS-UDP': 284512, 'DoS-TCP': 184700, 'DoS-HTTP': 449, 'DDoS-HTTP': 295})


In [8]:
X_train_over["class"] = y_train_over
X_test_over["class"] = y_test_over

### Information spliting result

In [9]:
X_train_over.head()

Unnamed: 0,pkts,bytes,dur,spkts,dpkts,sbytes,dbytes,rate,srate,drate,class
1524905,5,770,13.373815,5,0,770,0,0.299092,0.299092,0.0,DDoS-TCP
1366872,5,770,15.905256,5,0,770,0,0.251489,0.251489,0.0,DDoS-TCP
2187233,9,540,12.423796,9,0,540,0,0.643926,0.643926,0.0,DDoS-UDP
3922742,2,120,5.5e-05,1,1,60,60,18181.818359,0.0,0.0,Normal
3220387,3,274,0.143805,2,1,214,60,13.907722,6.953861,0.0,DoS-TCP


In [10]:
y_train_over.head()

1524905    DDoS-TCP
1366872    DDoS-TCP
2187233    DDoS-UDP
3922742      Normal
3220387     DoS-TCP
Name: category, dtype: object

In [11]:
X_test_over.head()

Unnamed: 0,pkts,bytes,dur,spkts,dpkts,sbytes,dbytes,rate,srate,drate,class
1679885,6,736,10.750988,5,1,676,60,0.465074,0.372059,0.0,DDoS-TCP
3021811,4,616,28.765491,4,0,616,0,0.104292,0.104292,0.0,DoS-TCP
2867947,14,840,14.276105,14,0,840,0,0.910613,0.910613,0.0,DDoS-UDP
2234374,6,360,10.869819,6,0,360,0,0.459989,0.459989,0.0,DDoS-UDP
919151,7,420,24.315582,7,0,420,0,0.246755,0.246755,0.0,DoS-UDP


In [12]:
y_test_over.head()

1679885    DDoS-TCP
3021811     DoS-TCP
2867947    DDoS-UDP
2234374    DDoS-UDP
919151      DoS-UDP
Name: category, dtype: object

### Export Data to CSV 

In [13]:
X_train_over.to_csv("UNSW_2018_IoT_Botnet_V6_Train.csv", index=False)
X_test_over.to_csv("UNSW_2018_IoT_Botnet_V6_Test.csv", index=False)