In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r"UNSW_2018_IoT_Botnet_V6.csv", sep=",", low_memory=False)

### Dataset Information

In [3]:
df.sample(10)

Unnamed: 0,pkts,bytes,dur,spkts,dpkts,sbytes,dbytes,rate,srate,drate,category
334280,6,360,20.584076,6,0,360,0,0.242906,0.242906,0.0,DoS-UDP
1145262,1,154,0.0,1,0,154,0,0.0,0.0,0.0,DDoS-TCP
606496,16,960,32.574299,16,0,960,0,0.460486,0.460486,0.0,DoS-UDP
2287952,16,960,14.641023,16,0,960,0,1.024519,1.024519,0.0,DDoS-UDP
395520,8,480,25.002214,8,0,480,0,0.279975,0.279975,0.0,DoS-UDP
271751,9,540,28.541332,9,0,540,0,0.280295,0.280295,0.0,DoS-UDP
1938026,4,616,44.408943,4,0,616,0,0.067554,0.067554,0.0,DDoS-TCP
3509177,4,616,58.963268,4,0,616,0,0.050879,0.050879,0.0,DoS-TCP
644642,8,480,33.113377,8,0,480,0,0.211395,0.211395,0.0,DoS-UDP
288749,9,540,28.537472,9,0,540,0,0.280333,0.280333,0.0,DoS-UDP


In [4]:
df.shape

(3577361, 11)

In [5]:
X = df.iloc[:,:-1]
y = df.iloc[:, -1]
print(Counter(y))

Counter({'DoS-UDP': 1032975, 'DDoS-TCP': 977380, 'DDoS-UDP': 948255, 'DoS-TCP': 615800, 'DoS-HTTP': 1485, 'DDoS-HTTP': 989, 'Normal': 477})


### RandomOverSampler 

In [6]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
print(Counter(y_over))

Counter({'DoS-UDP': 1032975, 'Normal': 1032975, 'DDoS-TCP': 977380, 'DDoS-UDP': 948255, 'DoS-TCP': 615800, 'DoS-HTTP': 1485, 'DDoS-HTTP': 989})


### Spliting 

In [7]:
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over, test_size=0.3, random_state=0)
print(Counter(y_train_over))
print(Counter(y_test_over))

Counter({'DoS-UDP': 723670, 'Normal': 722441, 'DDoS-TCP': 684217, 'DDoS-UDP': 663743, 'DoS-TCP': 431100, 'DoS-HTTP': 1036, 'DDoS-HTTP': 694})
Counter({'Normal': 310534, 'DoS-UDP': 309305, 'DDoS-TCP': 293163, 'DDoS-UDP': 284512, 'DoS-TCP': 184700, 'DoS-HTTP': 449, 'DDoS-HTTP': 295})


In [8]:
X_train_over["class"] = y_train_over
X_test_over["class"] = y_test_over

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Information spliting result

In [9]:
X_train_over.head()

Unnamed: 0,pkts,bytes,dur,spkts,dpkts,sbytes,dbytes,rate,srate,drate,class
1524905,5,770,13.373815,5,0,770,0,0.299092,0.299092,0.0,DDoS-TCP
1366872,5,770,15.905256,5,0,770,0,0.251489,0.251489,0.0,DDoS-TCP
2187233,9,540,12.423796,9,0,540,0,0.643926,0.643926,0.0,DDoS-UDP
3922742,2,172,0.006982,1,1,86,86,143.225433,0.0,0.0,Normal
3220387,3,274,0.143805,2,1,214,60,13.907722,6.953861,0.0,DoS-TCP


In [10]:
X_train_over.groupby('class').agg({'class':'first', 'class':'count'}).rename_axis(None).sort_values('class', ascending=False)

Unnamed: 0,class
DoS-UDP,723670
Normal,722441
DDoS-TCP,684217
DDoS-UDP,663743
DoS-TCP,431100
DoS-HTTP,1036
DDoS-HTTP,694


In [11]:
X_test_over.head()

Unnamed: 0,pkts,bytes,dur,spkts,dpkts,sbytes,dbytes,rate,srate,drate,class
1679885,6,736,10.750988,5,1,676,60,0.465074,0.372059,0.0,DDoS-TCP
3021811,4,616,28.765491,4,0,616,0,0.104292,0.104292,0.0,DoS-TCP
2867947,14,840,14.276105,14,0,840,0,0.910613,0.910613,0.0,DDoS-UDP
2234374,6,360,10.869819,6,0,360,0,0.459989,0.459989,0.0,DDoS-UDP
919151,7,420,24.315582,7,0,420,0,0.246755,0.246755,0.0,DoS-UDP


In [12]:
X_test_over.groupby('class').agg({'class':'first', 'class':'count'}).rename_axis(None).sort_values('class', ascending=False)

Unnamed: 0,class
Normal,310534
DoS-UDP,309305
DDoS-TCP,293163
DDoS-UDP,284512
DoS-TCP,184700
DoS-HTTP,449
DDoS-HTTP,295


### Export Data to CSV 

In [13]:
X_train_over.to_csv("UNSW_2018_IoT_Botnet_V6_Train.csv", index=False)
X_test_over.to_csv("UNSW_2018_IoT_Botnet_V6_Test.csv", index=False)