In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/ so that
# files stored under MyDrive can be accessed by ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import numpy as np
import pandas as pd

# Read the darknet flow table from the mounted Drive folder and show its
# (rows, columns) shape as the cell output.
dataset_path = '/content/gdrive/MyDrive/CIC-Darknet/darknet.csv'
traffic_df = pd.read_csv(dataset_path)
traffic_df.shape

(141530, 85)

### Drop Columns

In [3]:
# Columns removed from the frame; 'Label.1' is kept and renamed to 'target'.
drop_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Label']

# Rename first, then drop, producing a new frame instead of mutating in place.
traffic_df = (
    traffic_df
    .rename(columns={'Label.1': 'target'})
    .drop(columns=drop_columns)
)

In [4]:
# Preview the first rows after the column drop and the 'Label.1' -> 'target' rename.
traffic_df.head()

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,target
0,57158,443,6,229,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,benign_Audio-Streaming
1,57159,443,6,407,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,benign_Audio-Streaming
2,57160,443,6,431,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,benign_Audio-Streaming
3,49134,443,6,359,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,benign_Audio-Streaming
4,34697,19305,6,10778451,591,400,64530,6659,131,0,...,20,0,0,0,0,1437760000000000.0,3117718.131,1437760000000000.0,1437760000000000.0,benign_Audio-Streaming


### Reduce Memory Usage

In [5]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is inspected; its min and max decide the smallest integer or float type
    that can hold every value. Columns of any other dtype (object, int8,
    unsigned integers, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned) and
        also returned.
    verbose : bool, default True
        When true, print the memory usage after optimization and the
        percentage saved. When false, nothing is printed.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass ``False`` to use float32 as
        the narrowest float type when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range does not fit a narrower float
            # (this includes +/-inf and all-NaN columns, whose comparisons fail).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that occupies no memory at all.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [6]:
# Downcast the numeric columns of the frame; the function reassigns columns on
# the frame it receives and returns that same frame.
traffic_df = reduce_mem_usage(traffic_df)

Memory usage after optimization is: 31.85 MB
Decreased by 63.1%


### Drop Rows with Infinity or Null Values

In [7]:
# Turn +/-inf into NaN so that a single dropna removes every row holding an
# infinite or missing value.
traffic_df = (
    traffic_df
    .replace(to_replace=[np.inf, -np.inf], value=np.nan)
    .dropna(axis='index')
)

In [8]:
# Sanity check after the row drop: print the null count of every column,
# one number per line, in column order.
null_counts = traffic_df.isnull().sum()

for null_count in null_counts.tolist():
    print(null_count)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


### Label Encoding

In [9]:
# Row count per class label in 'target', shown before the labels are encoded.
traffic_df['target'].value_counts()

benign_P2P                48300
benign_Browsing           32545
dark_Audio-Streaming      13284
benign_File-Transfer       8563
benign_Video-Streaming     8402
benign_Chat                6932
benign_Email               5561
benign_Audio-Streaming     4766
dark_Chat                  4541
dark_File-Transfer         2610
benign_VOIP                2101
dark_VOIP                  1465
dark_Video-Streaming       1346
dark_Email                  582
dark_Browsing               263
dark_P2P                    220
Name: target, dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary from 'target', then overwrite the column with the
# encoder's integer codes. `encoder` is kept so the codes can be mapped back.
encoder = LabelEncoder()
encoder.fit(traffic_df['target'])

traffic_df['target'] = encoder.transform(traffic_df['target'])

# Show the class distribution again, now keyed by encoded value.
traffic_df['target'].value_counts()

5     48300
1     32545
8     13284
4      8563
7      8402
2      6932
3      5561
0      4766
10     4541
12     2610
6      2101
14     1465
15     1346
11      582
9       263
13      220
Name: target, dtype: int64

In [11]:
# Preview the frame after 'target' has been replaced by its encoded values.
traffic_df.head()

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,target
0,57158,443,6,229,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
1,57159,443,6,407,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
2,57160,443,6,431,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
3,49134,443,6,359,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
4,34697,19305,6,10778451,591,400,64530,6659,131,0,...,20,0,0,0,0,1437760000000000.0,3117718.25,1437760000000000.0,1437760000000000.0,0


### Normalization (Log Transform)

In [12]:
# Gather the names of all floating-point columns, grouped by dtype in the
# order float16, float32, float64 (column order is preserved within a group).
float_dtypes = ('float16', 'float32', 'float64')
columns = [
    name
    for dtype_name in float_dtypes
    for name in traffic_df.select_dtypes([dtype_name]).columns
]
print(columns)

['Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Packet Length Mean', 'Packet Length Std', 'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg', 'Flow IAT Mean', 'Flow IAT Std', 'Fwd IAT Mean', 'Fwd IAT Std', 'Bwd IAT Mean', 'Bwd IAT Std', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Variance', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Flow Bytes/s', 'Flow Packets/s']


In [13]:
# Apply log(1 + x) to every selected column; 'target' is excluded so the
# label is never transformed even if it appears in the list.
log_columns = [name for name in columns if name != 'target']

for name in log_columns:
    traffic_df[name] = np.log1p(traffic_df[name])

traffic_df.head()

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,target
0,57158,443,6,229,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
1,57159,443,6,407,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
2,57160,443,6,431,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
3,49134,443,6,359,1,1,0,0,0,0,...,20,0,0,0,0,0.0,0.0,0.0,0.0,0
4,34697,19305,6,10778451,591,400,64530,6659,131,0,...,20,0,0,0,0,34.901863,14.952612,34.901863,34.901863,0


In [14]:
# Write the transformed frame next to the source CSV, without the index column.
output_path = '/content/gdrive/MyDrive/CIC-Darknet/darknet_log.csv'
traffic_df.to_csv(output_path, index=False)