In [None]:
import pandas as pd
import numpy as np
import datetime
import h5py
from scipy.stats import mode
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn import model_selection, feature_selection, utils, ensemble, linear_model, metrics

# Sliding-window parameters, both expressed in seconds.
window_stride = 60
window_width = 120
print(pd.__version__)

# Load the flow records that the rest of the script aggregates per window.
data = pd.read_csv(filepath_or_buffer="CTU-13-Dataset/1/capture20110810.binetflow")

def normalize_column(dt, column):
    """Standardize one or more columns of ``dt`` in place (zero mean, unit std).

    Args:
        dt: DataFrame that is modified in place.
        column: A single column label or a list of column labels.

    Returns:
        None. The mean and standard deviation are printed before the
        columns are overwritten with ``(value - mean) / std``.

    Columns whose standard deviation is exactly zero (constant columns)
    are scaled by 1 instead, so they become all zeros rather than NaN.
    """
    mean = dt[column].mean()
    std = dt[column].std()
    print(mean, std)

    # A constant column has std == 0; dividing by it would yield NaN (0/0),
    # so fall back to a scale of 1 for those columns.
    if isinstance(std, pd.Series):
        scale = std.replace(0, 1)
    else:
        scale = 1 if std == 0 else std

    dt[column] = (dt[column] - mean) / scale

# Convert flow start timestamps to float seconds (int64 nanoseconds * 1e-9).
data['StartTime'] = pd.to_datetime(data['StartTime']).astype(np.int64)*1e-9
datetime_start = data['StartTime'].min()

# Assign every flow the half-open range of window ids it belongs to.
# With t = StartTime - datetime_start, window i receives the flows with
# i*window_stride <= t < i*window_stride + window_width.
data['Window_lower'] = (data['StartTime']-datetime_start-window_width)/window_stride+1
# Assign the clipped column back explicitly: calling clip(..., inplace=True)
# on a column selection relies on chained in-place mutation, which is not
# guaranteed to update the parent frame.
data['Window_lower'] = data['Window_lower'].clip(lower=0)
data['Window_upper_excl'] = (data['StartTime']-datetime_start)/window_stride+1
data = data.astype({"Window_lower": int, "Window_upper_excl": int})
data.drop('StartTime', axis=1, inplace=True)

unique_lower = data['Window_lower'].max()
unique_upper = data['Window_upper_excl'].max()

# Integer-encode the labels using only their first 15 characters;
# `labels` keeps the distinct truncated strings in code order.
data['Label'], labels = pd.factorize(data['Label'].str.slice(0, 15))

unique_labels = data['Label'].unique()

# Collect the per-window frames in lists and concatenate once after the loop;
# growing a DataFrame with pd.concat on every iteration copies all previously
# accumulated rows each time.
window_features = []
window_counts = []
nb_windows = data['Window_upper_excl'].max()

for i in range(0, nb_windows):
    # Flows active in window i, grouped by source address.
    gb = data.loc[(data['Window_lower'] <= i) & (data['Window_upper_excl'] > i)].groupby('SrcAddr')
    window_features.append(gb.agg({'Sport':'nunique',
                                   'DstAddr':'nunique',
                                   'Dport':'nunique',
                                   'Dur':['sum', 'mean', 'std', 'max', 'median'],
                                   'TotBytes':['sum', 'mean', 'std', 'max', 'median'],
                                   'SrcBytes':['sum', 'mean', 'std', 'max', 'median'],
                                   'Label':lambda x: mode(x)[0]}).reset_index().assign(window_id=i))
    # Number of flows per source address in this window.
    window_counts.append(gb.size().to_frame(name='counts'))

Xi = pd.concat(window_features)
Xo = pd.concat(window_counts)

# Flatten the two-level column names produced by agg, e.g. ('Dur', 'sum') -> 'Dur_sum';
# single-level names such as 'SrcAddr' and 'window_id' end up with a trailing '_'.
Xi.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in Xi.columns.values]
Xi = Xi.drop(columns=['SrcAddr_'])

# Xo and Xi were built in the same window/group order, so they are joined
# side by side by position after resetting both indexes.
Xo = Xo.reset_index()
Xi = Xi.reset_index()
result = pd.concat([Xo.reset_index(drop=True), Xi.reset_index(drop=True)], axis=1)

result = result.drop(columns=['index'])
# Replace missing aggregates (NaN) with the sentinel value -1.
result.fillna(-1, inplace=True)

# Standardize every feature except the address, the label and the window id.
columns_to_normalize = list(result.columns.values)
columns_to_normalize.remove('SrcAddr')
columns_to_normalize.remove('Label_<lambda>')
columns_to_normalize.remove('window_id_')

normalize_column(result, columns_to_normalize)

# Persist the feature table (without addresses), the addresses, and the label names.
result.drop('SrcAddr', axis=1).to_hdf('input/1_data_window_botnet3.h5', key="data", mode="w")
np.save("ip_addr/1_data_window_botnet1_id.npy", result['SrcAddr'])
np.save("label/1_data_window_botnet1_labels.npy", labels)

def RU(df):
    """Return the entropy of the values in ``df`` normalized by log10 of its length.

    The value frequencies are turned into probabilities by dividing by the
    number of entries, their base-10 Shannon entropy is computed, and the
    result is divided by ``log10(number of entries)``. An input with exactly
    one entry returns 1.0.
    """
    n_entries = df.shape[0]
    if n_entries != 1:
        frequencies = df.value_counts() / n_entries
        weighted_logs = frequencies * np.log10(frequencies)
        return -weighted_logs.sum() / np.log10(n_entries)
    return 1.0

# Collect the per-window frames in a list and concatenate once after the loop;
# growing a DataFrame with pd.concat on every iteration copies all previously
# accumulated rows each time.
window_entropy = []
nb_windows = data['Window_upper_excl'].max()

for i in range(0, nb_windows):
    # Flows active in window i, grouped by source address.
    gc = data.loc[(data['Window_lower'] <= i) & (data['Window_upper_excl'] > i)].groupby('SrcAddr')
    # Apply RU to the source ports, destination addresses and destination ports of each group.
    window_entropy.append(gc.agg({'Sport':[RU],
                                  'DstAddr':[RU],
                                  'Dport':[RU]}).reset_index())

X = pd.concat(window_entropy)

# Flatten the two-level column names produced by agg, e.g. ('Sport', 'RU') -> 'Sport_RU';
# the group key becomes 'SrcAddr_'.
X.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in X.columns.values]
columns_to_normalize = list(X.columns.values)
columns_to_normalize.remove('SrcAddr_')

normalize_column(X, columns_to_normalize)

# Persist the standardized features without the source address column.
X.drop('SrcAddr_', axis=1).to_hdf('input/1_data_window3_botnet3.h5', key="data", mode="w")

2.0.3
