# Application of ML-Based Algorithms to the TON_IoT Datasets


## Library


In [1]:
!pip install torch scikit-learn pandas numpy prettyprint



In [5]:
import pandas as pd
import numpy as np
from typing import Literal, Callable
from functools import lru_cache, wraps
import timeit
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from pprint import pprint
from gc import collect

## Loading the data set


In [6]:
# Paths of the training and testing CSV files (read with pd.read_csv in the next cell).
TRAIN_DATASET = 'UNSW_NB15_training-set.csv'
TEST_DATASET = 'UNSW_NB15_testing-set.csv'

# Presumably the fraction of data reserved for validation.
# NOTE(review): not referenced by any visible cell -- confirm it is still needed.
VALIDATION_FRAC = 0.35

In [7]:
# Read both CSV files into DataFrames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATASET, TEST_DATASET))

In [8]:
# Preview the first five rows (head() defaults to n=5).
df_train.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [9]:
def print_dataframe_shape(df: pd.DataFrame, name):
    """Print the ``(rows, columns)`` shape of ``df``, labelled with ``name``."""
    message = "The shape of {} is: {}".format(name, df.shape)
    print(message)

In [10]:
# Report the dimensions of the raw training and testing frames.
print_dataframe_shape(df=df_train, name='Training Set')
print_dataframe_shape(df=df_test, name='Testing Set')

The shape of Training Set is: (175341, 45)
The shape of Testing Set is: (82332, 45)


In [11]:
# Type alias listing the column names accepted by the `column` parameter of the
# helper functions below (standardize, min_max_scaling, str_encoder).
# Note: 'id' and 'attack_cat' are not part of this list, while 'label' is.
Features = Literal['dur',
 'proto',
 'service',
 'state',
 'spkts',
 'dpkts',
 'sbytes',
 'dbytes',
 'rate',
 'sttl',
 'dttl',
 'sload',
 'dload',
 'sloss',
 'dloss',
 'sinpkt',
 'dinpkt',
 'sjit',
 'djit',
 'swin',
 'stcpb',
 'dtcpb',
 'dwin',
 'tcprtt',
 'synack',
 'ackdat',
 'smean',
 'dmean',
 'trans_depth',
 'response_body_len',
 'ct_srv_src',
 'ct_state_ttl',
 'ct_dst_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_flw_http_mthd',
 'ct_src_ltm',
 'ct_srv_dst',
 'is_sm_ips_ports',
 'label']

### Understanding the data


In [12]:
# Every column name of the training frame, and the name of the target column.
features_list = list(df_train.columns)
y_label = 'label'

In [13]:
def separate(df: pd.DataFrame, target=None):
    """Split a frame into its feature columns and its target column.

    Args:
        df: Frame that contains the target column.
        target: Name of the target column. When omitted, the module-level
            ``y_label`` is used.

    Returns:
        A ``(X, y)`` tuple: a copy of ``df`` without the target column, and
        the target column as a Series. ``df`` itself is not modified.

    Raises:
        KeyError: If the target column is not present in ``df``.
    """
    if target is None:
        target = y_label
    return df.drop(columns=[target]), df[target]

## Preprocessing the Data


### Cleaning the Data


#### Helper Function

In [30]:
def standardize(df, column: "Features"):
    """Return the z-score standardized values of ``df[column]``.

    The result carries ``df``'s own index, so ``df[column] = standardize(df, column)``
    aligns row by row even when the frame does not have a default RangeIndex
    (for example after filtering rows).

    Args:
        df: Frame holding the column.
        column: Name of the numeric column to standardize.

    Returns:
        A Series named ``column`` and indexed like ``df``: the values minus
        their mean, divided by their population standard deviation
        (``np.std``). A constant column (standard deviation 0) yields all
        zeros instead of NaN.
    """
    values = df[column].to_numpy(dtype=float)
    centered = values - np.mean(values)
    std = np.std(values)
    if std == 0:
        # Constant column: every centered value is already 0; dividing would give 0/0 = NaN.
        return pd.Series(centered, index=df.index, name=column)
    return pd.Series(centered / std, index=df.index, name=column)


def min_max_scaling(df, column: "Features"):
    """Return ``df[column]`` rescaled linearly to the range [0, 1].

    The result carries ``df``'s own index, so ``df[column] = min_max_scaling(df, column)``
    aligns row by row even when the frame does not have a default RangeIndex.

    Args:
        df: Frame holding the column.
        column: Name of the numeric column to rescale.

    Returns:
        A Series named ``column`` and indexed like ``df`` with values
        ``(x - min) / (max - min)``. A constant column (max == min) yields
        all zeros instead of NaN.
    """
    values = df[column].to_numpy(dtype=float)
    shifted = values - np.min(values)
    value_range = np.max(values) - np.min(values)
    if value_range == 0:
        # Constant column: every shifted value is already 0; dividing would give 0/0 = NaN.
        return pd.Series(shifted, index=df.index, name=column)
    return pd.Series(shifted / value_range, index=df.index, name=column)

def state_to_mask(state_vector: np.ndarray):
    """Map every distinct value of ``state_vector`` to its position in the sorted unique values."""
    return {value: position for position, value in enumerate(np.unique(state_vector))}

def one_hot_encoding(state_mask: dict):
    """Build an encoder that turns a category value into a one-hot row vector.

    Args:
        state_mask: Mapping from category value to its column position, as
            produced by ``state_to_mask``.

    Returns:
        A function ``wrapper(mask)`` that returns a float array of shape
        ``(1, len(state_mask))`` containing a single 1 at the position of
        ``mask``. It raises ``KeyError`` for a value missing from ``state_mask``.
    """
    def wrapper(mask):
        # Look the position up under its own name rather than rebinding `mask`.
        position = state_mask[mask]
        vector = np.zeros((1, len(state_mask)))
        vector[0][position] = 1
        return vector
    return wrapper

def one_hot_vector_distance(v1: np.ndarray, v2: np.ndarray):
    """Return the 0/1 mismatch distance between two one-hot vectors.

    Args:
        v1: First one-hot vector.
        v2: Second one-hot vector; must have the same shape as ``v1``.

    Returns:
        0 when the vectors are element-wise equal, otherwise 1.

    Raises:
        ValueError: If the two vectors have different shapes.
    """
    if v1.shape != v2.shape:
        # A bare `raise` with no active exception only yields an opaque RuntimeError.
        raise ValueError(
            f"one-hot vectors must have the same shape, got {v1.shape} and {v2.shape}"
        )
    return 0 if np.array_equal(v1, v2) else 1

def str_encoder(df:pd.DataFrame,column:Features):
    """Replace ``df[column]`` in place with integer codes from a freshly fitted ``LabelEncoder``.

    Returns None; the frame passed in is modified.
    """
    encoded = LabelEncoder().fit_transform(df[column])
    df[column] = encoded

In [26]:

def remove_uncessaryFeature(df: pd.DataFrame, features: list = None):
    """Return a copy of ``df`` without the listed columns.

    Columns that are not present are skipped individually. Catching
    ``KeyError`` around the whole drop instead meant one missing name
    prevented every column from being dropped, and handed the caller's own
    frame back.

    Args:
        df: Source frame; it is never modified.
        features: Column names to drop. ``None`` (the default) drops nothing.

    Returns:
        A new DataFrame holding the remaining columns.
    """
    to_drop = [] if features is None else features
    return df.drop(columns=to_drop, errors='ignore')

#### Cleaning Function ...

In [71]:
# Numeric columns rescaled by preprocess_final / preprocess_partial.
# NOTE(review): 'ackdat' is not listed although 'tcprtt' and 'synack' are -- confirm this is intentional.
features_to_normalize=['dur','spkts','stcpb','dtcpb','dpkts','dbytes','sbytes',
                       'rate',
    'sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','tcprtt','synack','smean','dmean','response_body_len',]
# Columns that preprocess_final replaces with one-hot vectors.
features_to_ohe=['proto','service','state','is_ftp_login','ct_ftp_cmd','ct_flw_http_mthd','ct_state_ttl']
# Columns dropped (together with 'label') by preprocess_partial.
initial_features_to_remove = ['id','attack_cat']

def preprocess_final(df:pd.DataFrame, normalize: Literal['min_max',"standardize"]='standardize',features_to_remove:list=None):
    """Build the model-ready frame: drop columns, one-hot encode, then rescale.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: Raw frame.
        normalize: ``'min_max'`` selects ``min_max_scaling``; any other value
            selects ``standardize``.
        features_to_remove: Extra columns to drop (``'attack_cat'`` is always
            requested for removal). These columns are also skipped by the
            encoding and scaling steps. ``None`` means no extra columns.

    Returns:
        A new DataFrame in which every remaining column of ``features_to_ohe``
        holds one-hot row vectors and every remaining column of
        ``features_to_normalize`` is rescaled.

    Note:
        The category-to-position maps and the scaling statistics are computed
        from ``df`` itself, so two frames processed by separate calls are
        encoded and scaled independently of each other.
    """
    removed = [] if features_to_remove is None else list(features_to_remove)
    excluded = set(removed)

    cleaned = remove_uncessaryFeature(df, ['attack_cat', *removed])
    if cleaned is df:
        # The helper may hand back the caller's frame; copy so the caller's data stays untouched.
        cleaned = df.copy()

    # Use a separate name instead of rebinding the `normalize` parameter to a function.
    scaler = min_max_scaling if normalize == 'min_max' else standardize

    # dict.fromkeys de-duplicates while keeping list order, so the iteration
    # order is deterministic (a set's order depends on string hashing).
    for feature in dict.fromkeys(features_to_ohe):
        if feature in excluded:
            continue
        encoder = one_hot_encoding(state_to_mask(cleaned[feature]))
        cleaned[feature] = cleaned[feature].apply(encoder)

    for feature in dict.fromkeys(features_to_normalize):
        if feature in excluded:
            continue
        cleaned[feature] = scaler(cleaned, feature)

    return cleaned
    

In [31]:

# Text-valued columns that preprocess_partial label-encodes.
text_featuresType = ['proto','service','state']
def preprocess_partial(df:pd.DataFrame):
    """Build a fully numeric frame for feature analysis.

    Drops ``'label'`` and the columns in ``initial_features_to_remove``,
    label-encodes the columns in ``text_featuresType`` and standardizes the
    columns in ``features_to_normalize``. The input frame is never modified.

    Args:
        df: Raw frame.

    Returns:
        A new, preprocessed DataFrame.
    """
    cleaned = remove_uncessaryFeature(df, ['label', *initial_features_to_remove])
    if cleaned is df:
        # The helper may hand back the caller's frame; str_encoder writes in
        # place, so copy first to keep the caller's data untouched.
        cleaned = df.copy()

    for feature in text_featuresType:
        str_encoder(cleaned, feature)

    for feature in features_to_normalize:
        cleaned[feature] = standardize(cleaned, feature)

    return cleaned

### Feature Selection


##### Looking for the features that have the highest impact


In [32]:
# Label-encoded, standardized copy of the training frame (without 'label', 'id'
# and 'attack_cat'), used by the feature-analysis cells below.
df_feature_analysis= preprocess_partial(df_train)

##### Correlation Matrix


In [84]:

def find_highest_correlation(corr_matrix:pd.DataFrame, target_feature:str):
    """Find the feature with the largest correlation value against ``target_feature``.

    The target's own entry is excluded from the search.

    Returns:
        A ``(feature_name, correlation_value)`` tuple.
    """
    candidates = corr_matrix[target_feature].drop(target_feature)
    best_feature = candidates.idxmax()
    return best_feature, candidates.loc[best_feature]


In [34]:
# Absolute value of the pairwise correlation between all analysed columns.
corr_matrix = df_feature_analysis.corr().abs()
corr_matrix


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
dur,1.0,0.124502,0.008234,0.103443,0.254559,0.181182,0.199731,0.144134,0.120966,0.012196,...,0.0863,0.094091,0.093923,0.10176,0.020641,0.020641,0.024743,0.080871,0.115336,0.03537
proto,0.124502,1.0,0.170032,0.172441,0.013469,0.026439,0.00592,0.015812,0.013924,0.049944,...,0.191101,0.174965,0.165796,0.175708,0.018003,0.018003,0.028809,0.168121,0.198594,0.585941
service,0.008234,0.170032,1.0,0.144978,0.114403,0.077338,0.105188,0.035492,0.141709,0.295302,...,0.047685,0.038347,0.051106,0.006774,0.071051,0.071051,0.266206,0.028599,0.048011,0.088847
state,0.103443,0.172441,0.144978,1.0,0.078701,0.098268,0.0493,0.059759,0.432307,0.584697,...,0.328748,0.372309,0.408662,0.429906,0.05197,0.05197,0.078856,0.323019,0.387446,0.094198
spkts,0.254559,0.013469,0.114403,0.078701,1.0,0.390067,0.963791,0.206609,0.076358,0.102723,...,0.060194,0.068373,0.072484,0.077553,0.009951,0.009951,0.006084,0.061584,0.069598,0.01777
dpkts,0.181182,0.026439,0.077338,0.098268,0.390067,1.0,0.188476,0.971907,0.098202,0.19258,...,0.071909,0.086695,0.094267,0.094085,0.013491,0.013491,0.047974,0.07519,0.078342,0.021765
sbytes,0.199731,0.00592,0.105188,0.0493,0.963791,0.188476,1.0,0.009926,0.028468,0.02086,...,0.026661,0.02649,0.027281,0.032061,0.004515,0.004515,0.002185,0.027479,0.034553,0.006367
dbytes,0.144134,0.015812,0.035492,0.059759,0.206609,0.971907,0.009926,1.0,0.059475,0.135515,...,0.042633,0.052135,0.056901,0.054633,0.01046,0.01046,0.051403,0.045594,0.044531,0.013147
rate,0.120966,0.013924,0.141709,0.432307,0.076358,0.098202,0.028468,0.059475,1.0,0.407572,...,0.317229,0.353589,0.390721,0.383094,0.06814,0.06814,0.109297,0.310876,0.362883,0.072948
sttl,0.012196,0.049944,0.295302,0.584697,0.102723,0.19258,0.02086,0.135515,0.407572,1.0,...,0.271383,0.344104,0.37993,0.404346,0.124157,0.124157,0.112833,0.273252,0.340678,0.220429


In [35]:
# Minimum absolute correlation for a pair to be reported.
corr_tresh = 0.70
corr_feature = {}

# Step 1: record, for every column, its most correlated partner and the score.
for feature in df_feature_analysis.columns:
    f, score = find_highest_correlation(corr_matrix, feature)
    corr_feature[feature] = {'feature': f, 'score': score}

# Step 2: when two columns name each other as best partner, keep only one
# entry of the pair (the partner's entry is deleted).
for feature in df_feature_analysis.columns:
    if feature not in corr_feature:
        continue
    t = corr_feature[feature]['feature']
    if t in corr_feature and corr_feature[t]['feature'] == feature:
        del corr_feature[t]

# Step 3: order by descending score and keep the pairs at or above the threshold.
corr_feature = {
    name: info
    for name, info in sorted(corr_feature.items(), key=lambda item: item[1]['score'], reverse=True)
    if info['score'] >= corr_tresh
}
corr_feature


{'is_ftp_login': {'feature': 'ct_ftp_cmd', 'score': np.float64(1.0)},
 'dbytes': {'feature': 'dloss', 'score': np.float64(0.996503594762374)},
 'sbytes': {'feature': 'sloss', 'score': np.float64(0.9961094729147967)},
 'swin': {'feature': 'dwin', 'score': np.float64(0.9901399299415929)},
 'ct_srv_src': {'feature': 'ct_srv_dst',
  'score': np.float64(0.9803230099911133)},
 'dpkts': {'feature': 'dloss', 'score': np.float64(0.9786363765710283)},
 'ct_dst_src_ltm': {'feature': 'ct_srv_dst',
  'score': np.float64(0.9723704538697349)},
 'spkts': {'feature': 'sloss', 'score': np.float64(0.9710686917738162)},
 'ct_dst_ltm': {'feature': 'ct_src_dport_ltm',
  'score': np.float64(0.9620518416459877)},
 'tcprtt': {'feature': 'synack', 'score': np.float64(0.9494676611067793)},
 'ackdat': {'feature': 'tcprtt', 'score': np.float64(0.941760373812716)},
 'sinpkt': {'feature': 'is_sm_ips_ports',
  'score': np.float64(0.941318900735516)},
 'ct_dst_sport_ltm': {'feature': 'ct_src_dport_ltm',
  'score': np.

*For now, we know that **is_ftp_login** and **ct_ftp_cmd** are perfectly correlated (score of 1.0), so we can remove one of them.*

##### PCA


In [66]:
# Number of principal components kept by the PCA cells below.
top_n_components =  30

In [None]:
# Manual PCA: eigen-decomposition of the feature covariance matrix.
X_centered = df_feature_analysis.to_numpy(dtype=float)
# A covariance matrix requires mean-centred columns; without this step the
# product below is only the raw second-moment matrix.
X_centered = X_centered - X_centered.mean(axis=0)
feature_cov = np.dot(X_centered.T, X_centered) / len(X_centered)
# The covariance matrix is symmetric, so eigh is the appropriate solver: it
# returns real eigenvalues and eigenvectors.
eigenvalues, eigenvectors = np.linalg.eigh(feature_cov)
sorted_idx = np.argsort(eigenvalues)[::-1]  # indices by descending eigenvalue
sorted_eigenvalues = eigenvalues[sorted_idx]
sorted_eigenvectors = eigenvectors[:, sorted_idx]
# Take the leading components from the SORTED eigenvectors; slicing the
# unsorted matrix picks arbitrary components.
selected_eigenvectors = sorted_eigenvectors[:, :top_n_components]
# Project the centred data onto the selected principal axes.
np.dot(X_centered, selected_eigenvectors)

In [69]:
# Reference PCA with scikit-learn: fit on the analysed features, then project them.
pca = PCA(n_components=top_n_components).fit(df_feature_analysis.values)
pca_data = pca.transform(df_feature_analysis.values)
components, explained_variance = pca.components_, pca.explained_variance_ratio_
# One column per principal component, named PC1, PC2, ...
pd.DataFrame(pca_data, columns=['PC%d' % number for number in range(1, pca_data.shape[1] + 1)])


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
0,6.279233e+08,1.118485e+09,-5.239905e+07,-8.792292e+05,-11092.573454,-14091.005717,-47151.608625,-175.317457,-8719.362926,-1159.935733,...,-1.914249,4.651055,0.230970,-1.100275,0.761104,-0.553374,-1.328533,0.053145,-0.229973,0.335812
1,1.808776e+09,1.175009e+09,-1.295847e+07,-7.506645e+05,-15277.202032,16218.709316,-12892.711204,-7852.297416,-11229.658730,-304.460357,...,17.694066,-9.081430,3.394712,2.056298,-16.775822,-1.245731,4.861828,3.018090,-22.130456,-17.719447
2,2.221904e+09,6.007516e+08,7.778938e+05,-1.329194e+06,-19747.186481,-12467.038001,-3837.797355,-2051.070085,4927.523564,-83.344408,...,8.446102,-6.591510,-2.586229,0.296202,-6.159931,-1.731490,0.292242,-0.104872,-2.300718,0.319756
3,1.554388e+08,-4.181703e+07,-6.830427e+07,-7.518618e+05,-9826.343411,-10083.858311,-60102.889580,362.371053,-6437.857930,-1380.362648,...,-5.617836,3.478554,-0.791098,0.241446,-2.535214,-1.871783,-1.089748,0.430108,1.797050,-0.445397
4,1.751914e+09,-3.230289e+08,-1.500262e+07,-1.251493e+06,-18661.508973,-21461.852012,-17040.565776,1026.603179,-8650.208616,-405.285001,...,6.817104,2.196046,22.046892,0.290425,-32.580348,-3.027354,7.299865,-1.662182,-0.034607,-1.741922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,-1.368939e+09,-7.125854e+05,-6.852308e+07,-2.562988e+05,-3173.648755,-3006.892819,-12958.967169,89.186082,-1973.031015,-1823.403681,...,1.499203,2.639149,-7.152507,-0.943506,6.984245,0.070060,-0.998672,-2.397215,-0.921610,-0.472484
175337,3.559892e+09,-4.363207e+07,4.541220e+07,-1.809637e+06,-27647.976521,-33786.277720,31810.244527,1246.644947,-12356.936390,809.798835,...,-1.705813,-1.038883,0.441996,-0.916614,0.810100,-0.620791,-1.167645,-0.090954,0.648817,-0.097441
175338,-1.368939e+09,-7.125854e+05,-6.852308e+07,-2.562988e+05,-3173.648674,-3006.892718,-12958.968970,89.186142,-1973.030945,-1823.391615,...,1.176056,-0.390403,6.263283,-1.601598,-3.891259,1.430743,0.512270,-0.685330,0.910807,-0.504482
175339,-1.368939e+09,-7.125854e+05,-6.852308e+07,-2.562988e+05,-3173.648786,-3006.892855,-12958.966519,89.186051,-1973.031038,-1823.408243,...,2.132748,3.412304,-9.348007,-0.678009,7.471561,-0.659412,-0.100748,-4.518537,-1.384572,-0.655354


##### Fisher Score

### Final Preprocessing Step

*Based on the techniques above, we decided to remove the following features:*

In [49]:
# Features discarded after the analysis above.
features_to_remove = ['ct_ftp_cmd']
# Filter in list order rather than through a set difference: a set's order
# depends on string hashing, so the column order would change between kernels.
features_to_ohe = [feature for feature in features_to_ohe if feature not in features_to_remove]
features_to_ohe

['ct_flw_http_mthd',
 'ct_state_ttl',
 'state',
 'is_ftp_login',
 'service',
 'proto']

In [95]:

# Preprocess each frame, then split it into features (X) and the 'label' target (Y).
# preprocess_final is called once per frame, so the scaling statistics and the
# one-hot category maps are computed separately for the training and testing sets.
# NOTE(review): confirm the independent encodings are intended -- a category
# present in only one set gives one-hot vectors of different width.
X_train, Y_train = separate(preprocess_final(df_train,features_to_remove=features_to_remove))
X_test, Y_test = separate(preprocess_final(df_test,features_to_remove=features_to_remove))

In [96]:
# Report the dimensions of the preprocessed feature frames.
print_dataframe_shape(df=X_train, name='Training Set')
print_dataframe_shape(df=X_test, name='Testing Set')

The shape of Training Set is: (175341, 42)
The shape of Testing Set is: (82332, 42)


## Training


In [39]:
class Classifier:
    """Shared skeleton for the classifiers in this notebook.

    Every method and property is a placeholder that returns None; subclasses
    are expected to override them.
    """

    def __init__(self):
        # All state starts unset.
        self.X = self.Y = self.Y_Pred = None

    def fit(self):
        """Placeholder; does nothing and returns None."""

    def train(self):
        """Placeholder; does nothing and returns None."""

    def predict(self):
        """Placeholder; does nothing and returns None."""

    def _cross_validation(self):
        """Placeholder; does nothing and returns None."""
        return None

    @property
    def accuracy(self):
        """Placeholder property; evaluates to None."""

    @property
    def confusion_matrix(self):
        """Placeholder property; evaluates to None."""

    @property
    def score(self):
        """Placeholder property; evaluates to None."""

### Decision Tree


##### TreeNode class


In [14]:
class TreeNode:
    """Placeholder for a decision-tree node; no behaviour is implemented yet."""


class LeafNode:
    """Placeholder for a decision-tree leaf; no behaviour is implemented yet."""

##### DecisionTree class


In [15]:
class DecisionTreeClassifier(Classifier):
    """Decision-tree classifier skeleton; both overrides are still placeholders."""

    def train(self):
        """Placeholder; does nothing and returns None."""

    def predict(self):
        """Placeholder; does nothing and returns None."""

### K-Nearest Neighbors


In [41]:
class KNNClassifier(Classifier):
    """K-nearest-neighbours classifier over mixed numeric / one-hot features.

    Work in progress: prediction, voting, cross-validation and the metric
    properties are still placeholders.
    """

    # NOTE(review): class-level dict shared by all instances; no visible method uses it.
    CacheIds = {}

    def __init__(self, max_k:int) -> None:
        """Create a classifier that will try neighbour counts 1..``max_k``."""
        super().__init__()
        self.max_k = max_k

    def fit(self,X_train:pd.DataFrame,Y_train:pd.DataFrame):
        """Store the training data and split it into numeric and one-hot parts.

        Args:
            X_train: Preprocessed feature frame.
            Y_train: Targets aligned with ``X_train``.
        """
        # The previous code read ``self.X.columns`` while ``self.X`` was still
        # None, so the first call always raised AttributeError. Only the row
        # count of ``self.X`` is used later (in ``train``).
        self.X = X_train
        self.Y = Y_train
        self.x_num, self.x_ohe = self._split(X_train)

    def _split(self,df:pd.DataFrame):
        """Return ``(other_columns, one_hot_columns)`` of ``df``."""
        # ``columns=`` is required: a positional list makes drop() look for ROW labels.
        return df.drop(columns=features_to_ohe), df[features_to_ohe]

    def train(self,k_folds=5)->int:
        """Select the neighbour count with the best cross-validated accuracy.

        Args:
            k_folds: Number of folds; determines ``val_sampling_size``.

        Returns:
            The best ``k`` in ``1..max_k`` (also stored as ``self.k_star``),
            or -1 when ``max_k`` is smaller than 1.
        """
        N = len(self.X)
        self.val_sampling_size = int((1/k_folds)*N)
        self.k_star = -1
        best_acc = -float('inf')
        # k = 0 neighbours is meaningless, so search 1..max_k inclusive.
        for k in range(1, self.max_k + 1):
            acc = self._cross_validation(k)
            if acc > best_acc:
                # Remember both the winning accuracy and the k that produced it.
                best_acc = acc
                self.k_star = k
        return self.k_star

    def _cross_validation(self, k:int) -> float:
        """Return the cross-validated accuracy for ``k`` neighbours (not implemented yet).

        Declared here because ``train`` calls it with ``k``, which the
        inherited placeholder does not accept.
        """
        raise NotImplementedError("KNNClassifier._cross_validation is not implemented yet")

    def predict(self,X_Test,k_nearest):
        """Placeholder; does nothing and returns None."""
        ...

    def _predict(self):...

    def _get_max_knn(self):
        """Unfinished: walks every pair of training rows, skipping pairs with equal ``id``."""
        for train_row_num,train_row_ohe in zip(self.x_num.itertuples(),self.x_ohe.itertuples()):
            for test_row_num,test_row_ohe in zip(self.x_num.itertuples(),self.x_ohe.itertuples()):
                if train_row_num.id == test_row_num.id:
                    continue
                # TODO: compute and record the distance between the two rows.

    def _compute_distance(self,v1:tuple[np.ndarray,np.ndarray],v2:tuple[np.ndarray,np.ndarray]):
        """Return the combined distance between two samples.

        Args:
            v1: ``(numeric_values, one_hot_vectors)`` of the first sample.
            v2: ``(numeric_values, one_hot_vectors)`` of the second sample.

        Returns:
            The squared Euclidean distance of the numeric parts plus the
            number of one-hot features whose vectors differ.
        """
        v1_num, v1_ohe = v1
        v2_num, v2_ohe = v2
        dist_num = float(np.sum((v1_num - v2_num) ** 2))
        # Compare the one-hot vectors pairwise; range() over an array raised TypeError.
        dist_ohe = sum(
            one_hot_vector_distance(first, second)
            for first, second in zip(v1_ohe, v2_ohe)
        )
        return dist_num + dist_ohe

    def _vote(self,k_nearest):
        """Placeholder; does nothing and returns None."""
        ...

    @property
    def accuracy(self):
        """Placeholder property; evaluates to None."""
        ...

    @property
    def confusion_matrix(self):
        """Placeholder property; evaluates to None."""
        ...

    @property
    def score(self):
        """Placeholder property; evaluates to None."""
        ...



In [101]:
# Vectorised pairwise Euclidean distances between the rows of two matrices.
# (numpy is already imported as np in the imports cell at the top.)

# Example matrices
A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9],[11,12,13]])  # Shape (n=4, d=3)
B = np.array([[1, 0, 4], [0, 1, 0], [3, 0, 1]])  # Shape (m=3, d=3)

# Step 1: Compute squared norms of each row vector in A and B
A_sq_norms = np.sum(A ** 2, axis=1).reshape(-1, 1)  # Shape (n, 1)
B_sq_norms = np.sum(B ** 2, axis=1).reshape(1, -1)  # Shape (1, m)

# Step 2: Compute the dot product of A and B^T
dot_product = np.dot(A, B.T)  # Shape (n, m)

# Step 3: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
# With floating-point data, round-off can push the value slightly below zero,
# which would make sqrt return NaN, so clamp at 0 first.
squared_distances = np.maximum(A_sq_norms + B_sq_norms - 2 * dot_product, 0)
euclidean_distances = np.sqrt(squared_distances)  # Shape (n, m)

print(euclidean_distances)


[[ 2.23606798  3.31662479  3.46410162]
 [ 6.164414    8.24621125  7.14142843]
 [11.18033989 13.37908816 12.        ]
 [18.02775638 20.27313493 18.76166304]]


## Testing


## Conclusion
