# Application of ML-based algorithm on The TON_IoT Datasets


## Library


In [19]:
#!pip install torch scikit-learn pandas numpy prettyprint cupy

In [20]:
import pandas as pd
import numpy as np
from typing import Literal, Callable
from functools import lru_cache, wraps
import timeit
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from pprint import pprint
from gc import collect
from time import sleep
import cupy as cp
from tqdm import tqdm


## Loading the data set


In [21]:
TRAIN_DATASET = 'UNSW_NB15_training-set.csv'
TEST_DATASET = 'UNSW_NB15_testing-set.csv'

VALIDATION_FRAC = 0.35

In [22]:
df_train = pd.read_csv(TRAIN_DATASET)
df_test = pd.read_csv(TEST_DATASET)

In [23]:
df_train.head(5)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [24]:
def print_dataframe_shape(df:pd.DataFrame,name):print(f"The shape of {name} is: {df.shape}")

In [25]:
print_dataframe_shape(df_train,'Training Set')
print_dataframe_shape(df_test,'Testing Set')

The shape of Training Set is: (175341, 45)
The shape of Testing Set is: (82332, 45)


In [26]:
Features = Literal['dur',
 'proto',
 'service',
 'state',
 'spkts',
 'dpkts',
 'sbytes',
 'dbytes',
 'rate',
 'sttl',
 'dttl',
 'sload',
 'dload',
 'sloss',
 'dloss',
 'sinpkt',
 'dinpkt',
 'sjit',
 'djit',
 'swin',
 'stcpb',
 'dtcpb',
 'dwin',
 'tcprtt',
 'synack',
 'ackdat',
 'smean',
 'dmean',
 'trans_depth',
 'response_body_len',
 'ct_srv_src',
 'ct_state_ttl',
 'ct_dst_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_flw_http_mthd',
 'ct_src_ltm',
 'ct_srv_dst',
 'is_sm_ips_ports',
 'label']

### Understanding the data


In [27]:
y_label = 'label'
features_list = df_train.columns.tolist()

In [28]:
def separate(df:pd.DataFrame,):
    features = features_list.copy()
    features.remove(y_label)
    return df.drop(y_label,axis=1),df[y_label]

## Preprocessing the Data


### Cleaning the Data


#### Helper Function

In [29]:
def standardize(df,column:Features):
    col_values = df[column].values

    mean = np.mean(col_values)
    std = np.std(col_values)
    col_values = col_values-mean
    col_values= col_values/std
    return pd.Series(col_values, name=column)


def min_max_scaling(df, column:Features):
    col_values = df[column].values
    min_value = np.min(col_values)
    max_value = np.max(col_values)
    scaled_values = (col_values - min_value) / (max_value - min_value)
    return pd.Series(scaled_values, name=column)

def state_to_mask(state_vector: np.ndarray):
    unique_val = np.unique(state_vector)
    size = len(unique_val)
    return { unique_val[mask]:mask for mask in range(size)}

def one_hot_encoding(state_mask:dict[int,str]):
    def wrapper(mask: str):
        v = np.zeros((1, len(state_mask)))
        mask = state_mask[mask]
        v[0][mask] = 1
        return v
    return wrapper

def one_hot_vector_distance(v1: np.ndarray, v2: np.ndarray):
    if v1.shape != v2.shape:
        raise
    if np.array_equal(v1, v2):
        return 0
    return 1

def str_encoder(df:pd.DataFrame,column:Features):
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])

In [30]:

def remove_uncessaryFeature(df: pd.DataFrame, features: list = []):
    try:
        return df.drop(features, axis=1)
    except KeyError :
        return df

#### Cleaning Function ...

In [31]:
features_to_normalize=['dur','spkts','stcpb','dtcpb','dpkts','dbytes','sbytes',
                       'rate',
    'sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','tcprtt','synack','smean','dmean','response_body_len',]
features_to_ohe=['proto','service','state','is_ftp_login','ct_ftp_cmd','ct_flw_http_mthd','ct_state_ttl']
initial_features_to_remove = ['id','attack_cat']

def preprocess_final(df:pd.DataFrame, normalize: Literal['min_max',"standardize"]='standardize',features_to_remove:list=[]):

    ftr = set(features_to_remove)

    df =remove_uncessaryFeature(df,[*initial_features_to_remove,*features_to_remove])

    if normalize =='min_max':
        normalize = min_max_scaling
    else:
        normalize = standardize
    for feature in set(features_to_ohe).difference(ftr):
        #ohe_func = one_hot_encoding(state_to_mask(df[feature]))
        str_encoder(df,feature)

    for feature in set(features_to_normalize).difference(ftr):
        df[feature] = normalize(df,feature)

    return df


In [32]:

text_featuresType = ['proto','service','state']
def preprocess_partial(df:pd.DataFrame):

    df =remove_uncessaryFeature(df,['label',*initial_features_to_remove])

    for feature in text_featuresType:
        str_encoder(df,feature)

    for feature in features_to_normalize:
        df[feature] = standardize(df,feature)

    return df

### Feature Selection


##### Looking for the features that has highest impact


In [33]:
df_feature_analysis= preprocess_partial(df_train)

##### Correlation Matrix


In [34]:

def find_highest_correlation(corr_matrix:pd.DataFrame, target_feature:str):
    target_corr = corr_matrix[target_feature].drop(target_feature)
    highest_corr_feature = target_corr.idxmax()
    highest_corr_value = target_corr[highest_corr_feature]

    return highest_corr_feature, highest_corr_value


In [35]:
corr_matrix = df_feature_analysis.corr().apply(abs)
corr_matrix


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
dur,1.0,0.124502,0.008234,0.103443,0.254559,0.181182,0.199731,0.144134,0.120966,0.012196,...,0.0863,0.094091,0.093923,0.10176,0.020641,0.020641,0.024743,0.080871,0.115336,0.03537
proto,0.124502,1.0,0.170032,0.172441,0.013469,0.026439,0.00592,0.015812,0.013924,0.049944,...,0.191101,0.174965,0.165796,0.175708,0.018003,0.018003,0.028809,0.168121,0.198594,0.585941
service,0.008234,0.170032,1.0,0.144978,0.114403,0.077338,0.105188,0.035492,0.141709,0.295302,...,0.047685,0.038347,0.051106,0.006774,0.071051,0.071051,0.266206,0.028599,0.048011,0.088847
state,0.103443,0.172441,0.144978,1.0,0.078701,0.098268,0.0493,0.059759,0.432307,0.584697,...,0.328748,0.372309,0.408662,0.429906,0.05197,0.05197,0.078856,0.323019,0.387446,0.094198
spkts,0.254559,0.013469,0.114403,0.078701,1.0,0.390067,0.963791,0.206609,0.076358,0.102723,...,0.060194,0.068373,0.072484,0.077553,0.009951,0.009951,0.006084,0.061584,0.069598,0.01777
dpkts,0.181182,0.026439,0.077338,0.098268,0.390067,1.0,0.188476,0.971907,0.098202,0.19258,...,0.071909,0.086695,0.094267,0.094085,0.013491,0.013491,0.047974,0.07519,0.078342,0.021765
sbytes,0.199731,0.00592,0.105188,0.0493,0.963791,0.188476,1.0,0.009926,0.028468,0.02086,...,0.026661,0.02649,0.027281,0.032061,0.004515,0.004515,0.002185,0.027479,0.034553,0.006367
dbytes,0.144134,0.015812,0.035492,0.059759,0.206609,0.971907,0.009926,1.0,0.059475,0.135515,...,0.042633,0.052135,0.056901,0.054633,0.01046,0.01046,0.051403,0.045594,0.044531,0.013147
rate,0.120966,0.013924,0.141709,0.432307,0.076358,0.098202,0.028468,0.059475,1.0,0.407572,...,0.317229,0.353589,0.390721,0.383094,0.06814,0.06814,0.109297,0.310876,0.362883,0.072948
sttl,0.012196,0.049944,0.295302,0.584697,0.102723,0.19258,0.02086,0.135515,0.407572,1.0,...,0.271383,0.344104,0.37993,0.404346,0.124157,0.124157,0.112833,0.273252,0.340678,0.220429


In [36]:
corr_feature={}
corr_tresh= 0.70

for feature in df_feature_analysis.columns:
    f,score= find_highest_correlation(corr_matrix,feature)
    corr_feature[feature] = {'feature':f, 'score':score}

for feature in df_feature_analysis.columns:
    try:
        t = corr_feature[feature]['feature']
        if corr_feature[t]['feature'] == feature:
            del corr_feature[t]
    except KeyError:
        continue

corr_feature = dict(sorted(corr_feature.items(), key=lambda item: item[1]['score'], reverse=True))
corr_feature = {f:corr_feature[f] for f in corr_feature.keys() if corr_feature[f]['score'] >= corr_tresh }
corr_feature


{'is_ftp_login': {'feature': 'ct_ftp_cmd', 'score': 1.0},
 'dbytes': {'feature': 'dloss', 'score': 0.996503594762374},
 'sbytes': {'feature': 'sloss', 'score': 0.9961094729147967},
 'swin': {'feature': 'dwin', 'score': 0.9901399299415929},
 'ct_srv_src': {'feature': 'ct_srv_dst', 'score': 0.9803230099911133},
 'dpkts': {'feature': 'dloss', 'score': 0.9786363765710283},
 'ct_dst_src_ltm': {'feature': 'ct_srv_dst', 'score': 0.9723704538697349},
 'spkts': {'feature': 'sloss', 'score': 0.9710686917738162},
 'ct_dst_ltm': {'feature': 'ct_src_dport_ltm', 'score': 0.9620518416459877},
 'tcprtt': {'feature': 'synack', 'score': 0.9494676611067793},
 'ackdat': {'feature': 'tcprtt', 'score': 0.941760373812716},
 'sinpkt': {'feature': 'is_sm_ips_ports', 'score': 0.941318900735516},
 'ct_dst_sport_ltm': {'feature': 'ct_src_dport_ltm',
  'score': 0.9067931558835277},
 'ct_src_ltm': {'feature': 'ct_src_dport_ltm', 'score': 0.8974378792235619},
 'dttl': {'feature': 'tcprtt', 'score': 0.807340559675238

*For now we know that **is_ftp_login** and **ct_ftp_cmd** are exactly the same, so we can remove one them*

##### PCA


In [37]:
top_n_components =  30

In [38]:
feature_cov = np.dot(df_feature_analysis.transpose(), df_feature_analysis)/len(df_feature_analysis)
eigenvalues, eigenvectors = np.linalg.eig(feature_cov)
sorted_idx = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_idx]
sorted_eigenvectors = eigenvectors[:, sorted_idx]
selected_eigenvectors = eigenvectors[:, :top_n_components]
np.dot(df_feature_analysis, selected_eigenvectors)

array([[ 5.14337814e+02, -2.47848370e+01, -7.25329246e+01, ...,
         8.82728865e-01, -5.96414470e-01, -2.78342902e-01],
       [ 4.19673946e+02, -1.70184256e+02, -4.94340563e+01, ...,
         3.75628929e-01,  3.03280944e+00, -4.39693521e-01],
       [ 4.19007545e+02, -1.71825722e+02, -5.13851754e+01, ...,
         2.00516392e-01,  1.70306558e+00, -4.23704149e-01],
       ...,
       [ 1.65078233e+02,  2.25975551e+02,  2.34039309e+01, ...,
        -8.28487896e-03,  1.02239795e-01,  5.42261022e-02],
       [ 1.67220668e+02,  2.31543480e+02,  2.94424200e+01, ...,
        -1.14948761e-02,  8.44733834e-02, -1.24984158e-03],
       [ 1.67237280e+02,  2.31594853e+02,  2.94842598e+01, ...,
        -1.40546927e-02,  8.80696029e-02, -2.48934602e-02]])

In [39]:
pca = PCA(n_components=top_n_components)
pca.fit(df_feature_analysis.values)
pca_data = pca.transform(df_feature_analysis.values)
explained_variance = pca.explained_variance_ratio_
components = pca.components_
pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
0,232.735542,143.411543,1.537543,-6.262850,4.499817,1.089208,0.752605,-0.732304,-1.610667,0.752400,...,-0.281294,-2.213127,0.277900,-0.730579,0.029953,-0.730029,0.263932,0.836336,-0.549376,-0.262474
1,272.975120,-21.781318,80.781578,19.396505,-21.461913,-3.487111,-17.967769,5.931111,-9.438162,13.407337,...,-0.211272,-2.230645,0.819099,-0.384465,-0.398899,0.521334,0.324582,0.510475,3.090572,-0.646387
2,273.849964,-22.135167,82.390206,11.375026,-8.786566,-2.728823,-4.248254,0.799185,-3.066882,2.521139,...,-0.846008,-0.853795,-1.178543,0.092217,-0.893932,0.411329,-0.195827,0.293107,1.614838,-0.509801
3,274.123922,-22.257985,82.920597,8.811067,-4.659445,-2.504548,-0.108263,-0.758095,0.658715,-0.598509,...,-0.457412,-2.118875,0.211946,-1.048730,-0.313324,-1.122397,0.256969,-0.058108,0.275179,0.518582
4,228.173020,145.718854,-6.143472,22.131000,-41.664735,-1.406028,-39.603300,7.750291,-0.508367,-0.817392,...,-0.007058,-0.484326,0.323591,-0.445368,0.081950,-0.017575,-0.209533,-0.214414,-0.404713,-0.180749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,-195.707750,30.891944,-14.653541,23.204316,-17.390921,1.966095,9.643232,-0.744220,-1.938264,-1.715212,...,-0.025982,0.104042,-0.078373,0.195884,-0.155962,0.020584,0.033555,-0.011771,0.125354,-0.013609
175337,231.394285,144.183106,-0.855338,-6.316717,4.290512,1.145704,0.446148,-0.738438,-1.235383,0.348821,...,-0.121210,-0.963174,0.245972,-0.400484,-0.038401,1.196855,-0.732270,0.000579,-0.288247,-0.063582
175338,-193.496113,29.936197,-11.322303,2.890028,15.810959,1.094417,-7.240361,0.166908,1.451209,-1.415244,...,0.089992,0.173145,0.320253,0.289246,-0.323344,0.006156,-0.003020,-0.011733,0.163635,0.033367
175339,-196.527540,31.253485,-15.928559,30.688590,-29.601102,1.983058,11.348922,0.403778,-3.433630,-3.266061,...,-0.017937,0.125447,-0.134924,0.167472,-0.134461,0.018554,0.030863,-0.008628,0.093509,-0.008659


##### Fisher Score

### Final Preprocessing Step

*Based on the various technique we decided to remove those features*

In [40]:
features_to_remove=['ct_ftp_cmd']
features_to_ohe = list(set(features_to_ohe).difference(features_to_remove))

In [41]:

X_train, Y_train = separate(preprocess_final(df_train,features_to_remove=features_to_remove))
X_test, Y_test = separate(preprocess_final(df_test,features_to_remove=features_to_remove))

In [42]:
print_dataframe_shape(X_train,'Training Set')
print_dataframe_shape(X_test,'Testing Set')

The shape of Training Set is: (175341, 41)
The shape of Testing Set is: (82332, 41)


In [43]:
del df_train, df_test,df_feature_analysis,corr_matrix
collect()

20

In [44]:
X_train

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,-0.191029,113,0,2,-0.104456,-0.135769,-0.049134,-0.102726,-0.576371,252,...,0,1,1,1,1,0,0,1,1,0
1,-0.109485,113,0,2,-0.046014,0.172599,-0.046410,0.188544,-0.576345,62,...,1,1,1,1,2,0,0,1,6,0
2,0.040699,113,0,2,-0.089845,-0.026933,-0.048527,-0.012133,-0.576734,62,...,1,2,1,1,3,0,0,2,6,0
3,0.049729,113,3,2,-0.060624,-0.063212,-0.047016,-0.098563,-0.576737,62,...,1,2,1,1,3,1,0,2,1,0
4,-0.140417,113,0,2,-0.075235,-0.117630,-0.047554,-0.102057,-0.576617,254,...,1,2,2,1,40,0,0,2,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,-0.209773,119,2,3,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,254,...,2,24,24,13,24,0,0,24,24,0
175337,-0.131728,113,0,2,-0.075235,-0.099490,-0.047062,-0.101459,-0.576616,254,...,1,1,1,1,2,0,0,1,1,0
175338,-0.209773,119,2,3,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,254,...,2,3,3,3,13,0,0,3,12,0
175339,-0.209773,119,2,3,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,254,...,2,30,30,14,30,0,0,30,30,0


## Training


In [None]:
class LabelClass

    def __init__(self,pos_class, neg_class,pos_name,neg_name) -> None:
        self.PositiveClass:int = pos_class
        self.NegativeClass:int = neg_class
        self.NegativeName:str = neg_name
        self.PositiveName:str = pos_name

problem_label_class = LabelClass(0,1,'Normal','Attack')

In [45]:
class BinaryClassifier:

    def __init__(self,label_class=problem_label_class):
        self.X = None
        self.Y = None
        self.Y_Pred:list = None
        self.Y_PredProba=[]


        self.label_class = label_class

    def fit(self):
        ...

    def predict(self): ...

    def _compute_analysis(self,y_test):
        self.TP=0
        self.TN=0
        self.FP=0
        self.FN=0

        # Positive class = 0
        # Negative class = 1

        for truth,pred in zip(y_test,self.Y_Pred):
            if truth ==self.label_class.NegativeClass and pred ==self.label_class.NegativeClass:
              self.TN+=1
            elif truth ==self.label_class.PositiveClass and pred ==self.label_class.PositiveClass:
              self.TP+=1
            elif truth ==self.label_class.NegativeClass and pred == self.label_class.PositiveClass:
              self.FP+=1
            else
              self.FN+=1
        self.roc_info = roc_curve(y_test, self.Y_PredProba)
        self.precsion_recall_info = precision_recall_curve(y_test, self.Y_PredProba)

    def plot_confusion_matrix(self):
      confusion_matrix = np.array([[self.TN, self.FP], [self.FN, self.TP]])

      plt.figure(figsize=(6, 4))
      sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                  xticklabels=['Predicted Negative', 'Predicted Positive'],
                  yticklabels=['Actual Negative', 'Actual Positive'])

      plt.title('Confusion Matrix')
      plt.xlabel('Predicted Labels')
      plt.ylabel('True Labels')
      plt.show()

    def plot_roc_curve(self):
      fpr, tpr, thresholds_roc =self.roc_info
      plt.figure()
      plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc(fpr, tpr):0.2f})')
      plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
      plt.xlabel('False Positive Rate')
      plt.ylabel('True Positive Rate')
      plt.title('ROC Curve')
      plt.legend(loc="lower right")
      plt.show()

    def plot_precision_recall_curve(self):
      precision, recall, thresholds_pr = self.precision_recall_info
      plt.figure()
      plt.plot(recall, precision, color='b', lw=2)
      plt.xlabel('Recall')
      plt.ylabel('Precision')
      plt.title('Precision-Recall Curve')
      plt.show()

    @property
    def accuracy(self):
      return self.TP + self.TN/self.TP + self.TN +self.FP + self.FN

    @property
    def f1_score(self):
      2*self.precision * self.recall/self.precision+self.recall

    @property
    def precision(self):
      return self.TP/self.TP + self.FP

    @property
    def recall(self):
      return self.TP/self.TP + self.FN


### Decision Tree


##### TreeNode class


In [46]:
class TreeNode:
    ...


class LeafNode:
    ...

##### DecisionTree class


In [47]:
class DecisionTreeClassifier(BinaryClassifier):

    def train(self):
        ...

    def _entropy(self):
      ...

    def _information_gain(self):
      ...



### K-Nearest Neighbors


In [48]:
class KNNClassifier(BinaryClassifier):

    def __init__(self, max_k:int=None,N_batch=100 ) -> None:
        super().__init__()
        if max_k is not None
          self.K = self._to_odd_number( max_k-1)
        else:
          self.K = self._to_odd_number(round(max_k**0.5))
        self.N_batch = N_batch
        self.ohe_func = cp.vectorize(self._to_one_hot_encoding)

    def fit(self,X_train:pd.DataFrame,Y_train:pd.DataFrame):
        self.X  = X_train
        self.Y = Y_train
        self.x_num, self.x_ohe = self._split(X_train)

    def _to_odd_number(self, val):
        return val-1 if val%2 == 0 else val

    def _split(self,df:pd.DataFrame):
        return df.drop(features_to_ohe),df[features_to_ohe]

    def predict(self,x_test,y_test):
      dataframes_indices = self._predict(self._split(x_test))
      df_distances = pd.DataFrame(pd.concat(dataframes_indices).apply(self.prevote,axis=1))
      self.Y_Pred = df_distances.label.apply(self._vote_majority).values
      del dataframes_indices, df_distances
      collect()
      self._compute_analysis()

    def _prevote(row):
      return [int(Y_train[i]) for i in  row.tolist()]

    def _predict(self,test:tuple):
      test_x_num, test_x_ohe = test
      N =len(test_x_num)
      fold_size =  N / self.N_batch
      dataframes_indices = []
      #sleep(100)
      for i in tqdm(range(self.N_batch)):
        cp.get_default_memory_pool().free_all_blocks()
        a,b= round(fold_size*i),round(fold_size*(i+1))
        num,ohe = test_x_num[a:b],test_x_ohe[a:b]
        temp = self._compute_distance(num,self.x_num) + self.ohe_func(self._compute_distance(ohe,self.x_ohe))
        top_500_indices = cp.argsort(temp, axis=1)[:, :self.K] # TODO check give the label now
        dataframes_indices.append(pd.DataFrame(top_500_indices.get()))
        del temp, top_500_indices
        collect()
        print(f'\nCalculating distance of the max k in Batch: {i+1}/{self.N_batch}')
        sleep(0.1)
      return dataframes_indices

    def _to_one_hot_encoding(x):
      return 0 if x == 0 else 1

    def _compute_distance(a,b):
      A = a.to_numpy(dtype='float32')
      B = b.to_numpy(dtype='float32')
      A = cp.asarray(A)
      B = cp.asarray(B)
      A_sq_norms = cp.sum(A ** 2, axis=1).reshape(-1, 1)  # Shape (n, 1)
      B_sq_norms = cp.sum(B ** 2, axis=1).reshape(1, -1)  # Shape (1, m)

      dot_product = cp.dot(A, B.T)  # Shape (n, m)
      euclidean_distances = A_sq_norms + B_sq_norms - 2 * dot_product
      del A_sq_norms,B_sq_norms, dot_product, A,B
      collect()
      return euclidean_distances

    def _vote_majority(self,label_vectors):
      n = len(label_vectors)
      sum_one = label_vectors.count(1)
      sum_zero =n-sum_one
      self.Y_PredProba.append(sum_zero/n)
      return 1 if sum_one > len(label_vectors)-sum_one else 0



## Testing


## Conclusion
