# Application of ML-based algorithm on The TON_IoT Datasets


## Library


In [None]:
#!pip install torch scikit-learn pandas numpy prettyprint cupy tqdm matplotlib

In [1]:
import pandas as pd
import numpy as np
from typing import Literal, Callable
import timeit
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from pprint import pprint
from gc import collect
from time import sleep
from sklearn.model_selection import train_test_split

#import cupy as cp
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from colorama import Fore, Back, Style


## Loading the data set


In [2]:
TRAIN_DATASET = 'UNSW_NB15_training-set.csv'
TEST_DATASET = 'UNSW_NB15_testing-set.csv'

VALIDATION_FRAC = 0.35

In [3]:
df_train = pd.read_csv(TRAIN_DATASET)
df_test = pd.read_csv(TEST_DATASET)

In [4]:
df_train.head(5)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [5]:
def print_dataframe_shape(df:pd.DataFrame,name):print(f"The shape of {name} is: {df.shape}")

In [6]:
print_dataframe_shape(df_train,'Training Set')
print_dataframe_shape(df_test,'Testing Set')

The shape of Training Set is: (175341, 45)
The shape of Testing Set is: (82332, 45)


In [7]:
Features = Literal['dur',
 'proto',
 'service',
 'state',
 'spkts',
 'dpkts',
 'sbytes',
 'dbytes',
 'rate',
 'sttl',
 'dttl',
 'sload',
 'dload',
 'sloss',
 'dloss',
 'sinpkt',
 'dinpkt',
 'sjit',
 'djit',
 'swin',
 'stcpb',
 'dtcpb',
 'dwin',
 'tcprtt',
 'synack',
 'ackdat',
 'smean',
 'dmean',
 'trans_depth',
 'response_body_len',
 'ct_srv_src',
 'ct_state_ttl',
 'ct_dst_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_flw_http_mthd',
 'ct_src_ltm',
 'ct_srv_dst',
 'is_sm_ips_ports',
 'label']

### Understanding the data


In [8]:
y_label = 'label'
features_list = df_train.columns.tolist()

In [9]:
def separate(df:pd.DataFrame,):
    features = features_list.copy()
    features.remove(y_label)
    return df.drop(y_label,axis=1),df[y_label]

## Preprocessing the Data


### Cleaning the Data


#### Helper Function

In [10]:
def standardize(df,column:Features):
    col_values = df[column].values

    mean = np.mean(col_values)
    std = np.std(col_values)
    col_values = col_values-mean
    col_values= col_values/std
    return pd.Series(col_values, name=column)


def min_max_scaling(df, column:Features):
    col_values = df[column].values
    min_value = np.min(col_values)
    max_value = np.max(col_values)
    scaled_values = (col_values - min_value) / (max_value - min_value)
    return pd.Series(scaled_values, name=column)

def state_to_mask(state_vector: np.ndarray):
    unique_val = np.unique(state_vector)
    size = len(unique_val)
    return { unique_val[mask]:mask for mask in range(size)}

def one_hot_encoding(state_mask:dict[int,str]):
    def wrapper(mask: str):
        v = np.zeros((1, len(state_mask)))
        mask = state_mask[mask]
        v[0][mask] = 1
        return v
    return wrapper

def one_hot_vector_distance(v1: np.ndarray, v2: np.ndarray):
    if v1.shape != v2.shape:
        raise
    if np.array_equal(v1, v2):
        return 0
    return 1

def str_encoder(df:pd.DataFrame,column:Features):
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])

In [11]:

def remove_uncessaryFeature(df: pd.DataFrame, features: list = []):
    try:
        return df.drop(features, axis=1)
    except KeyError :
        return df

#### Cleaning Function ...

In [12]:
features_to_normalize=['dur','spkts','stcpb','dtcpb','dpkts','dbytes','sbytes',
                       'rate',
    'sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','tcprtt','synack','smean','dmean','response_body_len',]
features_to_ohe=['proto','service','state','is_ftp_login','ct_ftp_cmd','ct_flw_http_mthd','ct_state_ttl']
initial_features_to_remove = ['id','attack_cat']

def preprocess_final(df:pd.DataFrame, normalize: Literal['min_max_scaling','standardize']=standardize,features_to_remove:list=[]):

    ftr = set(features_to_remove)
    df =remove_uncessaryFeature(df,[*initial_features_to_remove,*features_to_remove])
    
    for feature in set(features_to_ohe).difference(ftr):
        #ohe_func = one_hot_encoding(state_to_mask(df[feature]))
        str_encoder(df,feature)

    for feature in set(features_to_normalize).difference(ftr):
        df[feature] = normalize(df,feature)

    return df


In [14]:

text_featuresType = ['proto','service','state']
def preprocess_partial(df:pd.DataFrame):

    df =remove_uncessaryFeature(df,['label',*initial_features_to_remove])

    for feature in text_featuresType:
        str_encoder(df,feature)

    for feature in features_to_normalize:
        df[feature] = standardize(df,feature)

    return df

## Feature Selection


##### Looking for the features that has highest impact


In [15]:
df_feature_analysis= preprocess_partial(df_train)

##### Correlation Matrix


In [16]:
def find_highest_correlation(corr_matrix:pd.DataFrame, target_feature:str):
    target_corr = corr_matrix[target_feature].drop(target_feature)
    highest_corr_feature = target_corr.idxmax()
    highest_corr_value = target_corr[highest_corr_feature]
    
    return highest_corr_feature, highest_corr_value


In [17]:
corr_matrix = df_feature_analysis.corr().apply(abs)
corr_matrix

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
dur,1.0,0.124502,0.008234,0.103443,0.254559,0.181182,0.199731,0.144134,0.120966,0.012196,...,0.0863,0.094091,0.093923,0.10176,0.020641,0.020641,0.024743,0.080871,0.115336,0.03537
proto,0.124502,1.0,0.170032,0.172441,0.013469,0.026439,0.00592,0.015812,0.013924,0.049944,...,0.191101,0.174965,0.165796,0.175708,0.018003,0.018003,0.028809,0.168121,0.198594,0.585941
service,0.008234,0.170032,1.0,0.144978,0.114403,0.077338,0.105188,0.035492,0.141709,0.295302,...,0.047685,0.038347,0.051106,0.006774,0.071051,0.071051,0.266206,0.028599,0.048011,0.088847
state,0.103443,0.172441,0.144978,1.0,0.078701,0.098268,0.0493,0.059759,0.432307,0.584697,...,0.328748,0.372309,0.408662,0.429906,0.05197,0.05197,0.078856,0.323019,0.387446,0.094198
spkts,0.254559,0.013469,0.114403,0.078701,1.0,0.390067,0.963791,0.206609,0.076358,0.102723,...,0.060194,0.068373,0.072484,0.077553,0.009951,0.009951,0.006084,0.061584,0.069598,0.01777
dpkts,0.181182,0.026439,0.077338,0.098268,0.390067,1.0,0.188476,0.971907,0.098202,0.19258,...,0.071909,0.086695,0.094267,0.094085,0.013491,0.013491,0.047974,0.07519,0.078342,0.021765
sbytes,0.199731,0.00592,0.105188,0.0493,0.963791,0.188476,1.0,0.009926,0.028468,0.02086,...,0.026661,0.02649,0.027281,0.032061,0.004515,0.004515,0.002185,0.027479,0.034553,0.006367
dbytes,0.144134,0.015812,0.035492,0.059759,0.206609,0.971907,0.009926,1.0,0.059475,0.135515,...,0.042633,0.052135,0.056901,0.054633,0.01046,0.01046,0.051403,0.045594,0.044531,0.013147
rate,0.120966,0.013924,0.141709,0.432307,0.076358,0.098202,0.028468,0.059475,1.0,0.407572,...,0.317229,0.353589,0.390721,0.383094,0.06814,0.06814,0.109297,0.310876,0.362883,0.072948
sttl,0.012196,0.049944,0.295302,0.584697,0.102723,0.19258,0.02086,0.135515,0.407572,1.0,...,0.271383,0.344104,0.37993,0.404346,0.124157,0.124157,0.112833,0.273252,0.340678,0.220429


In [18]:
corr_feature={}
corr_tresh= 0.94

for feature in df_feature_analysis.columns:
    f,score= find_highest_correlation(corr_matrix,feature)
    corr_feature[feature] = {'feature':f, 'score':score}

for feature in df_feature_analysis.columns:
   try:
       t = corr_feature[feature]['feature']
       if corr_feature[t]['feature'] == feature:
           del corr_feature[t]
   except KeyError:
       continue

corr_feature = dict(sorted(corr_feature.items(), key=lambda item: item[1]['score'], reverse=True))
corr_feature = {f:corr_feature[f] for f in corr_feature.keys() if corr_feature[f]['score'] >= corr_tresh }
corr_feature


{'is_ftp_login': {'feature': 'ct_ftp_cmd', 'score': np.float64(1.0)},
 'dbytes': {'feature': 'dloss', 'score': np.float64(0.996503594762374)},
 'sbytes': {'feature': 'sloss', 'score': np.float64(0.9961094729147967)},
 'swin': {'feature': 'dwin', 'score': np.float64(0.9901399299415929)},
 'ct_srv_src': {'feature': 'ct_srv_dst',
  'score': np.float64(0.9803230099911133)},
 'dpkts': {'feature': 'dloss', 'score': np.float64(0.9786363765710283)},
 'ct_dst_src_ltm': {'feature': 'ct_srv_dst',
  'score': np.float64(0.9723704538697349)},
 'spkts': {'feature': 'sloss', 'score': np.float64(0.9710686917738162)},
 'ct_dst_ltm': {'feature': 'ct_src_dport_ltm',
  'score': np.float64(0.9620518416459877)},
 'tcprtt': {'feature': 'synack', 'score': np.float64(0.9494676611067793)},
 'ackdat': {'feature': 'tcprtt', 'score': np.float64(0.941760373812716)},
 'sinpkt': {'feature': 'is_sm_ips_ports',
  'score': np.float64(0.941318900735516)}}

In [19]:
def compare_features(feature:list[Features],order_by:Features, ascending:bool=True):
    if y_label not in feature:
        feature.append(y_label)
        print(feature)
    if order_by not in feature:
        raise ValueError('order_by must be in the feature parameter')
    
    if order_by == y_label:
        raise ValueError(f'cannot order_by the {y_label}')
    return df_train[feature].sort_values(by=order_by,axis=0,ascending=ascending)

*For now we know that **is_ftp_login** and **ct_ftp_cmd** are exactly the same, so we can remove one them*

In [25]:
features_to_remove=['is_ftp_login']
#features_to_remove=['is_ftp_login','sbytes','dbytes','swin','dpkts','spkts']

##### PCA


In [21]:
top_n_components =  30

In [26]:
feature_cov = np.dot(df_feature_analysis.transpose(), df_feature_analysis)/len(df_feature_analysis)
eigenvalues, eigenvectors = np.linalg.eig(feature_cov)
pca_index= np.argsort(eigenvalues)[::-1][:top_n_components]
pca_feature = df_feature_analysis.columns[pca_index]
pca_feature

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len'],
      dtype='object')

In [27]:
def toPCA_space(df:pd.DataFrame,pca_list,top_n_components=top_n_components):
    pca = PCA(n_components=top_n_components)
    pca.fit(df.values)
    pca_data = pca.transform(df.values)
    return pd.DataFrame(pca_data, columns=pca_list),pca.explained_variance_ratio_

### Final Preprocessing Step

*Based on the various technique we decided to remove those features*

In [28]:
features_to_ohe = list(set(features_to_ohe).difference(features_to_remove))

In [30]:
X_train, Y_train = separate(preprocess_final(df_train,features_to_remove=features_to_remove))
X_test, Y_test = separate(preprocess_final(df_test,features_to_remove=features_to_remove))
#X_train_PCA = toPCA_space(df_train.drop([y_label]),pca_feature.to_list(),)

In [None]:
#X_train = X_train_PCA

In [31]:
print_dataframe_shape(X_train,'Training Set')
print_dataframe_shape(X_test,'Testing Set')

The shape of Training Set is: (175341, 41)
The shape of Testing Set is: (82332, 41)


In [None]:
del df_train, df_test,df_feature_analysis,corr_matrix
collect()

## Model


In [32]:
class LabelClass:

    def __init__(self,pos_class, neg_class,pos_name,neg_name,prefered_class=None) -> None:
        self.PositiveClass:int = pos_class
        self.NegativeClass:int = neg_class
        self.NegativeName:str = neg_name
        self.PositiveName:str = pos_name
        self.PreferedClass= self.NegativeClass if prefered_class is None else self.PositiveClass

        self.answer={
            self.PositiveClass: self.PositiveName,
            self.NegativeClass: self.NegativeName
        }
    
problem_label_class = LabelClass(0,1,'Normal','Attack')

In [33]:
class BinaryClassifier:

    def __init__(self,label_class=problem_label_class):
        self.X = None
        self.Y = None
        self.Y_Pred:list = None
        self.Y_PredProba=[]
        self.label_class = label_class

    def fit(self):
      ...

    def predict(self): 
      ...

    def _compute_analysis(self,y_test):
        self.TP=0
        self.TN=0
        self.FP=0
        self.FN=0

        # Positive class = 0
        # Negative class = 1

        for truth,pred in zip(y_test,self.Y_Pred):
            if truth ==self.label_class.NegativeClass and pred ==self.label_class.NegativeClass:
              self.TN+=1
            elif truth ==self.label_class.PositiveClass and pred ==self.label_class.PositiveClass:
              self.TP+=1
            elif truth ==self.label_class.NegativeClass and pred == self.label_class.PositiveClass:
              self.FP+=1
            else:
              self.FN+=1
        self.roc_info = roc_curve(y_test, self.Y_PredProba)
        self.precision_recall_info = precision_recall_curve(y_test, self.Y_PredProba)

    def plot_confusion_matrix(self):
      confusion_matrix = np.array([[self.TN, self.FP], [self.FN, self.TP]])

      plt.figure(figsize=(6, 4))
      sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                  xticklabels=['Predicted Negative', 'Predicted Positive'],
                  yticklabels=['Actual Negative', 'Actual Positive'])

      plt.title('Confusion Matrix')
      plt.xlabel('Predicted Labels')
      plt.ylabel('True Labels')
      plt.show()

    def plot_roc_curve(self):
      fpr, tpr, thresholds_roc =self.roc_info
      plt.figure()
      plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc(fpr, tpr):0.2f})')
      plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
      plt.xlabel('False Positive Rate')
      plt.ylabel('True Positive Rate')
      plt.title('ROC Curve')
      plt.legend(loc="lower right")
      plt.show()

    def plot_precision_recall_curve(self):
      precision, recall, thresholds_pr = self.precision_recall_info
      plt.figure()
      plt.plot(recall, precision, color='b', lw=2)
      plt.xlabel('Recall')
      plt.ylabel('Precision')
      plt.title('Precision-Recall Curve')
      plt.show()

    @property
    def accuracy(self):
      return (self.TP + self.TN)/(self.TP + self.TN +self.FP + self.FN)

    @property
    def f1_score(self):
      return (2*self.precision * self.recall)/(self.precision+self.recall)

    @property
    def precision(self):
      return (self.TP)/(self.TP + self.FP)

    @property
    def recall(self):
      return self.TP/(self.TP + self.FN)


### Decision Tree


##### Question

In [48]:

class Question:

    def __init__(self,feature:str,value:float,information_gain:float):
        self.feature = feature
        self.value = value
        self.information_gain = information_gain
        
    def split(self,dataset:pd.DataFrame)->tuple[pd.DataFrame,pd.DataFrame]:
        ...

    def __repr__(self,_type):
        return f'Is {Style.DIM}{self.feature}{Style.RESET_ALL} {Style.BRIGHT}{_type}{Style.RESET_ALL} to {Style.DIM}{self.value}{Style.RESET_ALL} ? - Gain[{self.information_gain}]'

    def match(self,vector:pd.Series) -> bool:
        ...
        
    def __eq__(self, other):
        return self.information_gain == other.information_gain

    def __ne__(self, other):
        return self.information_gain != other.information_gain

    def __gt__(self, other):
        return self.information_gain > other.information_gain

    def __ge__(self, other):
        return self.information_gain >= other.information_gain

class QuestionEqual(Question):
    def split(self, dataset):
       return dataset[dataset[self.feature] == self.value], dataset[dataset[self.feature] != self.value]
    
    def match(self,vector):
        return vector[self.feature] == self.value
    
    def __repr__(self):
        return super().__repr__('equal')
    

class QuestionThresh(Question):
    def split(self, dataset):
       return dataset[dataset[self.feature] >= self.value], dataset[dataset[self.feature] < self.value]
    
    def match(self,vector):
        return vector[self.feature] >= self.value
    
    def __repr__(self):
        return super().__repr__('greater or equal')


##### Node class


In [36]:
class Node:
    ...

class TreeNode(Node):
    def __init__(self,question:Question,left:Node,right:Node):
        self.question = question
        """
        Satisfy the match
        """
        self.left=left
        """
        Dissatisfy the match
        """
        self.right = right
    
    def match(self,value) -> bool:
        return self.question.match(value)
    
    def __repr__(self):
        return repr(self.question)
    

class LeafNode(Node):

    def __init__(self,probabilities,label_class:LabelClass):
        self.proba = probabilities
        self.label_class = label_class
        self.answer:Literal[0,1,None] = self._compute_answer()
        
    def _compute_answer(self):
        label_0 = self.proba[0]
        label_1 = self.proba[1]

        if label_0 == label_1:
            return self.label_class.PreferedClass
        
        return 1 if label_1 > label_0 else 0
            
    @property
    def answer_proba(self):
        return self.proba[self.answer]

    def repr(self):
        return f'It is {Style.DIM}{self.label_class.answer[self.answer]}{Style.RESET_ALL} with a probability of {self.answer_proba:.4f} %'


##### DecisionTree class


In [37]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

In [49]:
ImpurityType=Literal['gini_index','entropy']

class DecisionTreeClassifier(BinaryClassifier):
  
    def __init__(self,max_height:int,min_information_gain:float,min_sample:int,impurity:ImpurityType='entropy',label_class=problem_label_class):
      super().__init__(label_class)
      self.max_height = max_height
      self.min_information_gain = min_information_gain
      self.min_sample = min_sample
      self.impurity = self._gini_impurity if impurity == 'gini_index' else self._entropy
      self.root: TreeNode = None

    def fit(self,x_train:pd.DataFrame,y_train:pd.DataFrame):
      self.X = x_train
      self.Y = y_train
      self.root_dataset = pd.concat([self.X,self.Y],axis=1)
       
    def train(self,x_val,y_val):
      self.root =self._build_tree(self.root_dataset)
      self.predict(x_val,y_val)

    def repr(self):
      return f'DecisionTree(Max_Depth={self.max_height},Min_Inf_Gain={self.min_information_gain},Min_Sample={self.min_sample},Impurity={self.impurity})'
    
    def predict(self,x_to_test,y_to_test):
      self.Y_Pred = self._predict(x_to_test)
      self._compute_analysis(y_to_test)

    def _predict(self,x_to_test:pd.DataFrame):
      return x_to_test.drop([y_label]).apply(self._traverse_tree,axis=0).values
      
    def _entropy(self,labels:np.ndarray):
      probabilities = self._compute_target_probabilities(labels)
      return -np.sum(probabilities * np.log2(probabilities))

    def _gini_impurity(self,labels:np.ndarray):
      probabilities = self._compute_target_probabilities(labels)
      return (1 - np.sum(probabilities**2))

    def _compute_target_probabilities(self,labels):
      _, counts = np.unique(labels, return_counts=True)
      return counts / len(labels)

    def _information_gain(self,current_information_gain:float,mean_impurity:float):
      return current_information_gain - mean_impurity

    def _build_tree(self,dataset:pd.DataFrame,current_depth:int =0) ->TreeNode | LeafNode:
        parent_gain = self.impurity(dataset)
        current_n = len(dataset)

        if current_depth >= self.max_height or current_n < self.min_sample or parent_gain < self.min_information_gain:
          proba= self._compute_target_probabilities(dataset[y_label].values)
          return LeafNode(proba,self.label_class)
        
        best_question=self._find_best_split(dataset,parent_gain)

        left_dataset,right_dataset =self._split_dataset(dataset,best_question)
        left_child = self._build_tree(left_dataset,current_depth+1)
        right_child = self._build_tree(right_dataset,current_depth+1)

        return TreeNode(best_question,left_child,right_child)

    def _traverse_tree(self,x_vector:pd.Series):
      current_node:LeafNode | TreeNode = self.root
      while isinstance(current_node,TreeNode):
          answer = current_node.match(x_vector)
          current_node = current_node.left if answer else current_node.right
      
      self.Y_PredProba.append(current_node.proba)
      return current_node.answer

    def print_tree(self,):
        self._print_tree(self.root,0)

    def _print_tree(self,node:Node| TreeNode, depth,answer =None):
        print('' if answer is None else answer,' '*depth,node)
        if type(node) is TreeNode:
          self._print_tree(node.left,depth+1,'YES... ')
          self._print_tree(node.right,depth+1,'NO... ')

    def _split_dataset(self,dataset:pd.DataFrame,question:Question):
      return question.split(dataset)

    def _find_best_split(self,dataset:pd.DataFrame,current_gain:float)->Question:
      
      best_question = Question(None,None,float('inf'))
      for feature in dataset.columns:
        if feature in features_to_ohe: 
           for values in dataset[feature].unique():
            best_question = self._compute_best_question(dataset, current_gain,feature, values,best_question,QuestionEqual)            
        else:
          val_unique_mean = dataset[feature].unique().mean()
          val_mean = dataset[feature].mean()
          val_median = dataset[feature].median()

          for values in [val_unique_mean,val_mean,val_median]:
            best_question = self._compute_best_question(dataset, current_gain,feature, values,best_question,QuestionThresh)

      return Question
    
    def _compute_best_question(self, dataset:pd.DataFrame, current_gain:float,feature:str, values:float,best_question:Question,Q_type:type) ->Question:
        N = len(dataset)
        if Q_type== QuestionEqual:
          y_satisfaction,y_dissatisfaction  = dataset[dataset[feature]== values].label.values,dataset[dataset[feature]!= values].label.values

        else:
          y_satisfaction, y_dissatisfaction = dataset[dataset[feature] >= values].label.values, dataset[dataset[feature] < values].label.values

        mean_impurity = (len(y_dissatisfaction)/N)*self.impurity(y_dissatisfaction) + (len(y_satisfaction)/N)*self.impurity(y_satisfaction)
        info_gain = self._information_gain(current_gain,mean_impurity)
        print(Q_type)
        question =  Q_type(feature,values,info_gain)

        return question if question > best_question else best_question


### K-Nearest Neighbors


In [48]:
class KNNClassifier(BinaryClassifier):

    def __init__(self,ohe_feature:list[str] ,max_k:int=None,N_batch=100) -> None:
        super().__init__()
        if max_k is not None:
          self.K = self._to_odd_number( max_k-1)
        else:
          self.K = self._to_odd_number(round(max_k**0.5))
        self.N_batch = N_batch
        self.ohe_features =  ohe_feature
        self.ohe_func = cp.vectorize(self._to_one_hot_encoding)
      

    def fit(self,X_train:pd.DataFrame,Y_train:pd.DataFrame):
        self.X  = X_train
        self.Y = Y_train
        self.x_num, self.x_ohe = self._split(X_train)

    def _to_odd_number(self, val):
        return val-1 if val%2 == 0 else val

    def _split(self,df:pd.DataFrame):
        return df.drop(self.ohe_features),df[self.ohe_features]

    def predict(self,x_test,y_test):
      dataframes_indices = self._predict(self._split(x_test))
      df_distances = pd.DataFrame(pd.concat(dataframes_indices).apply(self._prevote,axis=1))
      self.Y_Pred = df_distances.label.apply(self._vote_majority).values
      del dataframes_indices, df_distances
      collect()
      self._compute_analysis(y_test)

    def _prevote(row):
      return [int(Y_train[i]) for i in  row.tolist()]

    def _predict(self,test:tuple):
      test_x_num, test_x_ohe = test
      N =len(test_x_num)
      fold_size =  N / self.N_batch
      dataframes_indices = []
      #sleep(100)
      for i in tqdm(range(self.N_batch)):
        cp.get_default_memory_pool().free_all_blocks()
        a,b= round(fold_size*i),round(fold_size*(i+1))
        num,ohe = test_x_num[a:b],test_x_ohe[a:b]
        temp = self._compute_distance(num,self.x_num) + self.ohe_func(self._compute_distance(ohe,self.x_ohe))
        top_500_indices = cp.argsort(temp, axis=1)[:, :self.K] # TODO check give the label now
        dataframes_indices.append(pd.DataFrame(top_500_indices.get()))
        del temp, top_500_indices
        collect()
        #print(f'\nCalculating distance of the max k in Batch: {i+1}/{self.N_batch}')
        sleep(0.1)
      return dataframes_indices

    def _to_one_hot_encoding(x):
      return 0 if x == 0 else 1

    def _compute_distance(a,b):
      A = a.to_numpy(dtype='float32')
      B = b.to_numpy(dtype='float32')
      A = cp.asarray(A)
      B = cp.asarray(B)
      A_sq_norms = cp.sum(A ** 2, axis=1).reshape(-1, 1)  # Shape (n, 1)
      B_sq_norms = cp.sum(B ** 2, axis=1).reshape(1, -1)  # Shape (1, m)

      dot_product = cp.dot(A, B.T)  # Shape (n, m)
      euclidean_distances = A_sq_norms + B_sq_norms - 2 * dot_product
      del A_sq_norms,B_sq_norms, dot_product, A,B
      collect()
      return euclidean_distances

    def _vote_majority(self,label_vectors):
      n = len(label_vectors)
      sum_one = label_vectors.count(1)
      sum_zero =n-sum_one
      self.Y_PredProba.append(sum_zero/n)
      return 1 if sum_one > len(label_vectors)-sum_one else 0



## Training

In [50]:
test = DecisionTreeClassifier(8,.2,500)
test.fit(x_train,y_train)
test.train(x_val,y_val)
test.accuracy

<class '__main__.QuestionThresh'>
<class '__main__.QuestionThresh'>
<class '__main__.QuestionThresh'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class '__main__.QuestionEqual'>
<class 

TypeError: Question.split() missing 1 required positional argument: 'dataset'

## Testing


In [None]:
def test_model(model:BinaryClassifier):
    ...

## Conclusion
