# Phase 1

Original k-NN implementation based on code found in https://github.com/rushter/MLAlgorithms.

In [4]:
import numpy as np


class CART(object):
    def __init__(self, tree = 'cls', criterion = 'gini', prune = 'depth', max_depth = 4, min_criterion = 0.05):
        self.feature = None
        self.label = None
        self.n_samples = None
        self.gain = None
        self.left = None
        self.right = None
        self.threshold = None
        self.depth = 0

        self.root = None
        self.criterion = criterion
        self.prune = prune
        self.max_depth = max_depth
        self.min_criterion = min_criterion
        self.tree = tree

    def fit(self, features, target):
        self.root = CART()
        if(self.tree == 'cls'):
            self.root._grow_tree(features, target, self.criterion)
        else:
            self.root._grow_tree(features, target, 'mse')
        self.root._prune(self.prune, self.max_depth, self.min_criterion, self.root.n_samples)

    def predict(self, features):
        return np.array([self.root._predict(f) for f in features])

    def print_tree(self):
        self.root._show_tree(0, ' ')

    def _grow_tree(self, features, target, criterion = 'gini'):
        self.n_samples = features.shape[0] 

        if len(np.unique(target)) == 1:
            self.label = target[0]
            return

        best_gain = 0.0
        best_feature = None
        best_threshold = None

        if criterion in {'gini', 'entropy'}:
            self.label = max([(c, len(target[target == c])) for c in np.unique(target)], key = lambda x : x[1])[0]
        else:
            self.label = np.mean(target)

        impurity_node = self._calc_impurity(criterion, target)
        
        for col in range(features.shape[1]):
            feature_level = np.unique(features[:,col])
            thresholds = (feature_level[:-1] + feature_level[1:]) / 2.0

            for threshold in thresholds:
                target_l = target[features[:,col] <= threshold]
                impurity_l = self._calc_impurity(criterion, target_l)
                n_l = float(target_l.shape[0]) / self.n_samples

                target_r = target[features[:,col] > threshold]
                impurity_r = self._calc_impurity(criterion, target_r)
                n_r = float(target_r.shape[0]) / self.n_samples

                impurity_gain = impurity_node - (n_l * impurity_l + n_r * impurity_r)
                if impurity_gain > best_gain:
                    best_gain = impurity_gain
                    best_feature = col
                    best_threshold = threshold

        self.feature = best_feature
        self.gain = best_gain
        self.threshold = best_threshold
        self._split_tree(features, target, criterion)

    def _split_tree(self, features, target, criterion):
        features_l = features[features[:, self.feature] <= self.threshold]
        target_l = target[features[:, self.feature] <= self.threshold]
        self.left = CART()
        self.left.depth = self.depth + 1
        self.left._grow_tree(features_l, target_l, criterion)

        features_r = features[features[:, self.feature] > self.threshold]
        target_r = target[features[:, self.feature] > self.threshold]
        self.right = CART()
        self.right.depth = self.depth + 1
        self.right._grow_tree(features_r, target_r, criterion)

    def _calc_impurity(self, criterion, target):
        if criterion == 'gini':
            return 1.0 - sum([(float(len(target[target == c])) / float(target.shape[0])) ** 2.0 for c in np.unique(target)])
        elif criterion == 'mse':
            return np.mean((target - np.mean(target)) ** 2.0)
        else:
            entropy = 0.0
            for c in np.unique(target):
                p = float(len(target[target == c])) / target.shape[0]
                if p > 0.0:
                    entropy -= p * np.log2(p)
            return entropy            

    def _prune(self, method, max_depth, min_criterion, n_samples):
        if self.feature is None:
            return

        self.left._prune(method, max_depth, min_criterion, n_samples)
        self.right._prune(method, max_depth, min_criterion, n_samples)

        pruning = False

        if method == 'impurity' and self.left.feature is None and self.right.feature is None: 
            if (self.gain * float(self.n_samples) / n_samples) < min_criterion:
                pruning = True
        elif method == 'depth' and self.depth >= max_depth:
            pruning = True

        if pruning is True:
            self.left = None
            self.right = None
            self.feature = None

    def _predict(self, d):
        if self.feature != None:
            if d[self.feature] <= self.threshold:
                return self.left._predict(d)
            else:
                return self.right._predict(d)
        else: 
            return self.label

    def _show_tree(self, depth, cond):
        base = '    ' * depth + cond
        if self.feature != None:
            print(base + 'if X[' + str(self.feature) + '] <= ' + str(self.threshold))
            self.left._show_tree(depth+1, 'then ')
            self.right._show_tree(depth+1, 'else ')
        else:
            print(base + '{value: ' + str(self.label) + ', samples: ' + str(self.n_samples) + '}')


import os
import pandas as pd
from sklearn.model_selection import train_test_split

base_dir = "data/raw/class_imbalance"
for dataset in os.listdir(base_dir):
    file_path = os.path.join(base_dir, dataset)
    df = pd.read_csv(file_path)
    print(dataset)
    newdf = df.dropna(axis=0, how='any')

    # Load preprocessed data
    X = newdf.iloc[:, :-1]
    y = newdf.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

    # Initialize and train model
    tree = CART(tree = 'cls', criterion = 'entropy', prune = 'depth', max_depth = 3)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)
    accuracy = np.mean(predictions == y_test)
    print(f"Accuracy: {accuracy * 100:.2f}%")

dataset_978_mfeat-factors.csv


InvalidIndexError: (slice(None, None, None), 0)

In [None]:
The following code works only for datasets with categorical

In [63]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from scipy.spatial import distance
import os 
import pandas as pd 
import numpy as np
from mla import knn
from mla.metrics.metrics import accuracy

base_dir = "data/raw/class_imbalance"
for filename in os.listdir(base_dir):
    print(filename)
    file_path = os.path.join(base_dir, filename)
    df = pd.read_csv(file_path)

    # Insert 0 in rows with NaN values
    newdf = df.fillna(0)
    #print("data length:", len(df))
    #print("new data length:", len(newdf))
    X = newdf.iloc[:, :-1]
    y = newdf.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

    clf = knn.KNNClassifier(k=5, distance_func=distance.euclidean)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(f"classification accuracy {accuracy(y_test, predictions)}\n")


dataset_978_mfeat-factors.csv
classification accuracy 0.9916666666666667

dataset_947_arsenic-male-bladder.csv
classification accuracy 0.9642857142857143

dataset_1004_synthetic_control.csv
classification accuracy 1.0

dataset_1056_mc1.csv
classification accuracy 0.9940140845070422

dataset_940_water-treatment.csv
classification accuracy 0.8490566037735849

dataset_950_arsenic-female-lung.csv
classification accuracy 0.9702380952380952

dataset_1014_analcatdata_dmft.csv


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [62]:
base_dir = "data/raw/class_imbalance"
for filename in os.listdir(base_dir):
    print(filename)
    file_path = os.path.join(base_dir, filename)
    df = pd.read_csv(file_path)

    # Insert 0 in rows with NaN values
    newdf = df.fillna(0)
    #print("data length:", len(df))
    #print("new data length:", len(newdf))
    X = newdf.iloc[:, :-1]
    y = newdf.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

    clf = knn.KNNClassifier(k=5, distance_func=distance.hamming)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(f"classification accuracy {accuracy(y_test, predictions)}\n")

dataset_978_mfeat-factors.csv
classification accuracy 0.9866666666666667

dataset_947_arsenic-male-bladder.csv
classification accuracy 0.9761904761904762

dataset_1004_synthetic_control.csv
classification accuracy 0.8277777777777777

dataset_1056_mc1.csv
classification accuracy 0.9943661971830986

dataset_940_water-treatment.csv
classification accuracy 0.8176100628930818

dataset_950_arsenic-female-lung.csv
classification accuracy 0.9821428571428571

dataset_1014_analcatdata_dmft.csv
classification accuracy 0.725

dataset_1039_hiva_agnostic.csv
classification accuracy 0.9637509850275807

dataset_1018_ipums_la_99-small.csv


KeyboardInterrupt: 

In [64]:
base_dir = "class_imbalance"
for filename in os.listdir(base_dir):
    print(filename)

dataset_978_mfeat-factors.csv
dataset_947_arsenic-male-bladder.csv
dataset_1004_synthetic_control.csv
dataset_1056_mc1.csv
dataset_940_water-treatment.csv
dataset_950_arsenic-female-lung.csv
dataset_1039_hiva_agnostic.csv
dataset_1045_kc1-top5.csv
dataset_1013_analcatdata_challenger.csv
dataset_450_analcatdata_lawsuit.csv
dataset_312_scene.csv
dataset_995_mfeat-zernike.csv
dataset_311_oil_spill.csv
dataset_980_optdigits.csv
dataset_987_collins.csv
dataset_1061_ar4.csv
dataset_962_mfeat-morphological.csv
dataset_951_arsenic-male-lung.csv
dataset_958_segment.csv
dataset_1064_ar6.csv
dataset_1050_pc3.csv
dataset_463_backache.csv
dataset_1049_pc4.csv
dataset_316_yeast_ml8.csv
dataset_949_arsenic-female-bladder.csv
dataset_1022_mfeat-pixel.csv
dataset_954_spectrometer.csv
dataset_971_mfeat-fourier.csv
dataset_1065_kc3.csv
dataset_1021_page-blocks.csv
dataset_1020_mfeat-karhunen.csv
dataset_1059_ar1.csv
dataset_984_analcatdata_draft.csv
dataset_976_JapaneseVowels.csv
