# Introduction to Machine Learning classification problem

- Part 1: Data Preprocessing uisng NumPy only
- Part 2: Import DecisionTree made from scratch using NumPy
- Part 3: Training and Testing phase

 ## Part 1: Data Preprocessing using NumPy only

In [121]:
import numpy as np

In [122]:
# read data from csv file
data = np.genfromtxt(fname='data/data.csv', dtype=str, delimiter=',', skip_header=1)
data

array([['0001_01', 'Europa', 'False', ..., '0.0', 'Maham Ofracculy',
        'False'],
       ['0002_01', 'Earth', 'False', ..., '44.0', 'Juanna Vines', 'True'],
       ['0003_01', 'Europa', 'False', ..., '49.0', 'Altark Susent',
        'False'],
       ...,
       ['9279_01', 'Earth', 'False', ..., '0.0', 'Fayey Connon', 'True'],
       ['9280_01', 'Europa', 'False', ..., '3235.0', 'Celeon Hontichre',
        'False'],
       ['9280_02', 'Europa', 'False', ..., '12.0', 'Propsh Hontichre',
        'True']], dtype='<U18')

In [123]:
# check data shape to have a rough idea of the data we are dealing with
data.shape

(8693, 14)

In [124]:
# store label column in y
y = data[:,-1]
y

array(['False', 'True', 'False', ..., 'True', 'False', 'True'],
      dtype='<U18')

In [125]:
# label encode y (True - 1, False - 0)
y = np.where(y=='True', 1., 0.)
y

array([0., 1., 0., ..., 1., 0., 1.])

In [126]:
# have a look at data from each column
for j in range(data.shape[1]):
    print(data[:5,j])

['0001_01' '0002_01' '0003_01' '0003_02' '0004_01']
['Europa' 'Earth' 'Europa' 'Europa' 'Earth']
['False' 'False' 'False' 'False' 'False']
['B/0/P' 'F/0/S' 'A/0/S' 'A/0/S' 'F/1/S']
['TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e']
['39.0' '24.0' '58.0' '33.0' '16.0']
['False' 'False' 'True' 'False' 'False']
['0.0' '109.0' '43.0' '0.0' '303.0']
['0.0' '9.0' '3576.0' '1283.0' '70.0']
['0.0' '25.0' '0.0' '371.0' '151.0']
['0.0' '549.0' '6715.0' '3329.0' '565.0']
['0.0' '44.0' '49.0' '193.0' '2.0']
['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' 'Solam Susent'
 'Willy Santantines']
['False' 'True' 'False' 'False' 'True']


In [127]:
# drop the label column
X = data[:,:-1]
for j in range(X.shape[1]):
    print(X[:5,j])

['0001_01' '0002_01' '0003_01' '0003_02' '0004_01']
['Europa' 'Earth' 'Europa' 'Europa' 'Earth']
['False' 'False' 'False' 'False' 'False']
['B/0/P' 'F/0/S' 'A/0/S' 'A/0/S' 'F/1/S']
['TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e']
['39.0' '24.0' '58.0' '33.0' '16.0']
['False' 'False' 'True' 'False' 'False']
['0.0' '109.0' '43.0' '0.0' '303.0']
['0.0' '9.0' '3576.0' '1283.0' '70.0']
['0.0' '25.0' '0.0' '371.0' '151.0']
['0.0' '549.0' '6715.0' '3329.0' '565.0']
['0.0' '44.0' '49.0' '193.0' '2.0']
['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' 'Solam Susent'
 'Willy Santantines']


In [128]:
# some of the features are categorical variables
# check which feature can be encoded and which feature we cannot
for j in range(X.shape[1]):
    x = X[:,j]
    print(f"col{j} - unique: {len(np.unique(x))}, value: {x[0]}")

col0 - unique: 8693, value: 0001_01
col1 - unique: 4, value: Europa
col2 - unique: 3, value: False
col3 - unique: 6561, value: B/0/P
col4 - unique: 4, value: TRAPPIST-1e
col5 - unique: 81, value: 39.0
col6 - unique: 3, value: False
col7 - unique: 1274, value: 0.0
col8 - unique: 1508, value: 0.0
col9 - unique: 1116, value: 0.0
col10 - unique: 1328, value: 0.0
col11 - unique: 1307, value: 0.0
col12 - unique: 8474, value: Maham Ofracculy


In [129]:
# random: I found out that this result in number, so take note
float('0001_01')

101.0

In [130]:
# drop the Id feature because it is just an identifier
X = X[:,1:]

for j in range(X.shape[1]):
    x = X[:,j]
    print(f"col{j} - unique: {len(np.unique(x))}, value: {x[0]}")

col0 - unique: 4, value: Europa
col1 - unique: 3, value: False
col2 - unique: 6561, value: B/0/P
col3 - unique: 4, value: TRAPPIST-1e
col4 - unique: 81, value: 39.0
col5 - unique: 3, value: False
col6 - unique: 1274, value: 0.0
col7 - unique: 1508, value: 0.0
col8 - unique: 1116, value: 0.0
col9 - unique: 1328, value: 0.0
col10 - unique: 1307, value: 0.0
col11 - unique: 8474, value: Maham Ofracculy


In [131]:
# create an array of unique, mode feature pairs
uni_ftrs = np.empty((0,2))
for j in range(X.shape[1]):
    x = X[:,j]
    uni_ftrs = np.append(uni_ftrs, np.reshape(np.array([len(np.unique(x)), x[0]]), (1,2)), axis=0)
    # print(f"col{j} - unique: {len(np.unique(x))}, value: {x[0]}")
uni_ftrs

array([['4', 'Europa'],
       ['3', 'False'],
       ['6561', 'B/0/P'],
       ['4', 'TRAPPIST-1e'],
       ['81', '39.0'],
       ['3', 'False'],
       ['1274', '0.0'],
       ['1508', '0.0'],
       ['1116', '0.0'],
       ['1328', '0.0'],
       ['1307', '0.0'],
       ['8474', 'Maham Ofracculy']], dtype='<U32')

In [132]:
# we do not care about numerical features for now
# I will set the boundary to 10 (can be anything)
# if unique >= 10 && feature is categorical, drop the feature
# but first, get a list of feature indices that we want to drop

col_idxs_todrop = np.empty((1,0))
i = 0
for uni, ftr in uni_ftrs:
    try:
        float(ftr)
        pass
    except ValueError:
        if float(uni) >= 10:
            col_idxs_todrop = np.append(col_idxs_todrop, np.array([i]))
    i+=1
    # print(i)
col_idxs_todrop

array([ 2., 11.])

In [133]:
# drop those features
X = np.delete(X, col_idxs_todrop.astype(int), axis=1)
X.shape

(8693, 10)

In [134]:
#double check
for j in range(X.shape[1]):
    print(X[:5,j])

['Europa' 'Earth' 'Europa' 'Europa' 'Earth']
['False' 'False' 'False' 'False' 'False']
['TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e' 'TRAPPIST-1e']
['39.0' '24.0' '58.0' '33.0' '16.0']
['False' 'False' 'True' 'False' 'False']
['0.0' '109.0' '43.0' '0.0' '303.0']
['0.0' '9.0' '3576.0' '1283.0' '70.0']
['0.0' '25.0' '0.0' '371.0' '151.0']
['0.0' '549.0' '6715.0' '3329.0' '565.0']
['0.0' '44.0' '49.0' '193.0' '2.0']


In [135]:
# now that we are left with features we want to use...
# let us first check if there are any missing values (99% of the time there will be)
for j in range(X.shape[1]):
    print(np.unique(X[:,j]))

['' 'Earth' 'Europa' 'Mars']
['' 'False' 'True']
['' '55 Cancri e' 'PSO J318.5-22' 'TRAPPIST-1e']
['' '0.0' '1.0' '10.0' '11.0' '12.0' '13.0' '14.0' '15.0' '16.0' '17.0'
 '18.0' '19.0' '2.0' '20.0' '21.0' '22.0' '23.0' '24.0' '25.0' '26.0'
 '27.0' '28.0' '29.0' '3.0' '30.0' '31.0' '32.0' '33.0' '34.0' '35.0'
 '36.0' '37.0' '38.0' '39.0' '4.0' '40.0' '41.0' '42.0' '43.0' '44.0'
 '45.0' '46.0' '47.0' '48.0' '49.0' '5.0' '50.0' '51.0' '52.0' '53.0'
 '54.0' '55.0' '56.0' '57.0' '58.0' '59.0' '6.0' '60.0' '61.0' '62.0'
 '63.0' '64.0' '65.0' '66.0' '67.0' '68.0' '69.0' '7.0' '70.0' '71.0'
 '72.0' '73.0' '74.0' '75.0' '76.0' '77.0' '78.0' '79.0' '8.0' '9.0']
['' 'False' 'True']
['' '0.0' '1.0' ... '994.0' '995.0' '999.0']
['' '0.0' '1.0' ... '9965.0' '997.0' '999.0']
['' '0.0' '1.0' ... '99.0' '991.0' '994.0']
['' '0.0' '1.0' ... '994.0' '995.0' '998.0']
['' '0.0' '1.0' ... '994.0' '995.0' '998.0']


In [136]:
# we will handle the missing values using Imputation by Mode
# basically replace the empty string ("") by the mode of respective feature
# the function is defined as follow
# note that sometimes the empty string ("") is the mode,
# so a conditional statement is made to handle this case

def mode_at_col(col):
    vals, counts = np.unique(col, return_counts=True)
    sorted_idxs = np.argsort(counts)
    if vals[sorted_idxs[-1]] == '':
        return vals[sorted_idxs[-2]]
    else:
        return vals[sorted_idxs[-1]]

In [137]:
# test mode_at_col with dummy data
tmp = np.array(['1','','','2','2','1','','3','','2','4',''])
mode_at_col(tmp)

'2'

In [138]:
# to have an insight why handling missing values is important, 
# see how many missing values we have
for j in range(X.shape[1]):
    X_j = X[:,j]
    print(np.unique(X_j=='', return_counts=True))

(array([False,  True]), array([8492,  201]))
(array([False,  True]), array([8476,  217]))
(array([False,  True]), array([8511,  182]))
(array([False,  True]), array([8514,  179]))
(array([False,  True]), array([8490,  203]))
(array([False,  True]), array([8512,  181]))
(array([False,  True]), array([8510,  183]))
(array([False,  True]), array([8485,  208]))
(array([False,  True]), array([8510,  183]))
(array([False,  True]), array([8505,  188]))


In [139]:
# replace '' with the corresponding mode
for j in range(X.shape[1]):
    X_j = X[:,j]
    X_j[X_j==''] = mode_at_col(X_j)
    
# now let us check again
for j in range(X.shape[1]):
    X_j = X[:,j]
    print(np.unique(X_j=='', return_counts=True))

(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))
(array([False]), array([8693]))


In [140]:
# nice! now we got rid of missing values
np.unique(X=='')

array([False])

In [141]:
# now we gonna have to one-hot encode categorical feature 
# one-hot encode function is defined as follows
# it converts a column of categorical feature to 2d np array

def one_hot_encode(col):
    unique_vals = np.unique(col)
    ohe = np.zeros((len(col), len(unique_vals)))
    for i in range(len(col)):
        ohe[i][unique_vals==col[i]]=1
    return ohe

# random: i got some weird stuff towards the end when using np.empty, 
# 0 becomes very small value that is not 0 for some reason (not sure yet why)

In [142]:
# as usual, test it first
tmp = np.array(['a','a','c','b','b'])
tmp2 = one_hot_encode(tmp)
tmp2

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [143]:
# nice, the function works!
# now convert our string categorical column (meh) 
# to float 2d np array (yay)

X_num = np.empty((X.shape[0],0))
for j in range(X.shape[1]):
    X_j = X[:,j]
    try:
        X_j = np.reshape(X_j.astype(float), (X_j.astype(float).shape[0],1))
        X_num = np.append(X_num, X_j, axis=1)
    except ValueError:
        X_num = np.append(X_num, one_hot_encode(X_j), axis=1)

In [144]:
# double check
for j in range(X_num.shape[1]):
    print(X_num[:5,j])

[0. 1. 0. 0. 1.]
[1. 0. 1. 1. 0.]
[0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1.]
[39. 24. 58. 33. 16.]
[1. 1. 0. 1. 1.]
[0. 0. 1. 0. 0.]
[  0. 109.  43.   0. 303.]
[   0.    9. 3576. 1283.   70.]
[  0.  25.   0. 371. 151.]
[   0.  549. 6715. 3329.  565.]
[  0.  44.  49. 193.   2.]


In [145]:
# and check the shape
# think about why this is expected
X_num.shape

(8693, 16)

In [146]:
# I will use X from now on
X = X_num
for j in range(X.shape[1]):
    print(X[:5,j])

[0. 1. 0. 0. 1.]
[1. 0. 1. 1. 0.]
[0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1.]
[39. 24. 58. 33. 16.]
[1. 1. 0. 1. 1.]
[0. 0. 1. 0. 0.]
[  0. 109.  43.   0. 303.]
[   0.    9. 3576. 1283.   70.]
[  0.  25.   0. 371. 151.]
[   0.  549. 6715. 3329.  565.]
[  0.  44.  49. 193.   2.]


In [147]:
# now we have X and y, if we were to have a model now, we can split and train and test!
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (8693, 16), y shape: (8693,)


In [148]:
# now that we have our dataset in a good format, we can split it
# into training and testing

train_ratio = 0.8

# calculate number of samples for training and testing sets
n_samples = X.shape[0]
train_size = int(train_ratio * n_samples)
test_size = n_samples - train_size

# shuffle the data
idxs = np.random.permutation(n_samples)
X = X[idxs]
y = y[idxs]

# split the data into training and testing sets
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# print the shape of each set
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# End of Part 1

X_train shape: (6954, 16)
y_train shape: (6954,)
X_test shape: (1739, 16)
y_test shape: (1739,)


 ## Part 2: Import DecisionTree made from scratch using NumPy


In [149]:
# see how DecisionTree is implemented in decision_tree.py file

from decision_tree import DecisionTree

 ## Part 3: Training and Testing phase

In [150]:
# initialised the DecisionTree
# set the maximum depth and minimum samples per node
# max_depth = 8 gave the best performance

dt = DecisionTree(max_depth=8, min_samples=10)

# training
dt.fit(X_train, y_train)

# testing
y_pred = dt.predict(X_test)

# accuracy function
def acc(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

acc = acc(y_test, y_pred)
print(f"Accuracy : {acc}")

# End of Part 3

Accuracy : 0.7883841288096607
