In [14]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC, NuSVC, SVC

In [None]:
# Dataset selection: the class count and cut count pick which CSV is loaded,
# and PREDICT_V names the label column.
PREDICT_V = 'ClassId'
N_CLASSES, CUTS = 10, 10
FILE_NAME = f'./data_{N_CLASSES}_{CUTS}_cuts.csv'

# Load the selected dataset and echo its path so the output records the source.
data = pd.read_csv(FILE_NAME)
print(FILE_NAME)

## **DATA PREPROCESSING**

In [9]:
def get_normalize(data):
    """Split a dataset frame into standardized features and the label column.

    The slicing fixes the expected layout: every column except the last two
    is a feature, the second-to-last column is the label, and the last column
    is ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric feature columns followed by the label column and
        one trailing column.

    Returns
    -------
    X : pandas.DataFrame
        Features z-scored column-wise using the mean and sample standard
        deviation of all rows in ``data``.
    Y : pandas.DataFrame
        Single-column frame holding the label.

    Notes
    -----
    The statistics are computed over every row passed in, so a train/test
    split made afterwards shares normalization statistics between both parts.
    """
    X = data.iloc[:, :-2]        # feature columns

    spread = X.std()
    # A constant column has zero std (and a single-row frame has NaN std);
    # dividing by it would fill the column with NaN/inf. Use 1 instead so such
    # columns become all zeros after centering.
    spread = spread.where(spread > 0, 1.0)
    X = (X - X.mean()) / spread  # normalize data

    Y = data.iloc[:, -2:-1]      # predict variable column
    return X, Y

In [None]:
# Build the standardized feature frame X and the single-column label frame Y
# from the dataset loaded above.
X, Y = get_normalize(data)

### **DATA REPORTING**

In [3]:
def data_reporting(X, Y, n_classes=None):
    """Print the number of examples and the per-class counts and shares.

    Parameters
    ----------
    X : pandas.DataFrame or array-like with ``.shape``
        Feature matrix; only its row count is used.
    Y : pandas.DataFrame, pandas.Series or array-like
        Labels for the rows of ``X``. Counts are taken from this argument
        rather than from any module-level frame.
    n_classes : int, optional
        Class ids ``0 .. n_classes - 1`` are reported. Defaults to the
        module-level ``N_CLASSES``.
    """
    if n_classes is None:
        n_classes = N_CLASSES

    n_examples = X.shape[0]
    labels = np.asarray(Y).ravel()

    print(f'Training examples: {n_examples}')
    for class_id in range(n_classes):
        count = int((labels == class_id).sum())
        # Avoid dividing by zero when an empty dataset is reported.
        share = round(count * 100 / n_examples) if n_examples else 0
        print(f'\tClass-id {class_id} : {count} \t{share} %')

In [None]:
# data_reporting(X, Y)

In [4]:
def K_Fold_Validation(classifier, X, Y, folds=10):
    """Print the mean accuracy and error of ``classifier`` over K folds.

    Parameters
    ----------
    classifier : sklearn estimator
        Unfitted or fitted estimator; a fresh clone is trained on each fold,
        so the object passed in is left untouched.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame
        Label frame with the same row order as ``X``.
    folds : int, default 10
        Number of consecutive, unshuffled folds.
    """
    kf = KFold(n_splits=folds)
    fold_scores = []

    for train_index, test_index in kf.split(X):
        # KFold yields row positions, not index labels, so select with .iloc;
        # .loc would pick the wrong rows on any non-default index.
        model = clone(classifier)
        model.fit(X.iloc[train_index], Y.iloc[train_index].values.ravel())
        predict = model.predict(X.iloc[test_index])

        fold_scores.append(accuracy_score(Y.iloc[test_index], predict))

    k_fold_mean_score = sum(fold_scores) / len(fold_scores)
    print(f'k-fold mean error:   {1 - k_fold_mean_score}')
    print(f'k-fold mean score:   {k_fold_mean_score}')

In [6]:
def Bootstrap_Validation(classifier, X, Y, K=7, random_state=None):
    """Print the mean accuracy and error over fold-level bootstrap rounds.

    The rows are cut into ``E = 10`` consecutive folds. Each of the ``E``
    rounds draws ``K`` folds with replacement for training (a fold drawn more
    than once contributes its rows more than once) and evaluates on the folds
    that were not drawn.

    Parameters
    ----------
    classifier : sklearn estimator
        A fresh clone is trained in every round, so the object passed in is
        left untouched.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame
        Label frame with the same row order as ``X``.
    K : int, default 7
        Number of folds drawn (with replacement) for training in each round.
    random_state : int, optional
        Seed for the fold draws. When omitted, the global ``np.random`` state
        is used.

    Raises
    ------
    ValueError
        If every round drew all folds, leaving nothing to evaluate on.
    """
    E = 10
    # Keep the folds in a plain list: wrapping equal-sized folds in an object
    # array would silently produce a 2-D array and object-dtype indices.
    fold_positions = [test_index for _, test_index in KFold(n_splits=E).split(X)]

    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    round_scores = []
    for _ in range(E):
        idx = sampler.choice(E, K, replace=True)
        drawn = set(idx.tolist())
        not_idx = [fold for fold in range(E) if fold not in drawn]
        if not not_idx:
            # Every fold was drawn for training; nothing is left to test on.
            continue

        train_idx = np.concatenate([fold_positions[fold] for fold in idx])
        test_idx = np.concatenate([fold_positions[fold] for fold in not_idx])

        # Fold members are row positions, so select with .iloc rather than .loc.
        model = clone(classifier)
        model.fit(X.iloc[train_idx], Y.iloc[train_idx].values.ravel())
        predict = model.predict(X.iloc[test_idx])

        round_scores.append(accuracy_score(Y.iloc[test_idx], predict))

    if not round_scores:
        raise ValueError('no bootstrap round left any fold out for testing; use a smaller K')

    bootstrap = sum(round_scores) / len(round_scores)
    print(f'bootstrap mean error:   {1 - bootstrap}')
    print(f'bootstrap mean score:   {bootstrap}')

In [18]:
class Classifier():
    """Run the hold-out, K-fold and bootstrap evaluations for one dataset file.

    Parameters
    ----------
    model : sklearn estimator
        Estimator to evaluate. It is fitted on the hold-out training split;
        the cross-validation helpers receive clones, so they do not overwrite
        that fit.
    dataset : str
        Path of the CSV file to load.
    random_state : int, optional
        Seed for the row shuffle and the hold-out split. When omitted, both
        are unseeded.
    """

    def __init__(self, model, dataset, random_state=None):
        self.file_name = dataset
        self.random_state = random_state
        # Shuffle the rows and reset the index before normalizing.
        shuffled = (
            pd.read_csv(dataset)
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )
        self.X, self.Y = get_normalize(shuffled)
        self.model = model

    def experiment(self):
        """Fit on a 70/30 hold-out split, then print all three scores."""
        print(f'\t {type(self.model)}')
        # The stored name already carries its extension.
        print(f'\t....Working on {self.file_name}')
        PRODUCTION_MODEL = self.model
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, random_state=self.random_state
        )
        PRODUCTION_MODEL.fit(X_train, Y_train.values.ravel())

        print(f'accuracy score: {PRODUCTION_MODEL.score(X_test, Y_test)}')

        # Hand clones to the validators so the hold-out fit above is not
        # replaced by whichever fold or round happens to be trained last.
        K_Fold_Validation(clone(self.model), self.X, self.Y)
        Bootstrap_Validation(clone(self.model), self.X, self.Y)
        


In [None]:
# Baseline: fit a linear SVM on a 70/30 hold-out split. The split is seeded so
# the reported score is the same on every Restart & Run All.
model = LinearSVC(dual=False)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
model.fit(X_train, Y_train.values.ravel())
acc_score = model.score(X_test, Y_test)

In [None]:
# Report the hold-out accuracy of the baseline model fitted above.
print(f'accuracy score: {acc_score}')

In [None]:
# Confusion matrix on the hold-out split. Passing labels=model.classes_ keeps
# the rows/columns aligned with the display labels even when a class is
# missing from the test split.
cm = confusion_matrix(Y_test, model.predict(X_test), labels=model.classes_)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

# Row-normalized version: each row sums to 1, i.e. shares of the true class.
cm_normalize = normalize(cm, norm='l1')
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalize, display_labels=model.classes_)
disp.plot()

# Per-class report: share of each true class that was predicted correctly,
# printed as a percentage to match the '%' suffix.
row_totals = cm.sum(axis=1)
for row, class_id in enumerate(model.classes_):
    if row_totals[row] == 0:
        # No test samples of this class, so the share is undefined.
        print(f'\taccuracy on class {class_id}-th: n/a (no test samples)')
        continue
    print(f'\taccuracy on class {class_id}-th: {round(100 * cm[row, row] / row_totals[row], 3)} %')

In [None]:
# Validate a fresh LinearSVC on X, Y in their current row order; each helper
# prints its own mean error and mean score.
K_Fold_Validation(LinearSVC(dual=False), X, Y)
Bootstrap_Validation(LinearSVC(dual=False), X, Y)

In [None]:
# Shuffle the rows (seeded, so a fresh run reproduces the same order) and
# rebuild X and Y through the shared helper instead of repeating its slicing
# and normalization inline.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X, Y = get_normalize(data)

In [None]:
# Repeat both validations on the shuffled X, Y built in the previous cell.
K_Fold_Validation(LinearSVC(dual=False), X, Y)
Bootstrap_Validation(LinearSVC(dual=False), X, Y)

In [20]:
# Dataset grid: class counts and cut counts used to build the CSV file names.
class_list = [5, 10]
cut_list = [20, 10, 5, 4, 3, 2]

# Estimators to compare; each instance is passed to Classifier as-is.
models = [KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', leaf_size=30), LinearSVC(dual=False), SVC(kernel='poly'), SVC(kernel='linear')]

for m in models:
    for c in class_list:
        for n_cut in cut_list:
            engine = Classifier(model=m, dataset=f'./data_{c}_{n_cut}_cuts.csv')
            engine.experiment()
            # NOTE(review): this break stops after the first cut value (20), so
            # the remaining entries of cut_list are never run — confirm intent.
            break
        # NOTE(review): this break stops after the first class count (5), so
        # each model is evaluated on ./data_5_20_cuts.csv only — confirm intent.
        break

	 <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
	....Working on ./data_5_20_cuts.csv .csv file
accuracy score: 0.7501924557351809
k-fold mean error:   0.21323499179003869
k-fold mean score:   0.7867650082099613
bootstrap mean error:   0.28006894054086184
bootstrap mean score:   0.7199310594591382
	 <class 'sklearn.svm._classes.LinearSVC'>
	....Working on ./data_5_20_cuts.csv .csv file
accuracy score: 0.5354118552732872
k-fold mean error:   0.4595106062021921
k-fold mean score:   0.5404893937978079
bootstrap mean error:   0.4632363185485373
bootstrap mean score:   0.5367636814514627
	 <class 'sklearn.svm._classes.SVC'>
	....Working on ./data_5_20_cuts.csv .csv file
accuracy score: 0.43033102386451116
k-fold mean error:   0.5568902268085277
k-fold mean score:   0.44310977319147227
bootstrap mean error:   0.5625644242105409
bootstrap mean score:   0.43743557578945913
	 <class 'sklearn.svm._classes.SVC'>
	....Working on ./data_5_20_cuts.csv .csv file
accuracy score: 0.58