## Proyecto 1 - IA

### Libraries

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Notebook configuration: which dataset file to load and which column is the target.
N_CLASSES = 10          # class ids are enumerated as 0 .. N_CLASSES-1 when reporting
CUTS = 10               # cut count encoded in the dataset file name
PREDICT_V = 'ClassId'   # name of the column holding the class id

# The dataset files sit next to the notebook (same './data_<classes>_<cuts>_cuts.csv'
# pattern the experiment loop uses); the previous '.data/' prefix pointed at a
# different directory than the one shown in this cell's recorded output.
FILE_NAME = f'./data_{N_CLASSES}_{CUTS}_cuts.csv'
data = pd.read_csv(FILE_NAME)
print(f'{FILE_NAME}')

./data_10_10_cuts.csv


## **DATA PREPROCESSING**

In [3]:
def get_normalize(data):
    """Split a dataset frame into standardized features and the target column.

    Every column except the last two is treated as a feature and z-scored
    column by column (subtract the column mean, divide by the column standard
    deviation as computed by pandas). The second-to-last column is returned
    as the target; the last column is not used.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose layout is ``[features..., target, <unused last column>]``.

    Returns
    -------
    X : pandas.DataFrame
        Standardized feature columns, same index as ``data``.
    Y : pandas.DataFrame
        Single-column frame with the target values, same index as ``data``.
    """
    features = data.iloc[:, :-2]        # feature columns

    spread = features.std()
    # A constant column has zero spread; dividing by it would turn the whole
    # column into NaN (0/0) and break any estimator fitted on X. Dividing by 1
    # instead leaves such a column as all zeros.
    spread = spread.mask(spread == 0, 1)

    X = (features - features.mean()) / spread   # normalize data
    Y = data.iloc[:, -2:-1]                     # predict variable column
    return X, Y

### **DATA REPORTING**

In [5]:
def data_reporting(X, Y):
    """Print the number of examples and how they are distributed over the classes.

    Counts are taken from the ``Y`` argument (not from a notebook-level
    variable), so the report always describes the data that was passed in.

    Parameters
    ----------
    X : pandas.DataFrame or array-like with ``.shape``
        Feature matrix; only its number of rows is used.
    Y : pandas.DataFrame, pandas.Series or array-like
        Class id of every example. It is flattened to one dimension and
        compared against the integers ``0 .. N_CLASSES-1``.
    """
    n = X.shape[0]
    print(f'Training examples: {n}')

    labels = np.asarray(Y).ravel()
    for i in range(N_CLASSES):
        r = int(np.count_nonzero(labels == i))
        # With no examples there is nothing to take a share of; report 0 %
        # instead of dividing by zero.
        share = round(r*100/n) if n else 0
        print(f'\tClass-id {i} : {r} \t{share} %')

In [6]:
# Build the inputs in this cell so it runs on a fresh kernel instead of relying
# on X and Y left over from a cell that is not part of the notebook.
X, Y = get_normalize(data)
data_reporting(X, Y)

Training examples: 16237
	Class-id 0 : 195 	1 %
	Class-id 1 : 2158 	13 %
	Class-id 2 : 2466 	15 %
	Class-id 3 : 1599 	10 %
	Class-id 4 : 2239 	14 %
	Class-id 5 : 2210 	14 %
	Class-id 6 : 522 	3 %
	Class-id 7 : 1640 	10 %
	Class-id 8 : 1648 	10 %
	Class-id 9 : 1560 	10 %


In [7]:
def K_Fold_Validation(classifier, X, Y, folds=10):
    """Estimate the classification error of ``classifier`` with k-fold cross-validation.

    The rows are split into ``folds`` consecutive partitions (no shuffling is
    done here). For each partition an unfitted copy of ``classifier`` is
    trained on the remaining rows and scored on the held-out ones. The mean
    error (1 - mean accuracy) is printed and returned.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Template estimator; it is cloned for every fold and never fitted itself.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame
        Target column, row-aligned with ``X``.
    folds : int, default 10
        Number of partitions.

    Returns
    -------
    float
        Mean error over the folds.
    """
    kf = KFold(n_splits=folds)

    fold_scores = []
    for train_index, test_index in kf.split(X):
        # clone() gives a fresh estimator with the same hyper-parameters, so
        # the object passed in by the caller is not refitted behind its back.
        model = clone(classifier)

        # KFold yields row positions, not index labels: .iloc keeps this
        # correct even when X/Y do not carry a default 0..n-1 index.
        model.fit(X.iloc[train_index], Y.iloc[train_index].values.ravel())
        predict = model.predict(X.iloc[test_index])

        fold_scores.append(accuracy_score(Y.iloc[test_index], predict))

    k_fold_mean_score = sum(fold_scores) / folds
    k_fold_mean_error = 1 - k_fold_mean_score
    print(f'k-fold mean error:   {k_fold_mean_error}')
    return k_fold_mean_error

In [8]:
def Bootstrap_Validation(classifier, X, Y, K=7, E=10, random_state=None):
    """Estimate the classification error with a block-level bootstrap.

    The rows are cut into ``E`` consecutive blocks. In each of ``E`` rounds,
    ``K`` blocks are drawn with replacement to form the training set (a block
    drawn twice appears twice) and every block that was not drawn forms the
    test set. The mean error (1 - mean accuracy) over the rounds is printed
    and returned.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Template estimator; it is cloned for every round and never fitted itself.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame
        Target column, row-aligned with ``X``.
    K : int, default 7
        Number of blocks drawn (with replacement) for training in each round.
    E : int, default 10
        Number of blocks and number of rounds.
    random_state : int or None, default None
        Seed for the block draws. ``None`` keeps using NumPy's global random
        state, as before.

    Returns
    -------
    float
        Mean error over the rounds that had a non-empty test set.

    Raises
    ------
    ValueError
        If every round drew all ``E`` blocks, leaving nothing to test on.
    """
    kf = KFold(n_splits=E)
    # Only the held-out side of each split is needed: together they cut the
    # row positions into E disjoint blocks.
    blocks = [np.asarray(test_index) for _, test_index in kf.split(X)]

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    scores = []
    for _ in range(E):
        idx = rng.choice(E, K, replace=True)
        drawn = set(idx.tolist())
        not_idx = [b for b in range(E) if b not in drawn]
        if not not_idx:
            # All blocks ended up in the training draw (possible when K is
            # large): there is no held-out data to score this round on.
            continue

        train_idx = np.concatenate([blocks[b] for b in idx])
        test_idx = np.concatenate([blocks[b] for b in not_idx])

        # Fresh estimator per round so the caller's object is left untouched.
        model = clone(classifier)
        # The blocks hold row positions, hence .iloc rather than .loc.
        model.fit(X.iloc[train_idx], Y.iloc[train_idx].values.ravel())
        predict = model.predict(X.iloc[test_idx])

        scores.append(accuracy_score(Y.iloc[test_idx], predict))

    if not scores:
        raise ValueError('every bootstrap round used all blocks for training; lower K or raise E')

    bootstrap = sum(scores) / len(scores)
    bootstrap_error = 1 - bootstrap
    print(f'bootstrap mean error:   {bootstrap_error}')
    return bootstrap_error

In [9]:
class Classifier():
    """Run one model against one dataset file and print its error estimates.

    On construction the CSV file is read, its rows are shuffled and the frame
    is split into standardized features and target with ``get_normalize``.
    ``experiment`` then prints the train/test error of a 70/30 hold-out split
    followed by the k-fold and bootstrap error estimates.

    Parameters
    ----------
    model : scikit-learn estimator
        Estimator to evaluate. It is fitted on the hold-out training split by
        ``experiment`` and stays fitted on that split afterwards.
    dataset : str
        Path of the CSV file to load.
    random_state : int or None, default None
        Seed for the row shuffle and the hold-out split. ``None`` keeps both
        unseeded, as before.
    """

    def __init__(self, model, dataset, random_state=None):
        self.file_name = dataset
        self.random_state = random_state
        shuffled = (
            pd.read_csv(dataset)
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )
        self.X, self.Y = get_normalize(shuffled)
        self.model = model

    def experiment(self):
        """Fit the model on a hold-out split and print all error estimates."""
        print(f'\t {type(self.model)}')
        print(f'\t....Working on {self.file_name} .csv file')
        PRODUCTION_MODEL = self.model
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, random_state=self.random_state
        )
        PRODUCTION_MODEL.fit(X_train, Y_train.values.ravel())

        print(f'train error: {1 - PRODUCTION_MODEL.score(X_train, Y_train)}')
        print(f'test error: {1 - PRODUCTION_MODEL.score(X_test, Y_test)}')

        # The validation helpers fit whatever estimator they receive. Hand them
        # unfitted copies so PRODUCTION_MODEL keeps the fit from the training
        # split and is not overwritten by the last validation round.
        K_Fold_Validation(clone(self.model), self.X, self.Y)
        Bootstrap_Validation(clone(self.model), self.X, self.Y)

        # CONFUSION MATRIX (disabled)
        # cm = confusion_matrix(Y_test, PRODUCTION_MODEL.predict(X_test))

        # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=PRODUCTION_MODEL.classes_)
        # disp.plot()

        # cm_normalize = normalize(cm, norm='l1')
        # disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalize, display_labels=PRODUCTION_MODEL.classes_)
        # disp.plot()

        # # ACCURACY REPORT FOR EACH CLASS
        # arr = np.array(cm)
        # for i in range(N_CLASSES):
        #     print(f'\taccuracy on class {i}-th: {round(arr[i,i]/np.sum(arr[i,:]), 5)} %')
        


In [16]:
class_list = [10, 4]
cut_list = [10, 5, 4, 3, 2, 20]

# False: quick run, each model is evaluated on the first (classes, cuts)
# combination only. True: sweep every combination of class_list x cut_list.
RUN_ALL = False

# , LinearSVC(dual=False), SVC(), SVC(kernel='poly'), SVC(kernel='linear')
models = [KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', leaf_size=30)]

combos = [(c, n_cut) for c in class_list for n_cut in cut_list]
if not RUN_ALL:
    combos = combos[:1]

for m in models:
    for c, n_cut in combos:
        engine = Classifier(model=m, dataset=f'./data_{c}_{n_cut}_cuts.csv')
        engine.experiment()

	 <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
	....Working on ./data_10_10_cuts.csv .csv file
train error: 0.17976242850857893
