## **Proyecto 1 - IA**

**Libraries** 

In [98]:
%matplotlib inline


import contextlib
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC, NuSVC, SVC

# Handle to the stdout stream that is active when this cell runs, so that
# sys.stdout can be put back after it has been temporarily redirected.
original_stdout = sys.stdout

## **DATA PREPROCESSING**

In [99]:
def get_normalize(data):
    """Split a raw table into standardized features and the target column.

    The last two columns are handled specially: the second-to-last column is
    returned as the target and the last column is dropped. Every remaining
    column is treated as a (numeric) feature and z-scored column-wise using
    the sample standard deviation (pandas default, ``ddof=1``).

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns first, then the target column, then one trailing
        column that is ignored.

    Returns
    -------
    X : pandas.DataFrame
        Standardized features, same index and column names as the input.
    Y : pandas.DataFrame
        Single-column frame holding the target values (not a Series).
    """
    features = data.iloc[:, :-2]
    target = data.iloc[:, -2:-1]

    centered = features - features.mean()
    spread = features.std()
    # A constant column has std == 0 and a single-row frame has std == NaN.
    # Dividing by either fills the column with NaN/inf, which scikit-learn
    # estimators refuse to fit. Such columns carry no information, so scale
    # them by 1 and leave them at their centered value of 0.
    spread = spread.where(spread > 0, 1.0)

    return centered / spread, target

### **DATA ABSTRACT**

In [100]:
# Name of the target column inside the frame returned as Y by get_normalize.
PREDICT_V = 'ClassId'


def data_abstract(Y):
    """Print the number of examples and how they are spread over the classes.

    One line is printed per class label that actually occurs in ``Y``, in
    ascending label order, with its absolute count and its share of the total
    rounded to a whole percent.

    Parameters
    ----------
    Y : pandas.DataFrame
        Frame containing the ``PREDICT_V`` column with the class labels.

    Returns
    -------
    None
        The summary is written to ``sys.stdout`` via ``print``.
    """
    total = len(Y)
    print(f'Training examples: {total}')

    # Count the labels that are really present instead of assuming the ids are
    # exactly 0..k-1; otherwise labels such as 1..k or non-contiguous ids would
    # report a phantom class 0 and silently drop the largest label.
    per_class = Y[PREDICT_V].value_counts().sort_index()
    for label, count in per_class.items():
        count = int(count)  # plain int keeps round() output a Python int
        print(f'\tClass-id {label} : {count} \t{round(count*100/total)} %')

In [101]:
def K_Fold_Validation(classifier, X, Y, folds=10):
    """Estimate the classification error with (unshuffled) K-fold validation.

    For every fold a fresh copy of ``classifier`` is fitted on the remaining
    folds and scored on the held-out fold; the mean error over all folds is
    printed and returned.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Template estimator. It is cloned for every fold, so the object passed
        in is never fitted or otherwise modified by this function.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame or pandas.Series
        Target values, row-aligned with ``X``.
    folds : int, default 10
        Number of folds handed to ``KFold``.

    Returns
    -------
    float
        Mean error, i.e. ``1 - mean accuracy`` over the folds.
    """
    splitter = KFold(n_splits=folds)
    accuracy_sum = 0.0

    for train_pos, test_pos in splitter.split(X):
        # KFold yields positional indices, so they must be used with .iloc;
        # .loc would only work by accident on a default 0..n-1 index.
        model = clone(classifier)
        model.fit(X.iloc[train_pos], Y.iloc[train_pos].values.ravel())
        predicted = model.predict(X.iloc[test_pos])

        accuracy_sum += accuracy_score(Y.iloc[test_pos].values.ravel(), predicted)

    k_fold_mean_score = accuracy_sum / folds
    mean_error = 1 - k_fold_mean_score
    print(f'k-fold mean error:   {mean_error}')
    return mean_error

In [102]:
def Bootstrap_Validation(classifier, X, Y, K=7, n_blocks=10):
    """Estimate the classification error with a block bootstrap.

    The rows are partitioned into ``n_blocks`` blocks (the test folds of an
    unshuffled ``KFold``). In each of ``n_blocks`` rounds, ``K`` blocks are
    drawn with replacement to form the training set, and the blocks that were
    not drawn form the test set. The mean error over the rounds is printed and
    returned.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Template estimator. It is cloned for every round, so the object passed
        in is never fitted or otherwise modified by this function.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame or pandas.Series
        Target values, row-aligned with ``X``.
    K : int, default 7
        Number of blocks drawn (with replacement) for training in each round.
    n_blocks : int, default 10
        Number of blocks the data is split into, and number of rounds.

    Returns
    -------
    float
        Mean error, i.e. ``1 - mean accuracy`` over the evaluated rounds.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1, or if no round left any block out of the
        training draw (so nothing could be evaluated).

    Notes
    -----
    Block draws use the global NumPy random state (``np.random.choice``).
    """
    if K < 1:
        raise ValueError('K must be at least 1')

    # Keep the blocks in a plain list: an object-dtype ndarray silently turns
    # 2-D when every block happens to have the same length.
    blocks = [test_pos for _, test_pos in KFold(n_splits=n_blocks).split(X)]
    all_blocks = np.arange(n_blocks)

    accuracy_sum = 0.0
    rounds_scored = 0
    for _ in range(n_blocks):
        drawn = np.random.choice(n_blocks, K, replace=True)
        held_out = np.setdiff1d(all_blocks, drawn)
        if held_out.size == 0:
            # Every block was drawn for training; there is nothing to test on.
            continue

        train_pos = np.concatenate([blocks[b] for b in drawn])
        test_pos = np.concatenate([blocks[b] for b in held_out])

        # Positional indices -> .iloc; fresh clone so rounds do not share state.
        model = clone(classifier)
        model.fit(X.iloc[train_pos], Y.iloc[train_pos].values.ravel())
        predicted = model.predict(X.iloc[test_pos])

        accuracy_sum += accuracy_score(Y.iloc[test_pos].values.ravel(), predicted)
        rounds_scored += 1

    if rounds_scored == 0:
        raise ValueError('no bootstrap round had out-of-bag blocks to evaluate; lower K')

    bootstrap = accuracy_sum / rounds_scored
    mean_error = 1 - bootstrap
    print(f'bootstrap mean error:   {mean_error}')
    return mean_error

In [103]:
class Classifier():
    """Run one estimator on one dataset file and report its errors.

    Parameters
    ----------
    tup : tuple
        ``(short_name, estimator)``. The short name is used in the plot file
        name; the estimator is a scikit-learn classifier used as a template.
    dataset : str
        Path of the CSV file to load. It is shuffled and then split into
        features/target by ``get_normalize``.
    n : int
        Number of classes of the dataset (used in the plot file name).
    c : int
        Number of cuts of the dataset (used in the plot file name).
    random_state : int or None, default None
        Seed for the row shuffle and the train/test split. ``None`` keeps the
        previous non-deterministic behaviour. The bootstrap validation draws
        from the global NumPy random state and is not controlled by this seed.
    """

    def __init__(self, tup, dataset, n, c, random_state=None):
        self.file_name = dataset
        self.n_classes = n
        self.n_cuts = c
        self.random_state = random_state
        shuffled = (
            pd.read_csv(dataset)
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )
        self.X, self.Y = get_normalize(shuffled)
        self.enc, self.model = tup

    def experiment(self):
        """Fit, validate and report.

        Prints the train/test error of a 70/30 hold-out split, runs the K-fold
        and bootstrap validations, saves a row-normalized confusion-matrix
        heatmap under ``./plots/`` and prints the per-class accuracy
        (correct predictions of a class divided by its test samples).
        """
        print(f'{type(self.model)}')
        print(f'\t....Working on {self.file_name} .csv file')

        # Work on a private copy: the validation helpers receive self.model and
        # may refit it on other subsets (including the hold-out rows), which
        # would otherwise contaminate the confusion matrix computed below.
        PRODUCTION_MODEL = clone(self.model)
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, random_state=self.random_state
        )
        y_train = Y_train.values.ravel()
        y_test = Y_test.values.ravel()
        PRODUCTION_MODEL.fit(X_train, y_train)

        print(f'train error: {1 - PRODUCTION_MODEL.score(X_train, y_train)}')
        print(f'test error: {1 - PRODUCTION_MODEL.score(X_test, y_test)}')

        # K-FOLD and BOOTSTRAP VALIDATION
        K_Fold_Validation(self.model, self.X, self.Y)
        Bootstrap_Validation(self.model, self.X, self.Y)

        # CONFUSION MATRIX
        # Fix the label set to every class present in the dataset so the matrix
        # keeps one row per class even if the split misses one of them.
        labels = np.unique(self.Y.values.ravel())
        cm = confusion_matrix(y_test, PRODUCTION_MODEL.predict(X_test), labels=labels)
        cm_normalize = normalize(cm, norm='l1')

        fig = plt.figure()
        sns.heatmap(cm_normalize, annot=True, cmap='Blues')
        # Use the instance's own cut count rather than a notebook-global loop variable.
        plt.savefig(f'./plots/{self.enc}_{self.n_classes}_{self.n_cuts}_cuts.png', dpi=500)
        # Close (not just clear) the figure so repeated experiments do not pile up open figures.
        plt.close(fig)

        # ACCURACY REPORT FOR EACH CLASS
        for row, label in enumerate(labels):
            support = cm[row].sum()
            if support == 0:
                # No test sample of this class ended up in the hold-out split.
                print(f'\taccuracy on class {label}-th: n/a (no test samples)')
                continue
            print(f'\taccuracy on class {label}-th: {round(cm[row, row]/support*100, 5)} %')


In [104]:
# Experiment grid. Each (class count, cut count) pair selects one dataset file
# ./data/data_{classes}_{cuts}_cuts.csv and names the matching report file.
class_list = [5, 10]
cut_list = [20, 10, 5, 4, 3, 2]

In [105]:
# Write the class distribution of the datasets to ./reports/data_abstract.txt.
# redirect_stdout restores sys.stdout even if reading or summarising a file
# raises, so a failure here cannot leave the notebook printing into a closed file.
with open('./reports/data_abstract.txt', 'w') as f, contextlib.redirect_stdout(f):
    for c in class_list:
        for n_cut in cut_list:
            file_name = f'./data/data_{c}_{n_cut}_cuts.csv'
            X, Y = get_normalize(pd.read_csv(file_name))
            data_abstract(Y)
            # Only the first cut of each class count is summarised.
            break

In [106]:
# Estimators to evaluate, as (short name, estimator) pairs; the short name is
# used in report and plot file names.
# Candidates not currently enabled: SVC(), SVC(kernel='poly'), SVC(kernel='linear')
models = [('KNN',KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', leaf_size=30)), ('L_SVM',LinearSVC(dual=False))]

# One report file per (model, class count, cut count). redirect_stdout puts
# sys.stdout back even when an experiment raises, so a failing run cannot
# leave the notebook printing into a closed file.
for m in models:
    for c in class_list:
        for n_cut in cut_list:
            report_path = f'./reports/{m[0]}_{c}_{n_cut}_cuts.txt'
            with open(report_path, 'w') as f, contextlib.redirect_stdout(f):
                engine = Classifier(tup=m, dataset=f'./data/data_{c}_{n_cut}_cuts.csv', n=c, c=n_cut)
                engine.experiment()