# Validação das tarefas de aprendizado

## Importações

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import friedmanchisquare
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ShuffleSplit, KFold
from sklearn.metrics import accuracy_score, make_scorer, mean_squared_error

## Parâmetros gerais

In [2]:
# Number of outer cross-validation folds handed to the search functions.
n_splits = 10
# Seed handed to the fold splitters so the partitions are reproducible.
random_state = 17

# Raw data set; the CSV file uses ';' as the field separator.
base = pd.read_csv('student-por.csv', sep = ";")

---

## Tarefa de classificação

### Função de pré-processamento

In [3]:
def preprocess_class(base):
    """Encode the raw student table for the pass/fail classification task.

    The ``school`` column is dropped, the strings ``'LE3'`` and ``'GT3'``
    are mapped to 0 and 1 wherever they occur, the categorical attributes
    are one-hot encoded and ``G3`` is binarised (0 below 10, 1 otherwise).

    Args:
        base: Raw data frame. It is not modified; a new frame is returned.

    Returns:
        Data frame holding the remaining original columns followed by the
        dummy columns, with ``G3`` reduced to 0/1.
    """
    categorical = [
        'sex', 'address', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
        'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
        'internet', 'romantic',
    ]

    encoded = base.drop(['school'], axis=1).replace(['LE3', 'GT3'], [0, 1])

    # The untouched columns keep their order; the dummy columns are appended
    # after them, one group per attribute in the order listed above.
    encoded = pd.get_dummies(encoded, columns=categorical)

    # Both masks are taken from the original grades before anything is
    # overwritten: below 10 becomes class 0, 10 or more becomes class 1.
    below_threshold = encoded['G3'] < 10
    at_or_above_threshold = encoded['G3'] >= 10
    encoded.loc[below_threshold, 'G3'] = 0
    encoded.loc[at_or_above_threshold, 'G3'] = 1

    return encoded

### Função de treinamento e teste com base nos algoritmos escolhidos

In [4]:
def search_class(X, y, n_splits, random_state, algorithms):
    """Score every classifier with nested, stratified cross-validation.

    For each outer fold the features are standardised with statistics from
    the training part only, a grid search with 3 stratified inner folds
    picks the hyper-parameters, and the refitted best model is scored on
    the held-out part with accuracy.

    Args:
        X: Feature matrix, indexed with the integer arrays produced by the
            splitter (the notebook passes NumPy arrays).
        y: Class labels aligned with ``X``.
        n_splits: Number of outer folds.
        random_state: Seed for both the outer and the inner splitter.
        algorithms: Mapping ``name -> (estimator, parameter grid)``.

    Returns:
        Data frame with one column per algorithm and one row per outer fold,
        holding the accuracy reached on that fold.
    """
    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    metric = accuracy_score

    fold_scores = {name: [] for name in algorithms.keys()}

    for name, (estimator, grid) in algorithms.items():
        for train_idx, test_idx in outer_cv.split(X, y):
            # Scaling statistics come from the training part of the fold only.
            scaler = StandardScaler()
            scaler.fit(X[train_idx])
            X_train = scaler.transform(X[train_idx])
            X_test = scaler.transform(X[test_idx])

            search = GridSearchCV(estimator, grid, cv=inner_cv, scoring=make_scorer(metric))
            search.fit(X_train, y[train_idx])

            # Arguments are given as (predictions, truth); accuracy is
            # symmetric in its two arguments, so the value is unaffected.
            fold_scores[name].append(metric(search.predict(X_test), y[test_idx]))

    return pd.DataFrame.from_dict(fold_scores)

### Função que executa o teste de Friedman e, se necessário, o teste de Nemenyi

In [5]:
def friedman_nemenyi_tests_class(df_metrics, title):
    """Run the Friedman test on per-fold accuracies and, if it rejects H0,
    draw the Nemenyi critical-difference diagram.

    Args:
        df_metrics: Data frame with one column per algorithm and one row per
            fold, holding accuracies (higher is better).
        title: Title placed on the critical-difference plot.

    Returns:
        Data frame of within-fold ranks (rank 1 = highest accuracy), with the
        same shape and labels as ``df_metrics``.
    """
    # Rank the algorithms inside each fold; the best accuracy gets rank 1.
    df_score = df_metrics.rank(axis=1, ascending=False)

    # `DataFrame.items()` is used because `iteritems()` was removed in
    # pandas 2.0; `items()` exists in older releases as well.
    stat, p = friedmanchisquare(*[column for _, column in df_score.items()])

    print('Statistics=%.3f, p=%g' % (stat, p))

    alpha = 0.05
    if p > alpha:
        print('Same distributions (fail to reject H0)')
    else:
        print("Different distributions (reject H0)")
        # Orange is needed only for the post-hoc plot and is not imported at
        # file level, so it is imported here to avoid a NameError.
        # NOTE(review): compute_CD/graph_ranks may be missing from recent
        # Orange releases; confirm against the installed version.
        import Orange

        names = list(df_score.columns)
        avranks = df_score.mean().values.tolist()
        # The critical difference depends on how many folds were ranked, so
        # it is taken from the data rather than from a global variable.
        cd = Orange.evaluation.compute_CD(avranks, len(df_score))
        Orange.evaluation.graph_ranks(avranks, names, cd=cd, width=6, textspace=1.5)
        plt.title(title)
        plt.show()
    return df_score

### Algoritmos e hiperparâmetros testados

In [6]:
# Classifiers under comparison. Each entry maps a short name to the pair
# (estimator, hyper-parameter grid) that search_class unpacks and hands to
# GridSearchCV. Single-element lists pin a parameter to one value.
algorithms_class = {
    "MLP": (
        MLPClassifier(),
        {
            "activation": ("identity", "logistic", "tanh", "relu"),
            "solver": ("lbfgs", "adam"),
            "max_iter": [1000],
            "random_state": [1],
        },
    ),
    "SVM": (
        SVC(),
        {
            "C": [1, 10],
            "kernel": ("linear", "rbf"),
            "random_state": [2],
        },
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [1, 3, 5],
        },
    ),
    "DT": (
        DecisionTreeClassifier(),
        {
            "criterion": ("gini", "entropy"),
            "max_depth": [5, 10, 20],
            "random_state": [3],
        },
    ),
    "RF": (
        RandomForestClassifier(),
        {
            "criterion": ("gini", "entropy"),
            "max_depth": [5, 10, 20],
            "n_estimators": [30, 50, 100],
            "random_state": [4],
        },
    ),
}

### Leitura da base_class, pré-processamento e definição do atributo-alvo

In [7]:
# Encode the raw table for the classification task.
base_class = preprocess_class(base)

# Classification target: the G3 column as returned by preprocess_class.
# NOTE: the name `y` is rebound to the regression target further down the
# notebook, so the classification cells must run before that cell.
y = base_class['G3'].to_numpy()

### Combinação [  ]

#### Busca dos melhores modelos

In [8]:
# Features without either period grade (G1, G2); the target G3 is removed too.
X = base_class.drop(columns = ['G1','G2','G3']).to_numpy()
# Per-fold accuracy of every classifier on this feature set.
df_metrics_no_class = search_class(X,y,n_splits,random_state,algorithms_class)

##### Performance dos modelos

In [9]:
df_metrics_no_class

Unnamed: 0,MLP,SVM,KNN,DT,RF
0,0.830769,0.815385,0.8,0.876923,0.846154
1,0.861538,0.876923,0.830769,0.846154,0.830769
2,0.815385,0.876923,0.846154,0.8,0.861538
3,0.846154,0.861538,0.876923,0.830769,0.846154
4,0.861538,0.861538,0.830769,0.907692,0.846154
5,0.769231,0.846154,0.753846,0.815385,0.861538
6,0.861538,0.861538,0.846154,0.876923,0.861538
7,0.830769,0.861538,0.815385,0.846154,0.830769
8,0.861538,0.846154,0.815385,0.846154,0.830769
9,0.859375,0.84375,0.875,0.890625,0.859375


In [10]:
df_metrics_no_class.mean()

MLP    0.839784
SVM    0.855144
KNN    0.829038
DT     0.853678
RF     0.847476
dtype: float64

In [11]:
df_metrics_no_class.std()

MLP    0.029913
SVM    0.018210
KNN    0.036312
DT     0.034003
RF     0.013239
dtype: float64

#### Teste de Friedman e Nemenyi (se necessário)

In [12]:
df_score_no_class = friedman_nemenyi_tests_class(df_metrics_no_class, "[ ]")

Statistics=7.474, p=0.112875
Same distributions (fail to reject H0)


### Combinação [G1]

#### Busca dos melhores modelos

In [None]:
# Features including G1 only: G2 and the target G3 are removed.
X = base_class.drop(columns = ['G2','G3']).to_numpy()
# Per-fold accuracy of every classifier on this feature set.
df_metrics_G1_class = search_class(X,y,n_splits,random_state,algorithms_class)

##### Performance dos modelos

In [None]:
df_metrics_G1_class

In [None]:
df_metrics_G1_class.mean()

In [None]:
df_metrics_G1_class.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G1_class = friedman_nemenyi_tests_class(df_metrics_G1_class, "[G1]")

### Combinação [G2]

#### Busca dos melhores modelos

In [None]:
# Features including G2 only: G1 and the target G3 are removed.
X = base_class.drop(columns = ['G1','G3']).to_numpy()
# Per-fold accuracy of every classifier on this feature set.
df_metrics_G2_class = search_class(X,y,n_splits,random_state,algorithms_class)

##### Performance dos modelos

In [None]:
df_metrics_G2_class

In [None]:
df_metrics_G2_class.mean()

In [None]:
df_metrics_G2_class.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G2_class = friedman_nemenyi_tests_class(df_metrics_G2_class, "[G2]")

### Combinação [G1, G2]

#### Busca dos melhores modelos

In [None]:
# Features including both G1 and G2: only the target G3 is removed.
X = base_class.drop(columns = ['G3']).to_numpy()
# Per-fold accuracy of every classifier on this feature set.
df_metrics_G1_G2_class = search_class(X,y,n_splits,random_state,algorithms_class)

##### Performance dos modelos

In [None]:
df_metrics_G1_G2_class

In [None]:
df_metrics_G1_G2_class.mean()

In [None]:
df_metrics_G1_G2_class.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G1_G2_class = friedman_nemenyi_tests_class(df_metrics_G1_G2_class, "[G1, G2]")

---

## Tarefa de regressão

### Função de pré-processamento

In [None]:
def preprocess_regr(base):
    """Encode the raw student table for the regression task.

    The ``school`` column is dropped, the strings ``'LE3'`` and ``'GT3'``
    are mapped to 0 and 1 wherever they occur, and the categorical
    attributes are one-hot encoded. The grade columns are left untouched.

    Args:
        base: Raw data frame. It is not modified; a new frame is returned.

    Returns:
        Data frame holding the remaining original columns followed by the
        dummy columns.
    """
    categorical = [
        'sex', 'address', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
        'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
        'internet', 'romantic',
    ]

    frame = base.drop(['school'], axis=1)
    frame = frame.replace(['LE3', 'GT3'], [0, 1])

    # One dummy block per attribute, appended after the remaining columns in
    # the order of the list above.
    dummy_blocks = [pd.get_dummies(frame[name], prefix=name) for name in categorical]
    remaining = frame.drop(categorical, axis=1)

    return pd.concat([remaining] + dummy_blocks, axis=1)

### Função de treinamento e teste com base nos algoritmos escolhidos

In [None]:
def search_regr(X, y, n_splits, random_state, algorithms, title):
    """Score every regressor with nested cross-validation.

    For each outer fold the features are standardised with statistics from
    the training part only, a grid search with 3 inner folds picks the
    hyper-parameters that minimise the mean squared error, and the refitted
    best model is scored on the held-out part.

    Args:
        X: Feature matrix, indexed with the integer arrays produced by the
            splitter (the notebook passes NumPy arrays).
        y: Regression target aligned with ``X``.
        n_splits: Number of outer folds.
        random_state: Seed for both the outer and the inner splitter.
        algorithms: Mapping ``name -> (estimator, parameter grid)``.
        title: Not used; kept so existing calls keep working.

    Returns:
        Data frame with one column per algorithm and one row per outer fold,
        holding the mean squared error on that fold (lower is better).
    """
    # Honour the caller's fold count instead of a hard-coded 10.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    gskf = KFold(n_splits=3, shuffle=True, random_state=random_state)
    perf = mean_squared_error

    # MSE is a loss. make_scorer treats larger values as better by default,
    # which would make GridSearchCV keep the worst configuration, so the
    # scorer is told explicitly that lower is better.
    scorer = make_scorer(perf, greater_is_better=False)

    score = {algorithm: [] for algorithm in algorithms}

    for algorithm, (clf, parameters) in algorithms.items():
        for train, test in kf.split(X, y):
            # Scaling statistics come from the training part of the fold only.
            prep = StandardScaler()
            prep.fit(X[train])

            best = GridSearchCV(clf, parameters, cv=gskf, scoring=scorer)
            best.fit(prep.transform(X[train]), y[train])

            # Metric signature is (y_true, y_pred).
            predictions = best.predict(prep.transform(X[test]))
            score[algorithm].append(perf(y[test], predictions))

    df_metrics = pd.DataFrame.from_dict(score)
    return df_metrics

### Função que executa o teste de Friedman e, se necessário, o teste de Nemenyi

In [None]:
def friedman_nemenyi_tests_regr(df_metrics, title):
    """Run the Friedman test on per-fold errors and, if it rejects H0,
    draw the Nemenyi critical-difference diagram.

    Args:
        df_metrics: Data frame with one column per algorithm and one row per
            fold, holding error values (lower is better).
        title: Title placed on the critical-difference plot.

    Returns:
        Data frame of within-fold ranks (rank 1 = lowest error), with the
        same shape and labels as ``df_metrics``.
    """
    # Rank the algorithms inside each fold; the smallest error gets rank 1.
    df_score = df_metrics.rank(axis=1, ascending=True)

    # `DataFrame.items()` is used because `iteritems()` was removed in
    # pandas 2.0; `items()` exists in older releases as well.
    stat, p = friedmanchisquare(*[column for _, column in df_score.items()])

    print('Statistics=%.3f, p=%g' % (stat, p))

    alpha = 0.05
    if p > alpha:
        print('Same distributions (fail to reject H0)')
    else:
        print("Different distributions (reject H0)")
        # Orange is needed only for the post-hoc plot and is not imported at
        # file level, so it is imported here to avoid a NameError.
        # NOTE(review): compute_CD/graph_ranks may be missing from recent
        # Orange releases; confirm against the installed version.
        import Orange

        names = list(df_score.columns)
        avranks = df_score.mean().values.tolist()
        # The critical difference depends on how many folds were ranked, so
        # it is taken from the data rather than from a global variable.
        cd = Orange.evaluation.compute_CD(avranks, len(df_score))
        Orange.evaluation.graph_ranks(avranks, names, cd=cd, width=6, textspace=1.5)
        plt.title(title)
        plt.show()
    return df_score

### Algoritmos e hiperparâmetros testados

In [None]:
# Regressors under comparison. Each entry maps a short name to the pair
# (estimator, hyper-parameter grid) that search_regr unpacks and hands to
# GridSearchCV. Single-element lists pin a parameter to one value.
# NOTE(review): "mse" and "mae" are the criterion spellings of older
# scikit-learn releases; newer releases call them "squared_error" and
# "absolute_error". Confirm against the installed version.
algorithms_regr = {
    "MLP": (
        MLPRegressor(),
        {
            "activation": ["identity", "tanh", "relu"],
            "solver": ["lbfgs", "adam"],
            "max_iter": [1500],
            "random_state": [1],
        },
    ),
    "SVM": (
        SVR(),
        {
            "C": [1.0],
            "kernel": ("linear", "rbf", "poly", "sigmoid"),
        },
    ),
    "KNN": (
        KNeighborsRegressor(),
        {
            "n_neighbors": [1, 3, 5],
        },
    ),
    "DT": (
        DecisionTreeRegressor(),
        {
            "criterion": ("mse", "friedman_mse", "mae"),
            "max_depth": [5, 10, 20],
            "random_state": [3],
        },
    ),
    "RF": (
        RandomForestRegressor(),
        {
            "criterion": ("mse", "mae"),
            "max_depth": [5, 10, 20],
            "n_estimators": [30, 50, 100],
            "random_state": [4],
        },
    ),
}

### Leitura da base_regr, pré-processamento e definição do atributo-alvo

In [None]:
# Encode the raw table for the regression task (grades are kept as they are).
base_regr = preprocess_regr(base)

# Regression target: the G3 column of the encoded table.
# NOTE: this rebinds `y`, which the classification section also uses.
y = base_regr['G3'].to_numpy()

### Combinação [  ]

#### Busca dos melhores modelos

In [None]:
# Features without either period grade (G1, G2); the target G3 is removed too.
X = base_regr.drop(columns = ['G3', 'G1', 'G2']).to_numpy()
# Per-fold error of every regressor on this feature set.
df_G3_regr = search_regr(X,y,n_splits,random_state,algorithms_regr, "G3 from []")

##### Performance dos modelos

In [None]:
df_G3_regr

In [None]:
df_G3_regr.mean()

In [None]:
df_G3_regr.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G3_regr = friedman_nemenyi_tests_regr(df_G3_regr, "[]")

### Combinação [G1]

#### Busca dos melhores modelos

In [None]:
# Features including G1 only: G2 and the target G3 are removed.
X = base_regr.drop(columns = ['G3', 'G2']).to_numpy()
# Per-fold error of every regressor on this feature set.
df_G3_from_G1_regr = search_regr(X,y,n_splits,random_state,algorithms_regr, "G3 from [G1]")

##### Performance dos modelos

In [None]:
df_G3_from_G1_regr

In [None]:
df_G3_from_G1_regr.mean()

In [None]:
df_G3_from_G1_regr.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G3_from_G1_regr = friedman_nemenyi_tests_regr(df_G3_from_G1_regr, "[G1]")

### Combinação [G2]

#### Busca dos melhores modelos

In [None]:
# Features including G2 only: G1 and the target G3 are removed.
X = base_regr.drop(columns = ['G3', 'G1']).to_numpy()
# Per-fold error of every regressor on this feature set.
df_G3_from_G2_regr = search_regr(X,y,n_splits,random_state,algorithms_regr, "G3 from [G2]")

##### Performance dos modelos

In [None]:
df_G3_from_G2_regr

In [None]:
df_G3_from_G2_regr.mean()

In [None]:
df_G3_from_G2_regr.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G3_from_G2_regr = friedman_nemenyi_tests_regr(df_G3_from_G2_regr, "[G2]")

### Combinação [G1, G2]

#### Busca dos melhores modelos

In [None]:
# Features including both G1 and G2: only the target G3 is removed.
X = base_regr.drop(columns = ['G3']).to_numpy()
# Per-fold error of every regressor on this feature set.
df_G3_from_G1_G2_regr = search_regr(X,y,n_splits,random_state,algorithms_regr, "G3 from [G1, G2]")

##### Performance dos modelos

In [None]:
df_G3_from_G1_G2_regr

In [None]:
df_G3_from_G1_G2_regr.mean()

In [None]:
df_G3_from_G1_G2_regr.std()

#### Teste de Friedman e Nemenyi (se necessário)

In [None]:
df_score_G3_from_G1_G2_regr = friedman_nemenyi_tests_regr(df_G3_from_G1_G2_regr, "[G1, G2]")