In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import RocCurveDisplay, roc_curve




from sklearn.neighbors import KNeighborsClassifier as KNN_clf
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier



import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pre-processed spreadsheet; the cell's displayed value is the frame's shape.
DATA_PATH = 'dados_processados_TESTE_CONF.xlsx'
dataset = pd.read_excel(DATA_PATH)
dataset.shape

In [None]:
# Preview the raw data: shows at most the first 1000 rows of `dataset`.
dataset.head(1000)

In [None]:
# Inspect the dtype of every column of the raw frame before any encoding is applied.
dataset.dtypes

In [None]:
# Encode the raw frame into a purely numeric one for scikit-learn:
#   * one-hot encode the categorical columns 'disciplina', 'polo' and 'turma';
#   * map the two 'resultado' labels and the two 'curso' names to 0/1;
#   * replace any remaining missing value with 0.
dataset_pre = pd.get_dummies(data=dataset, columns=['disciplina', 'polo', 'turma'])

# Assign the result of `replace` back to the column. The previous
# `dataset_pre[col].replace(..., inplace=True)` form mutates a temporary Series
# under pandas Copy-on-Write, leaving the DataFrame column unencoded.
dataset_pre['resultado'] = dataset_pre['resultado'].replace(['reprovado', 'aprovado'], [0, 1])
dataset_pre['curso'] = dataset_pre['curso'].replace(
    ['Técnico em Informática para Internet', 'Técnico em Finanças'], [0, 1]
)

dataset_pre = dataset_pre.fillna(value=0)

In [None]:
# Render the encoded frame to check the result of the pre-processing step.
display(dataset_pre)

In [None]:
# Split the encoded frame into the two targets and the feature matrix:
#   yC -> 'resultado' (classification target), yR -> 'nota_mf' (regression target),
#   x  -> every remaining column.
target_columns = ['resultado', 'nota_mf']
yC, yR = (dataset_pre[column] for column in target_columns)
x = dataset_pre.drop(columns=target_columns)

x.shape, yC.shape, yR.shape

In [None]:
# Plain-text dump of the classification target ('resultado').
print(yC)

In [None]:
# Plain-text dump of the regression target ('nota_mf').
print(yR)

In [None]:
# Plain-text dump of the feature matrix (all columns except the two targets).
print(x)

In [None]:
# Shared 10-fold splitter: rows are shuffled with a fixed seed, so every
# experiment in this notebook iterates over the same train/test partitions.
kf = KFold(n_splits=10, random_state=30, shuffle=True)
# The return value is not used here and, not being the cell's last expression, is not displayed.
kf.get_n_splits(x)

print(kf)

In [None]:
# Show, for each fold, the positional row indices in the train and test
# partitions, followed by the size of each partition.
for i, (train_index, test_index) in enumerate(kf.split(x)):
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}, {len(train_index)}",
        f"  Test:  index={test_index}, {len(test_index)}",
        sep="\n",
    )

# Classificação
Métodos de classificação a serem avaliados a partir do dataset para o target `resultado`.

In [None]:
# Classifiers compared on the `resultado` target.
CLF_KNN = KNN_clf(n_neighbors=5)
# random_state makes the MLP's weight initialisation and batch shuffling
# repeatable, in line with the other seeded estimators and the seeded KFold.
# NOTE(review): max_iter=1 stops training after a single epoch, which looks
# like a smoke-test setting -- confirm before trusting the RNA scores.
CLF_NN_MLP = MLPClassifier(hidden_layer_sizes=(256, 256), activation='relu', batch_size=100,
                           solver='adam', verbose=True, max_iter=1, random_state=30)
CLF_DTree = DecisionTreeClassifier(random_state=2, max_depth=5)
CLF_NB = GaussianNB()
CLF_SVM = SVC(cache_size=1024)
CLF_RF = RandomForestClassifier(max_depth=3, random_state=5, criterion='entropy', n_jobs=-1)

# Per-model running totals, accumulated across folds by the evaluation cell.
# Suffixes: k=KNN, m=MLP, d=decision tree, n=naive Bayes, S=SVM, R=random forest.
# Chained assignment is safe here because the shared value is an immutable int.

# Balanced accuracy
Soma_acb_k = Soma_acb_m = Soma_acb_d = Soma_acb_n = Soma_acb_S = Soma_acb_R = 0
# Recall
Soma_rec_k = Soma_rec_m = Soma_rec_d = Soma_rec_n = Soma_rec_S = Soma_rec_R = 0
# Accuracy
Soma_acc_k = Soma_acc_m = Soma_acc_d = Soma_acc_n = Soma_acc_S = Soma_acc_R = 0
# F1
Soma_f1_k = Soma_f1_m = Soma_f1_d = Soma_f1_n = Soma_f1_S = Soma_f1_R = 0
# Precision
Soma_prec_k = Soma_prec_m = Soma_prec_d = Soma_prec_n = Soma_prec_S = Soma_prec_R = 0
# Confusion matrices (start as the scalar 0 and become arrays on the first `+=`)
Soma_conmtx_k = Soma_conmtx_m = Soma_conmtx_d = 0
Soma_conmtx_n = Soma_conmtx_S = Soma_conmtx_R = 0

Soma_CurvaROC = 0

In [None]:
# Cross-validated comparison of the six classifiers on the `resultado` target.
# For every fold each model is refit on the training rows and scored on the
# held-out rows; the per-fold scores are summed and averaged at the end.
#
# Fixes relative to the previous version of this cell:
#   * scikit-learn metrics take (y_true, y_pred) in that order. The arguments
#     were swapped, which exchanges recall and precision, distorts balanced
#     accuracy and transposes the confusion matrices.
#   * the SVM and RF confusion-matrix totals were accumulated into each other.
#   * all totals are reset here, so re-running the cell no longer double-counts.
classifiers = {
    'KNN': CLF_KNN,
    'RNA': CLF_NN_MLP,
    'DTree': CLF_DTree,
    'NB': CLF_NB,
    'SVM': CLF_SVM,
    'RF': CLF_RF,
}
# Keys are the labels used in the printed report; order defines the print order.
metric_functions = {
    'Recall': recall_score,
    'Acurácia Balanceada': balanced_accuracy_score,
    'Acurácia': accuracy_score,
    'f1_score': f1_score,
    'precision_score': precision_score,
}
# Pin the label order so every fold yields a 2x2 matrix, even if one class is
# absent from a fold's held-out rows or predictions.
class_labels = [0, 1]

metric_totals = {title: {tag: 0.0 for tag in classifiers} for title in metric_functions}
confusion_totals = {tag: np.zeros((2, 2), dtype=np.int64) for tag in classifiers}
n_folds = 0

for train_index, test_index in kf.split(x):
    n_folds += 1
    x_train, y_train = x.iloc[train_index], yC.iloc[train_index]
    x_test, y_true = x.iloc[test_index], yC.iloc[test_index]

    for tag, clf in classifiers.items():
        clf.fit(x_train, y_train)
        # Predictions are made on the held-out test rows of this fold.
        y_pred = clf.predict(x_test)

        for title, metric in metric_functions.items():
            metric_totals[title][tag] += metric(y_true, y_pred)
        confusion_totals[tag] += confusion_matrix(y_true, y_pred, labels=class_labels)

# Keep the legacy accumulator names bound to the fold totals, in case other
# cells read them (dicts preserve insertion order: k, m, d, n, S, R).
(Soma_rec_k, Soma_rec_m, Soma_rec_d,
 Soma_rec_n, Soma_rec_S, Soma_rec_R) = metric_totals['Recall'].values()
(Soma_acb_k, Soma_acb_m, Soma_acb_d,
 Soma_acb_n, Soma_acb_S, Soma_acb_R) = metric_totals['Acurácia Balanceada'].values()
(Soma_acc_k, Soma_acc_m, Soma_acc_d,
 Soma_acc_n, Soma_acc_S, Soma_acc_R) = metric_totals['Acurácia'].values()
(Soma_f1_k, Soma_f1_m, Soma_f1_d,
 Soma_f1_n, Soma_f1_S, Soma_f1_R) = metric_totals['f1_score'].values()
(Soma_prec_k, Soma_prec_m, Soma_prec_d,
 Soma_prec_n, Soma_prec_S, Soma_prec_R) = metric_totals['precision_score'].values()
(Soma_conmtx_k, Soma_conmtx_m, Soma_conmtx_d,
 Soma_conmtx_n, Soma_conmtx_S, Soma_conmtx_R) = confusion_totals.values()

# Fold-averaged scores, one group of lines per metric.
for title, totals_by_model in metric_totals.items():
    print('\n')
    for tag, total in totals_by_model.items():
        print(f'{title} Média ({tag}): {total/n_folds}')

print('\n')

n_classes = ['Falso', 'Verdadeiro']

# The plotted matrices are the element-wise sum over all folds (not divided
# by the number of folds). Rows are the true labels, columns the predictions.
for position, (tag, matrix_total) in enumerate(confusion_totals.items()):
    if position:
        print('\n')
    print(f'confusion_matrix Média ({tag}):')
    ConfusionMatrixDisplay(matrix_total, display_labels=n_classes).plot()


# Regressão
Métodos de regressão a serem avaliados a partir do dataset para o target `nota_mf`.

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [None]:

# Regressors compared on the continuous `nota_mf` target.
knn_reg = KNeighborsRegressor(n_neighbors=5)

tree_Reg = DecisionTreeRegressor(random_state=2, max_depth=5)

rfr_reg = RandomForestRegressor(max_depth=3, random_state=5, n_jobs=-1)

svm_Reg = SVR(cache_size=1024)

# random_state makes the MLP's weight initialisation and batch shuffling repeatable.
# NOTE(review): max_iter=1 stops training after a single epoch, which looks
# like a smoke-test setting -- confirm before trusting the MLP scores.
mlp_reg = MLPRegressor(hidden_layer_sizes=(256, 256), activation='relu', batch_size=100,
                       solver='adam', verbose=True, max_iter=1, random_state=30)

# LogisticRegression is a classifier: fitted on a continuous target it either
# raises "Unknown label type" or treats each distinct grade as a class.
# The linear baseline for a regression task is LinearRegression.
lr_reg = LinearRegression(n_jobs=-1)


# Per-fold score histories, one list per (metric, model) pair. The generator
# yields a fresh list for every name, so no two names share a list.
(vRegr_R2_KNN, vRegr_R2_Tree, vRegr_R2_rfr,
 vRegr_R2_SVM, vRegr_R2_MLP, vRegr_R2_LR) = ([] for _ in range(6))

(vRegr_MAE_KNN, vRegr_MAE_Tree, vRegr_MAE_rfr,
 vRegr_MAE_SVM, vRegr_MAE_MLP, vRegr_MAE_LR) = ([] for _ in range(6))

(vRegr_MAPE_KNN, vRegr_MAPE_Tree, vRegr_MAPE_rfr,
 vRegr_MAPE_SVM, vRegr_MAPE_MLP, vRegr_MAPE_LR) = ([] for _ in range(6))

(vRegr_MSE_KNN, vRegr_MSE_Tree, vRegr_MSE_rfr,
 vRegr_MSE_SVM, vRegr_MSE_MLP, vRegr_MSE_LR) = ([] for _ in range(6))

In [None]:
# Cross-validated comparison of the six regressors on the `nota_mf` target.
# For every fold each model is refit on the training rows and scored on the
# held-out rows; the per-fold scores are appended to the vRegr_* lists and
# their means are printed at the end.
#
# Fixes relative to the previous version of this cell:
#   * `mean_squared_error(..., squared=False)` is deprecated and removed in
#     recent scikit-learn releases; the root is now taken with np.sqrt.
#   * that value is the RMSE, so the report label now says RMSE, not MSE.
#   * `vRegr_MAPE_KNN*100` repeated the list 100 times instead of scaling it;
#     all MAPE lines now print the same plain fraction.
#   * the score lists are cleared first, so re-running the cell does not mix
#     scores from earlier runs into the means.
regressors = (knn_reg, tree_Reg, rfr_reg, svm_Reg, mlp_reg, lr_reg)
regressor_tags = ('KNN', 'TREE', 'RFR', 'SVM', 'MLP', 'LR')

# Report label -> per-model score lists, in the same order as `regressors`.
# The vRegr_MSE_* lists keep their legacy names but hold RMSE values.
scores_by_metric = {
    'R2': (vRegr_R2_KNN, vRegr_R2_Tree, vRegr_R2_rfr,
           vRegr_R2_SVM, vRegr_R2_MLP, vRegr_R2_LR),
    'MAE': (vRegr_MAE_KNN, vRegr_MAE_Tree, vRegr_MAE_rfr,
            vRegr_MAE_SVM, vRegr_MAE_MLP, vRegr_MAE_LR),
    'MAPE': (vRegr_MAPE_KNN, vRegr_MAPE_Tree, vRegr_MAPE_rfr,
             vRegr_MAPE_SVM, vRegr_MAPE_MLP, vRegr_MAPE_LR),
    'RMSE': (vRegr_MSE_KNN, vRegr_MSE_Tree, vRegr_MSE_rfr,
             vRegr_MSE_SVM, vRegr_MSE_MLP, vRegr_MSE_LR),
}

for per_model_scores in scores_by_metric.values():
    for fold_scores in per_model_scores:
        fold_scores.clear()

for train_index, test_index in kf.split(x):
    x_train, y_train = x.iloc[train_index], yR.iloc[train_index]
    x_test, y_true = x.iloc[test_index], yR.iloc[test_index]

    for position, regressor in enumerate(regressors):
        regressor.fit(x_train, y_train)
        y_pred = regressor.predict(x_test)

        scores_by_metric['R2'][position].append(r2_score(y_true, y_pred))
        scores_by_metric['MAE'][position].append(mean_absolute_error(y_true, y_pred))
        # NOTE(review): MAPE divides by the true value, so grades equal to 0
        # (e.g. missing grades filled with 0) can inflate it -- confirm.
        scores_by_metric['MAPE'][position].append(
            mean_absolute_percentage_error(y_true, y_pred)
        )
        scores_by_metric['RMSE'][position].append(
            np.sqrt(mean_squared_error(y_true, y_pred))
        )

# Mean of the per-fold scores, one group of lines per metric.
for metric_label, per_model_scores in scores_by_metric.items():
    print('\n')
    for tag, fold_scores in zip(regressor_tags, per_model_scores):
        print(f'Score {metric_label}({tag}):', np.mean(fold_scores))
