# Import of libraries

In [1]:
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from IPython.display import Image

In [5]:
from sklearn import metrics as mt

In [6]:
import matplotlib.pyplot as plt

In [7]:
from sklearn import tree as tr

In [8]:
from six import StringIO

In [9]:
import seaborn as sns

In [10]:
import pandas as pd

In [11]:
import numpy as np

In [12]:
import pydotplus

In [13]:
import graphviz

In [14]:
import cv2

# Function definition

In [15]:
def join_data(x_df1, x_df2, y_df1, y_df2):
    """
    Stack two feature frames and two target frames row-wise.

    Args:
        x_df1 (DataFrame): Feature rows placed first in the combined features.
        x_df2 (DataFrame): Feature rows appended below x_df1.
        y_df1 (DataFrame): Target rows placed first in the combined targets.
        y_df2 (DataFrame): Target rows appended below y_df1.

    Returns:
        tuple: (new_x, new_y), two new DataFrames whose index is renumbered
        from 0. The inputs are not modified.

    Example:
        x, y = join_data(df1, df2, df3, df4)
    """
    # ignore_index=True drops the original row labels and numbers the rows 0..n-1
    new_x, new_y = (
        pd.concat(list(pair), axis=0, ignore_index=True)
        for pair in ((x_df1, x_df2), (y_df1, y_df2))
    )
    return new_x, new_y

In [16]:
def get_metrics(y_real, y_pred, list_accuracy, list_precision, list_recall, list_f1Score):
    """
    Score one set of predictions and append each score to its list.

    Accuracy, precision, recall and F1 are computed, in that order, with the
    functions of sklearn.metrics (imported as mt). Precision and recall are
    requested with average='binary'; F1 uses the function's default averaging.

    Args:
        y_real: The actual labels.
        y_pred: The predicted labels.
        list_accuracy (list): Receives the accuracy score.
        list_precision (list): Receives the precision score.
        list_recall (list): Receives the recall score.
        list_f1Score (list): Receives the F1 score.

    Returns:
        None. The four lists are extended in place by one element each.

    Example:
        get_metrics(y_real, y_pred, list_accuracy, list_precision, list_recall, list_f1Score)
    """
    # (scoring function, extra keyword arguments, destination list)
    plan = (
        (mt.accuracy_score, {}, list_accuracy),
        (mt.precision_score, {'average': 'binary'}, list_precision),
        (mt.recall_score, {'average': 'binary'}, list_recall),
        (mt.f1_score, {}, list_f1Score),
    )

    for scorer, extra, destination in plan:
        destination.append(scorer(y_real, y_pred, **extra))

In [17]:
def show_table(list_names_algorithm, list_names_metrics, list_values_metrics):
    """
    Build a comparison table with one row per algorithm and one column per metric.

    Args:
        list_names_algorithm (list): Algorithm names; stored in the
            'Algorithm Name' column.
        list_names_metrics (list): Metric names; each one becomes a column header.
        list_values_metrics (list): One inner list per metric, in the same order
            as list_names_metrics. Every inner list must hold one value per
            algorithm.

    Returns:
        DataFrame: The assembled table, or None (after printing a message) when
        the values do not line up with the names.

    Example:
        algorithm = ['A', 'B', 'C']
        metrics = ['aa', 'bb']
        values = [[1, 2, 3], [4, 5, 6]]
        show_table(algorithm, metrics, values)
    """
    tab = {'Algorithm Name': list_names_algorithm}

    try:
        # Pair names and values by position. list.index() would rescan the name
        # list for every metric and resolve a repeated name to its first match.
        for position, name in enumerate(list_names_metrics):
            tab[name] = list_values_metrics[position]

        df_tab = pd.DataFrame(tab)

    except (ValueError, IndexError):
        # ValueError: pandas rejects columns of different lengths.
        # IndexError: there are fewer value lists than metric names.
        print('Unable to execute this command! Check the size and type of the third parameter.')
        return None

    return df_tab

# Loading data

In [18]:
x_train = pd.read_csv('../Datasets/ensaio_classificacao/X_training.csv')

In [19]:
y_train = pd.read_csv('../Datasets/ensaio_classificacao/Y_training.csv')

In [20]:
x_vld = pd.read_csv('../Datasets/ensaio_classificacao/X_validation.csv')

In [21]:
y_vld = pd.read_csv('../Datasets/ensaio_classificacao/Y_validation.csv')

In [22]:
x_test = pd.read_csv('../Datasets/ensaio_classificacao/X_test.csv')

In [23]:
y_test = pd.read_csv('../Datasets/ensaio_classificacao/Y_test.csv')

In [24]:
# Join the training data with the validation data (training rows first)
x_train_vld, y_train_vld = join_data(x_train, x_vld, y_train, y_vld)

In [25]:
# Join the test data with the training + validation data joined above (test rows first)
x, y = join_data(x_test, x_train_vld, y_test, y_train_vld)

In [26]:
# NOTE 1: feature selection and data preparation were already applied to the loaded data
# NOTE 2: .values - returns the DataFrame's contents as a NumPy array
#         .ravel() - returns a contiguous flattened 1-D array
#         .values.ravel() = turns the target DataFrame into a one-dimensional array

# Training Supervised Learning Algorithms for Classification

## KNN

### Para dados de treinamento

In [27]:
knn_classifier = KNeighborsClassifier()

In [28]:
knn_classifier.fit(x_train, y_train.values.ravel())

### Para dados de validação

In [29]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [30]:
# Using the best parameters, retrain the algorithm on the training and validation data joined together
knn_classifier.fit(x_train_vld, y_train_vld.values.ravel())

#### Obtendo o melhor parametro

In [None]:
# Candidate values of k (n_neighbors)
k_values = list(range(1, 11))

# One list per metric; entry i holds the validation score obtained with k_values[i]
list_acuracia = []
list_precision_knn = []
list_recall_knn = []
list_f1Score_knn = []

for k in k_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    knn_classifier = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train.values.ravel())
    # Classify the validation split
    y_pred = knn_classifier.predict(x_vld)
    # Accuracy, precision, recall and F1 are appended to the lists by the shared helper
    get_metrics(y_vld, y_pred, list_acuracia, list_precision_knn, list_recall_knn, list_f1Score_knn)
    

In [None]:
# Validation metrics as a function of k, one line per metric
for scores, legend_name in (
    (list_acuracia, 'Acurácia'),
    (list_precision_knn, 'Precision'),
    (list_recall_knn, 'Recall'),
    (list_f1Score_knn, 'f1Score'),
):
    plt.plot(k_values, scores, marker='o', linestyle='-', label=legend_name)

plt.xlabel('Valor de k')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

### Para os dados de teste

In [31]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [32]:
knn_classifier.fit(x, y.values.ravel())

## Decision Tree

### Para dados de treinamento

In [33]:
tree_clf = tr.DecisionTreeClassifier()

In [34]:
tree = tree_clf.fit(x_train, y_train)

### Para dados de validação

In [35]:
tree_clf = tr.DecisionTreeClassifier(max_depth=17)

In [36]:
tree = tree_clf.fit(x_train_vld, y_train_vld)

#### Obtendo o melhor parametro

In [None]:
# Load the .dot file
# NOTE(review): tree.dot is written by the export_graphviz cell in the
# "Visualização da árvore" section, so that cell has to be run first
with open("tree.dot", "r") as f:
    dot_graph = f.read()

# Convert the .dot source into a pydotplus graph object
graph = pydotplus.graph_from_dot_data(dot_graph)

# Save the graph as .png
graph.write_png("meu_grafico.png")

# Display the image in the notebook
Image("meu_grafico.png")

In [None]:
# Candidate values of max_depth
d_values = list(range(17, 21))

# One list per metric; entry i holds the validation score obtained with d_values[i]
list_acuracia_tree = []
list_precision_tree = []
list_recall_tree = []
list_f1Score_tree = []

# Area under the ROC curve for each depth
list_auc_tree = []

for d in d_values:
    # Train on the training split (1-D target, consistent with the other fit calls).
    # NOTE(review): this rebinds tree_clf, so a later tree_clf.fit(...) reuses the
    # last depth tried here.
    tree_clf = tr.DecisionTreeClassifier(max_depth=d).fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred = tree_clf.predict(x_vld)
    y_pred_proba = tree_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred, list_acuracia_tree, list_precision_tree, list_recall_tree, list_f1Score_tree)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba)

    # Area under the ROC curve
    auc = mt.auc(fpr, tpr)
    list_auc_tree.append(auc)

# After the loop, precision/recall/threshold, fpr/tpr/thresholds and auc hold the
# values of the last depth; the curve plots that follow read them.

In [None]:
# Validation metrics as a function of max_depth, one line per metric
for scores, legend_name in (
    (list_acuracia_tree, 'Acurácia'),
    (list_precision_tree, 'Precision'),
    (list_recall_tree, 'Recall'),
    (list_f1Score_tree, 'f1Score'),
):
    plt.plot(d_values, scores, marker='o', linestyle='-', label=legend_name)

plt.xlabel('Valor de max_depth')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Precision vs. recall curve, drawn from the arrays left by the last
# precision_recall_curve call
plt.plot(recall, precision, marker='.', label='Model')
plt.xlabel("Recall")
plt.ylabel("Precision")

In [None]:
# Precision and recall as functions of the decision threshold.
# The [:-1] drops the last precision/recall entry so the lengths match threshold.
plt.plot(threshold, precision[:-1], 'b--', label='Precision')
plt.plot(threshold, recall[:-1], 'g-', label='Recall')
plt.xlabel('Thresholds')
plt.ylabel('Precision, Recall')
plt.legend()
plt.grid()

In [None]:
# ROC curve, with the current value of auc shown in the legend
plt.plot(fpr, tpr, marker='.', label='Model(AUC=%0.2f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# TPR and TNR (1 - FPR) as functions of the ROC decision threshold.
# The x axis is the threshold, and the second curve is 1 - fpr, so it is
# labelled as the true negative rate rather than the false positive rate.
plt.plot(thresholds, tpr, 'b--', label='TPR')
plt.plot(thresholds, 1 - fpr, 'g-', label='TNR (1 - FPR)')
plt.xlabel('Thresholds')
plt.ylabel('TPR, TNR')
plt.legend()
plt.grid()

### Para dados de teste

In [37]:
# tree_clf = tr.DecisionTreeClassifier(max_depth=17)

In [38]:
tree = tree_clf.fit(x, y)

### Visualização da árvore

In [None]:
# Export the fitted decision tree to tree.dot.
# Because out_file is given, export_graphviz writes the file and returns None,
# so dot_data is None after this cell.
dot_data = tr.export_graphviz(
    tree_clf,
    out_file='tree.dot',
    feature_names=list(x_train.columns),
    # One name per class, in the order stored in the fitted classifier's classes_.
    # list(str(array)) would split the array's text into single characters.
    class_names=[str(label) for label in tree_clf.classes_],
    rounded=True,
    filled=True,
    special_characters=True
)

In [None]:
# Load the DOT file
graph = graphviz.Source.from_file('tree.dot')

In [None]:
# Display the image in the Jupyter Notebook
display(graph)

## Random Forest

### Para dados de treinamento

In [39]:
clf_rf = RandomForestClassifier()

In [40]:
rf = clf_rf.fit(x_train, y_train.values.ravel())

### Para dados de validação

In [41]:
clf_rf = RandomForestClassifier(n_estimators=7, max_depth=9)

In [42]:
rf = clf_rf.fit(x_train_vld, y_train_vld.values.ravel())

#### Obtendo o melhor parametro

In [None]:
# Candidate values of max_depth
d_values = list(range(1, 101))

# One list per metric; entry i holds the validation score obtained with d_values[i]
list_acuracia_rf = []
list_precision_rf = []
list_recall_rf = []
list_f1Score_rf = []

# Area under the ROC curve for each depth
list_auc_rf = []

for d in d_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    rf_clf = RandomForestClassifier(max_depth=d).fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred_rf = rf_clf.predict(x_vld)
    y_pred_proba_rf = rf_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred_rf, list_acuracia_rf, list_precision_rf, list_recall_rf, list_f1Score_rf)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba_rf)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba_rf)

    # Area under the ROC curve
    auc = mt.auc(fpr, tpr)
    list_auc_rf.append(auc)

In [None]:
# Validation metrics as a function of max_depth.
# The x values are d_values, the grid the max_depth search iterated over.
plt.plot(d_values, list_acuracia_rf, marker='o', linestyle='-', label='Acurácia')
plt.plot(d_values, list_precision_rf, marker='o', linestyle='-', label='Precision')
plt.plot(d_values, list_recall_rf, marker='o', linestyle='-', label='Recall')
plt.plot(d_values, list_f1Score_rf, marker='o', linestyle='-', label='f1Score')

# Reference lines at score 0.95 and max_depth 9
plt.axhline(y=0.95, color='black', linestyle='-', label='0.95')
plt.axvline(x=9, color='black', linestyle='-', label='9')
plt.xlabel('Valor de max_depth')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Candidate values of n_estimators
n_values = list(range(1, 101))

# One list per metric; entry i holds the validation score obtained with n_values[i]
list_acuracia_rf = []
list_precision_rf = []
list_recall_rf = []
list_f1Score_rf = []

# Area under the ROC curve for each number of estimators
list_auc_rf = []

for n in n_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    rf_clf = RandomForestClassifier(n_estimators=n).fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred_rf = rf_clf.predict(x_vld)
    y_pred_proba_rf = rf_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred_rf, list_acuracia_rf, list_precision_rf, list_recall_rf, list_f1Score_rf)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba_rf)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba_rf)

    # Area under the ROC curve
    auc = mt.auc(fpr, tpr)
    list_auc_rf.append(auc)

In [None]:
# Validation metrics as a function of n_estimators
plt.plot(n_values, list_acuracia_rf, marker='o', linestyle='-', label='Acurácia')
plt.plot(n_values, list_precision_rf, marker='o', linestyle='-', label='Precision')
plt.plot(n_values, list_recall_rf, marker='o', linestyle='-', label='Recall')
plt.plot(n_values, list_f1Score_rf, marker='o', linestyle='-', label='f1Score')

# Reference lines. The horizontal line's legend text is derived from the value
# it is drawn at, so the two cannot disagree.
# NOTE(review): confirm that this is the intended reference score.
reference_score = 0.9557830699606795
plt.axhline(y=reference_score, color='black', linestyle='-', label=str(reference_score))
plt.axvline(x=7, color='black', linestyle='-', label='7')
plt.xlabel('Valor de n_estimators')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Precision vs. recall curve, drawn from the arrays left by the last
# precision_recall_curve call
plt.plot(recall, precision, marker='.', label='Model')
plt.xlabel("Recall")
plt.ylabel("Precision")

In [None]:
# Precision and recall as functions of the decision threshold.
# The [:-1] drops the last precision/recall entry so the lengths match threshold.
plt.plot(threshold, precision[:-1], 'b--', label='Precision')
plt.plot(threshold, recall[:-1], 'g-', label='Recall')
plt.xlabel('Thresholds')
plt.ylabel('Precision, Recall')
plt.legend()
plt.grid()

In [None]:
# ROC curve, with the current value of auc shown in the legend
plt.plot(fpr, tpr, marker='.', label='Model(AUC=%0.2f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# TPR and TNR (1 - FPR) as functions of the ROC decision threshold.
# The x axis is the threshold, and the second curve is 1 - fpr, so it is
# labelled as the true negative rate rather than the false positive rate.
plt.plot(thresholds, tpr, 'b--', label='TPR')
plt.plot(thresholds, 1 - fpr, 'g-', label='TNR (1 - FPR)')
plt.xlabel('Thresholds')
plt.ylabel('TPR, TNR')
plt.legend()
plt.grid()

In [None]:
# Find the best threshold on the ROC curve: the one whose (fpr, tpr) point has
# the smallest Euclidean distance to the corner fpr = 0, tpr = 1
distances = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
best_threshold = thresholds[np.argmin(distances)]
print(f'Best Threshold: {best_threshold}')

In [None]:
# Compute the accuracy using the best threshold: probabilities at or above it
# are mapped to 1 and the rest to 0
aux_y_pred = (y_pred_proba_rf >= best_threshold).astype(int)
accuracy = mt.accuracy_score(y_vld, aux_y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Precision of the thresholded predictions.
# NOTE(review): this rebinds precision, the name the precision/recall plot cells
# read as the curve array, to a single score.
precision = mt.precision_score(y_vld, aux_y_pred, average='binary')
precision

### Para dados de teste

In [43]:
# clf_rf = RandomForestClassifier(n_estimators=7, max_depth=9)

In [44]:
rf = clf_rf.fit(x, y.values.ravel())

## Logistic Regression

### Para dados de treinamento

In [45]:
lr_clf = LogisticRegression()

In [46]:
lr = lr_clf.fit(x_train, y_train.values.ravel())

### Para dados de validação

In [47]:
lr_clf = LogisticRegression(solver='newton-cholesky')

In [48]:
lr = lr_clf.fit(x_train_vld, y_train_vld.values.ravel())

### Obtendo o melhor parametro

#### Cenário 1 - C = 100 a 200, Solver=default='lbfgs', max_iter=default=100

In [None]:
# Candidate values of C (inverse of the regularization strength)
c_values = list(range(100, 201))

# One list per metric; entry i holds the validation score obtained with c_values[i]
list_acuracia_lr = []
list_precision_lr = []
list_recall_lr = []
list_f1Score_lr = []

# Area under the ROC curve for each value of C
list_auc_lr = []

for c in c_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    rl_clf = LogisticRegression(C=c).fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred = rl_clf.predict(x_vld)
    y_pred_proba = rl_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred, list_acuracia_lr, list_precision_lr, list_recall_lr, list_f1Score_lr)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba)

    # Area under the ROC curve, stored in the logistic-regression list
    # rather than in the decision tree's list
    auc = mt.auc(fpr, tpr)
    list_auc_lr.append(auc)

In [None]:
# Validation metrics as a function of C, one line per metric
for scores, legend_name in (
    (list_acuracia_lr, 'Acurácia'),
    (list_precision_lr, 'Precision'),
    (list_recall_lr, 'Recall'),
    (list_f1Score_lr, 'f1Score'),
):
    plt.plot(c_values, scores, marker='o', linestyle='-', label=legend_name)

plt.xlabel('Valor de C')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

#### Cenário 2 - C=default=1.0, Solver=default='lbfgs', max_iter = 1 a 200

In [None]:
# Candidate values of max_iter (maximum number of solver iterations)
iter_values = list(range(1, 201))

# One list per metric; entry i holds the validation score obtained with iter_values[i]
list_acuracia_lr = []
list_precision_lr = []
list_recall_lr = []
list_f1Score_lr = []

# Area under the ROC curve for each value of max_iter
list_auc_lr = []

for i in iter_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    rl_clf = LogisticRegression(max_iter=i).fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred = rl_clf.predict(x_vld)
    y_pred_proba = rl_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred, list_acuracia_lr, list_precision_lr, list_recall_lr, list_f1Score_lr)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba)

    # Area under the ROC curve, stored in the logistic-regression list
    # rather than in the decision tree's list
    auc = mt.auc(fpr, tpr)
    list_auc_lr.append(auc)

In [None]:
# Validation metrics as a function of max_iter, one line per metric
for scores, legend_name in (
    (list_acuracia_lr, 'Acurácia'),
    (list_precision_lr, 'Precision'),
    (list_recall_lr, 'Recall'),
    (list_f1Score_lr, 'f1Score'),
):
    plt.plot(iter_values, scores, marker='o', linestyle='-', label=legend_name)

plt.xlabel('Valor de max_iter')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

#### Cenário 3 - C = 1 a 200, Solver='newton-cholesky', max_iter=default=100

In [None]:
# Candidate values of C (inverse of the regularization strength)
c_values = list(range(1, 201))

# One list per metric; entry i holds the validation score obtained with c_values[i]
list_acuracia_lr = []
list_precision_lr = []
list_recall_lr = []
list_f1Score_lr = []

# Area under the ROC curve for each value of C
list_auc_lr = []

for c in c_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    rl_clf = LogisticRegression(C=c, solver='newton-cholesky').fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred = rl_clf.predict(x_vld)
    y_pred_proba = rl_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred, list_acuracia_lr, list_precision_lr, list_recall_lr, list_f1Score_lr)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba)

    # Area under the ROC curve, stored in the logistic-regression list
    # rather than in the decision tree's list
    auc = mt.auc(fpr, tpr)
    list_auc_lr.append(auc)

In [None]:
# Validation metrics as a function of C
plt.plot(c_values, list_acuracia_lr, marker='o', linestyle='-', label='Acurácia')
plt.plot(c_values, list_precision_lr, marker='o', linestyle='-', label='Precision')
plt.plot(c_values, list_recall_lr, marker='o', linestyle='-', label='Recall')
plt.plot(c_values, list_f1Score_lr, marker='o', linestyle='-', label='f1Score')

# Reference lines; the vertical line's legend text matches the C value it is drawn at
plt.axhline(y=0.8813, color='black', linestyle='-', label='0.88')
plt.axvline(x=163, color='black', linestyle='-', label='163')

plt.xlabel('Valor de C')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

#### Cenário 4 - C=default=1.0, Solver='newton-cholesky', max_iter = 1 a 199

In [None]:
# Candidate values of max_iter (maximum number of solver iterations)
iter_values = list(range(1, 200))

# One list per metric; entry i holds the validation score obtained with iter_values[i]
list_acuracia_lr = []
list_precision_lr = []
list_recall_lr = []
list_f1Score_lr = []

# Area under the ROC curve for each value of max_iter
list_auc_lr = []

for i in iter_values:
    # Train on the training split. The target is flattened to 1-D, as in the other
    # fit calls of this notebook, instead of being passed as a one-column DataFrame.
    rl_clf = LogisticRegression(solver='newton-cholesky', max_iter=i).fit(x_train, y_train.values.ravel())

    # Class predictions and the probability of the second class (column 1)
    # on the validation split
    y_pred = rl_clf.predict(x_vld)
    y_pred_proba = rl_clf.predict_proba(x_vld)[:, 1]

    # Accuracy, precision, recall and F1 are appended by the shared helper
    get_metrics(y_vld, y_pred, list_acuracia_lr, list_precision_lr, list_recall_lr, list_f1Score_lr)

    # Precision and recall at every probability threshold
    precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba)

    # ROC curve
    fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba)

    # Area under the ROC curve, stored in the logistic-regression list
    # rather than in the decision tree's list
    auc = mt.auc(fpr, tpr)
    list_auc_lr.append(auc)

In [None]:
# Validation metrics as a function of max_iter, one line per metric
for scores, legend_name in (
    (list_acuracia_lr, 'Acurácia'),
    (list_precision_lr, 'Precision'),
    (list_recall_lr, 'Recall'),
    (list_f1Score_lr, 'f1Score'),
):
    plt.plot(iter_values, scores, marker='o', linestyle='-', label=legend_name)

plt.xlabel('Valor de max_iter')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

#### Cenário 5 - C=default=1.0, Solver='newton-cholesky', max_iter=20

In [None]:
# iter_values is not read by this scenario; the assignment is kept so the name
# stays bound to the same value as before
iter_values = list(range(1, 25))

# One list per metric; each receives the single validation score of this scenario
list_acuracia_lr = []
list_precision_lr = []
list_recall_lr = []
list_f1Score_lr = []

# Area under the ROC curve
list_auc_lr = []

# Train once with max_iter=20, the value this scenario's heading names and the
# x position used by its plot. The target is flattened to 1-D, as in the other
# fit calls of this notebook.
rl_clf = LogisticRegression(solver='newton-cholesky', max_iter=20).fit(x_train, y_train.values.ravel())

# Class predictions and the probability of the second class (column 1)
# on the validation split
y_pred = rl_clf.predict(x_vld)
y_pred_proba = rl_clf.predict_proba(x_vld)[:, 1]

# Accuracy, precision, recall and F1 are appended by the shared helper
get_metrics(y_vld, y_pred, list_acuracia_lr, list_precision_lr, list_recall_lr, list_f1Score_lr)

# Precision and recall at every probability threshold
precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred_proba)

# ROC curve
fpr, tpr, thresholds = mt.roc_curve(y_vld, y_pred_proba)

# Area under the ROC curve, stored in the logistic-regression list
# rather than in the decision tree's list
auc = mt.auc(fpr, tpr)
list_auc_lr.append(auc)

In [None]:
# Each validation metric drawn at x = 20, the max_iter value of this scenario
for scores, legend_name in (
    (list_acuracia_lr, 'Acurácia'),
    (list_precision_lr, 'Precision'),
    (list_recall_lr, 'Recall'),
    (list_f1Score_lr, 'f1Score'),
):
    plt.plot(20, scores, marker='o', linestyle='-', label=legend_name)

plt.xlabel('Valor de max_iter')
plt.ylabel('Performance')
plt.legend()
plt.grid(True)
plt.show()

### Para dados de teste

In [49]:
# lr_clf = LogisticRegression(solver='newton-cholesky')

In [50]:
lr = lr_clf.fit(x, y.values.ravel())

# Predicting observations

## KNN

### Para dados de treinamento

In [52]:
# Used to validate and find the best parameters, and to get the performance
# reported for the training data.
# NOTE(review): the input is x_vld (the validation features), not x_train —
# confirm this is intended.
y_pred_knn_train = knn_classifier.predict(x_vld)

### Para dados de validação

In [53]:
# Used to get the performance reported for the validation data.
# NOTE(review): the input is x_test, the same call that produces y_pred_knn_test —
# confirm this is intended.
y_pred_knn_vld = knn_classifier.predict(x_test)

### Para dados de teste

In [54]:
y_pred_knn_test = knn_classifier.predict(x_test)

#### KNN

In [None]:
x_train

In [None]:
# Take the id and customer_type columns of x_train and add the predictions as a new column.
# NOTE(review): y_pred is whatever the last executed cell bound to that name; it
# must have one entry per row of x_train — verify before running.
df = x_train.loc[:, ['id', 'customer_type']]
df['predicted'] = y_pred

In [None]:
df

In [None]:
sns.scatterplot(x=x_train['class'],y=x_train['age'], hue=y_pred) # Just an attempt

## Decision Tree

### Para dados de treinamento

In [55]:
y_pred_tree_train = tree.predict(x_vld)

In [56]:
y_pred_proba_tree_train = tree.predict_proba(x_vld)[:, 1]

### Para dados de validação

In [57]:
y_pred_tree_vld = tree.predict(x_test)

In [58]:
y_pred_proba_tree_vld = tree.predict_proba(x_test)[:, 1]

### Para dados de teste

In [59]:
y_pred_tree_test = tree.predict(x_test)

In [60]:
y_pred_proba_tree_test = tree.predict_proba(x_test)[:, 1]

## Random Forest

### Para dados de treinamento

In [61]:
y_pred_rf_train = rf.predict(x_vld)

In [62]:
y_pred_proba_rf_train = rf.predict_proba(x_vld)[:, 1]

### Para dados de validação

In [63]:
y_pred_rf_vld = rf.predict(x_test)

In [64]:
y_pred_proba_rf_vld = rf.predict_proba(x_test)[:, 1]

### Para dados de teste

In [65]:
y_pred_rf_test = rf.predict(x_test)

In [66]:
y_pred_proba_rf_test = rf.predict_proba(x_test)[:, 1]

## Logistic Regression

### Para dados de treinamento

In [67]:
y_pred_lr_train = lr.predict(x_vld)

In [68]:
y_pred_prob_lr_train = lr.predict_proba(x_vld)[:, 1]

### Para dados de validação

In [69]:
y_pred_lr_vld = lr.predict(x_test)

In [70]:
y_pred_prob_lr_vld = lr.predict_proba(x_test)[:, 1]

### Para dados de teste

In [71]:
y_pred_lr_test = lr.predict(x_test)

In [72]:
y_pred_prob_lr_test = lr.predict_proba(x_test)[:, 1]

# Performance

In [73]:
names_algorithm = ['KNN', 'Decision Tree', 'Random Forest', 'Logistic Regression']

In [74]:
names_metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

In [75]:
list_accuracy_train, list_precision_train, list_recall_train, list_f1Score_train = [], [], [], []

In [76]:
list_accuracy_vld, list_precision_vld, list_recall_vld, list_f1Score_vld = [], [], [], []

In [77]:
list_accuracy_test, list_precision_test, list_recall_test, list_f1Score_test = [], [], [], []

## KNN

### Para dados de treinamento

In [78]:
# Confusion matrix
mt.confusion_matrix(y_vld, y_pred_knn_train)

array([[14614,  2996],
       [ 3342, 10127]], dtype=int64)

In [79]:
get_metrics(y_vld, y_pred_knn_train, list_accuracy_train, list_precision_train, list_recall_train, list_f1Score_train)

In [None]:
type(y_vld)

### Para dados de validação

In [80]:
# Confusion matrix
mt.confusion_matrix(y_test, y_pred_knn_vld)

array([[12067,  2461],
       [ 2778,  8587]], dtype=int64)

In [81]:
get_metrics(y_test, y_pred_knn_vld, list_accuracy_vld, list_precision_vld, list_recall_vld, list_f1Score_vld)

### Para dados de teste

In [82]:
# Confusion matrix
mt.confusion_matrix(y_test, y_pred_knn_test)

array([[12067,  2461],
       [ 2778,  8587]], dtype=int64)

In [83]:
get_metrics(y_test, y_pred_knn_test, list_accuracy_test, list_precision_test, list_recall_test, list_f1Score_test)

## Decision Tree

### Para dados de treinamento

In [84]:
# Confusion matrix
mt.confusion_matrix(y_vld, y_pred_tree_train)

array([[17402,   208],
       [  430, 13039]], dtype=int64)

In [85]:
get_metrics(y_vld, y_pred_tree_train, list_accuracy_train, list_precision_train, list_recall_train, list_f1Score_train)

### Para dados de validação

In [86]:
# Confusion matrix
mt.confusion_matrix(y_test, y_pred_tree_vld)

array([[14342,   186],
       [  364, 11001]], dtype=int64)

In [87]:
get_metrics(y_test, y_pred_tree_vld, list_accuracy_vld, list_precision_vld, list_recall_vld, list_f1Score_vld)

#### Calculando e visualizando as curvas

In [None]:
# Compute the precision/recall curve and the thresholds it was evaluated at.
# NOTE(review): y_pred is not assigned in the nearby cells of this section —
# confirm it holds the Decision Tree scores for the same samples as y_vld.
precision, recall, threshold = mt.precision_recall_curve(y_vld, y_pred)

In [None]:
# Plot the Precision vs. Recall curve
plt.plot(recall, precision, marker='.', label='Model')
plt.xlabel("Recall")
plt.ylabel("Precision")
# The series carries a label, so draw the legend; show() renders the figure
# explicitly, matching the other plotting cells of this notebook.
plt.legend()
plt.show()

In [None]:
# Plot precision and recall as functions of the decision threshold.
# precision/recall are sliced with [:-1] so they line up with `threshold`:
# precision_recall_curve returns one more precision/recall value than thresholds.
plt.plot(threshold, precision[:-1], 'b--', label='Precision')
plt.plot(threshold, recall[:-1], 'g-', label='Recall')
plt.xlabel('Thresholds')
plt.ylabel('Precision, Recall')  # fixed typo ("Precison")
plt.legend()
plt.grid()

In [None]:
# Compute the ROC curve: false-positive rates, true-positive rates and thresholds.
# NOTE(review): y_pred is not assigned in the nearby cells of this section —
# confirm it holds the Decision Tree scores for the same samples as y_vld.
fpr, tpr, thresholds = mt.roc_curve(y_true=y_vld, y_score=y_pred)

In [None]:
# Plot the ROC curve.
# The AUC shown in the legend is computed here from the current fpr/tpr, so the
# label never depends on an `auc` value that is undefined or left over from
# another model's cell.
auc = mt.auc(fpr, tpr)
plt.plot(fpr, tpr, marker='.', label='Model(AUC=%0.2f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve, from the fpr/tpr points; the bare name displays the value
auc = mt.auc(x=fpr, y=tpr)
auc

In [None]:
# Plot TPR and TNR (1 - FPR) as functions of the ROC decision threshold.
# The second series is 1 - fpr, i.e. the true-negative rate, so it is labelled
# as such; the x axis holds thresholds, not a rate.
plt.plot(thresholds, tpr, 'b--', label='TPR')
plt.plot(thresholds, 1 - fpr, 'g-', label='TNR (1 - FPR)')
plt.xlabel('Thresholds')
plt.ylabel('TPR, TNR')
plt.legend()
plt.grid()

### Para dados de teste

In [88]:
# Confusion matrix of y_test against the Decision Tree test predictions
# (rows: true labels, columns: predicted labels)
mt.confusion_matrix(y_true=y_test, y_pred=y_pred_tree_test)

array([[14342,   186],
       [  364, 11001]], dtype=int64)

In [89]:
# Score the Decision Tree test predictions and record them in the test metric lists
get_metrics(
    y_real=y_test,
    y_pred=y_pred_tree_test,
    list_accuracy=list_accuracy_test,
    list_precision=list_precision_test,
    list_recall=list_recall_test,
    list_f1Score=list_f1Score_test,
)

## Random Forest

### Para dados de treinamento

In [90]:
# Confusion matrix for the Random Forest "train" predictions
# (rows: true labels, columns: predicted labels).
# NOTE(review): this cell sits under the training-data heading, yet the ground
# truth passed in is y_vld — confirm which split y_pred_rf_train was predicted on.
mt.confusion_matrix(y_true=y_vld, y_pred=y_pred_rf_train)

array([[17006,   604],
       [  961, 12508]], dtype=int64)

In [91]:
# Score the Random Forest "train" predictions and record them in the training metric lists.
# NOTE(review): the reference labels are y_vld although this is the training
# section — confirm the intended split.
get_metrics(
    y_real=y_vld,
    y_pred=y_pred_rf_train,
    list_accuracy=list_accuracy_train,
    list_precision=list_precision_train,
    list_recall=list_recall_train,
    list_f1Score=list_f1Score_train,
)

### Para dados de validação

In [92]:
# Confusion matrix for the Random Forest "validation" predictions
# (rows: true labels, columns: predicted labels).
# NOTE(review): this cell sits under the validation heading but compares against
# y_test, and its saved output is identical to the test-set cell's output —
# confirm which split y_pred_rf_vld was predicted on.
mt.confusion_matrix(y_true=y_test, y_pred=y_pred_rf_vld)

array([[14006,   522],
       [  799, 10566]], dtype=int64)

In [93]:
# Score the Random Forest "validation" predictions and record them in the validation metric lists.
# NOTE(review): the reference labels are y_test although this is the validation
# section — confirm the intended split.
get_metrics(
    y_real=y_test,
    y_pred=y_pred_rf_vld,
    list_accuracy=list_accuracy_vld,
    list_precision=list_precision_vld,
    list_recall=list_recall_vld,
    list_f1Score=list_f1Score_vld,
)

#### Calculando e visualizando as curvas

In [None]:
# Compute the ROC curve: false-positive rates, true-positive rates and thresholds.
# NOTE(review): y_scores is not assigned in the nearby cells — confirm it holds
# the Random Forest positive-class scores for the same samples as y_vld.
fpr, tpr, thresholds = mt.roc_curve(y_true=y_vld, y_score=y_scores)

In [None]:
# ROC AUC computed directly from labels and scores; the bare name displays the value
auc = mt.roc_auc_score(y_true=y_vld, y_score=y_scores)
auc

In [None]:
# Plot the ROC curve (blue) against the chance diagonal (dashed red)
roc_label = f'curva ROC (AUC = {auc:.2f})'
plt.plot(fpr, tpr, color='b', label=roc_label)
plt.plot([0, 1], [0, 1], color='r', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Choose the ROC threshold whose (FPR, TPR) point lies closest, in Euclidean
# distance, to the ideal corner (FPR = 0, TPR = 1)
distances = np.sqrt(fpr ** 2 + (1 - tpr) ** 2)
best_threshold = thresholds[distances.argmin()]
print(f'Best Threshold: {best_threshold}')

In [None]:
# Binarise the scores at the chosen threshold (1 when score >= threshold)
# and report the resulting accuracy against y_vld
y_pred = (y_scores >= best_threshold).astype(int)
accuracy = mt.accuracy_score(y_true=y_vld, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

### Para dados de teste

In [94]:
# Confusion matrix of y_test against the Random Forest test predictions
# (rows: true labels, columns: predicted labels)
mt.confusion_matrix(y_true=y_test, y_pred=y_pred_rf_test)

array([[14006,   522],
       [  799, 10566]], dtype=int64)

In [95]:
# Score the Random Forest test predictions and record them in the test metric lists
get_metrics(
    y_real=y_test,
    y_pred=y_pred_rf_test,
    list_accuracy=list_accuracy_test,
    list_precision=list_precision_test,
    list_recall=list_recall_test,
    list_f1Score=list_f1Score_test,
)

## Logistic Regression

### Para dados de treinamento

In [96]:
# Confusion matrix for the Logistic Regression "train" predictions
# (rows: true labels, columns: predicted labels).
# NOTE(review): this cell sits under the training-data heading, yet the ground
# truth passed in is y_vld — confirm which split y_pred_lr_train was predicted on.
mt.confusion_matrix(y_true=y_vld, y_pred=y_pred_lr_train)

array([[15913,  1697],
       [ 2209, 11260]], dtype=int64)

In [97]:
# Score the Logistic Regression "train" predictions and record them in the training metric lists.
# NOTE(review): the reference labels are y_vld although this is the training
# section — confirm the intended split.
get_metrics(
    y_real=y_vld,
    y_pred=y_pred_lr_train,
    list_accuracy=list_accuracy_train,
    list_precision=list_precision_train,
    list_recall=list_recall_train,
    list_f1Score=list_f1Score_train,
)

### Para dados de validação

In [98]:
# Confusion matrix for the Logistic Regression "validation" predictions
# (rows: true labels, columns: predicted labels).
# NOTE(review): this cell sits under the validation heading but compares against
# y_test, and its saved output is identical to the test-set cell's output —
# confirm which split y_pred_lr_vld was predicted on.
mt.confusion_matrix(y_true=y_test, y_pred=y_pred_lr_vld)

array([[13080,  1448],
       [ 1867,  9498]], dtype=int64)

In [99]:
# Score the Logistic Regression "validation" predictions and record them in the validation metric lists.
# NOTE(review): the reference labels are y_test although this is the validation
# section — confirm the intended split.
get_metrics(
    y_real=y_test,
    y_pred=y_pred_lr_vld,
    list_accuracy=list_accuracy_vld,
    list_precision=list_precision_vld,
    list_recall=list_recall_vld,
    list_f1Score=list_f1Score_vld,
)

#### Calculando e visualizando as curvas

In [None]:
# Compute the ROC curve: false-positive rates, true-positive rates and thresholds.
# NOTE(review): y_pred_prob is not assigned in the nearby cells — confirm it holds
# the Logistic Regression positive-class probabilities for the same samples as y_vld.
fpr, tpr, thresholds = mt.roc_curve(y_true=y_vld, y_score=y_pred_prob)

In [None]:
# Area under the ROC curve, from the fpr/tpr points; the bare name displays the value
auc = mt.auc(x=fpr, y=tpr)
auc

In [None]:
# Plot the ROC curve, with the AUC value shown in the legend entry
roc_label = 'Model(AUC=%0.2f)' % auc
plt.plot(fpr, tpr, marker='.', label=roc_label)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend()
plt.show()

### Para dados de teste

In [100]:
# Confusion matrix of y_test against the Logistic Regression test predictions
# (rows: true labels, columns: predicted labels)
mt.confusion_matrix(y_true=y_test, y_pred=y_pred_lr_test)

array([[13080,  1448],
       [ 1867,  9498]], dtype=int64)

In [101]:
# Score the Logistic Regression test predictions and record them in the test metric lists
get_metrics(
    y_real=y_test,
    y_pred=y_pred_lr_test,
    list_accuracy=list_accuracy_test,
    list_precision=list_precision_test,
    list_recall=list_recall_test,
    list_f1Score=list_f1Score_test,
)

## Tabela de Performance sobre os dados de treinamento

In [102]:
# Gather the training metric lists in the same order as names_metrics,
# build the summary table and display it
values_metrics_train = [
    list_accuracy_train,
    list_precision_train,
    list_recall_train,
    list_f1Score_train,
]
df_tab_train = show_table(names_algorithm, names_metrics, values_metrics_train)
df_tab_train

Unnamed: 0,Algorithm Name,Accuracy,Precision,Recall,F1-Score
0,KNN,0.796068,0.771699,0.751875,0.761658
1,Decision Tree,0.979472,0.984298,0.968075,0.976119
2,Random Forest,0.949644,0.953935,0.928651,0.941123
3,Logistic Regression,0.87432,0.869028,0.835994,0.852191


## Tabela de Performance sobre os dados de validação

In [103]:
# Gather the validation metric lists in the same order as names_metrics,
# build the summary table and display it.
# NOTE(review): the saved output of this table is identical to the test table's —
# the validation metrics above were scored against y_test; confirm the intended split.
values_metrics_vld = [
    list_accuracy_vld,
    list_precision_vld,
    list_recall_vld,
    list_f1Score_vld,
]
df_tab_vld = show_table(names_algorithm, names_metrics, values_metrics_vld)
df_tab_vld

Unnamed: 0,Algorithm Name,Accuracy,Precision,Recall,F1-Score
0,KNN,0.797667,0.777245,0.755565,0.766252
1,Decision Tree,0.978759,0.983374,0.967972,0.975612
2,Random Forest,0.948982,0.952922,0.929696,0.941166
3,Logistic Regression,0.871973,0.867714,0.835724,0.851419


## Tabela de Performance sobre os dados de teste

In [104]:
# Gather the test metric lists in the same order as names_metrics,
# build the summary table and display it
values_metrics_test = [
    list_accuracy_test,
    list_precision_test,
    list_recall_test,
    list_f1Score_test,
]
df_tab_test = show_table(names_algorithm, names_metrics, values_metrics_test)
df_tab_test

Unnamed: 0,Algorithm Name,Accuracy,Precision,Recall,F1-Score
0,KNN,0.797667,0.777245,0.755565,0.766252
1,Decision Tree,0.978759,0.983374,0.967972,0.975612
2,Random Forest,0.948982,0.952922,0.929696,0.941166
3,Logistic Regression,0.871973,0.867714,0.835724,0.851419
