In [20]:
import numpy as np
import numpy.typing as npt
import matplotlib.pyplot as plt  # Biblioteca para gerar gráficos
import pandas as pd
from sklearn import metrics, model_selection
from scipy import stats
import math


In [21]:
data = np.genfromtxt('breastcancer.csv', delimiter=',')

## Questão 1

Considere o conjunto de dados disponível em `breastcancer.csv`, organizado
em 31 colunas, sendo as 30 primeiras colunas os atributos e a última coluna a
saída. Os 30 atributos coletados de exames médicos são usados no diagnóstico
do câncer de mama, sendo 1 a classe positiva e 0 a classe negativa. Maiores 
detalhes sobre os dados podem ser conferidos em 
https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-dataset.

a) Considerando uma validação cruzada em 10 folds, avalie modelos de classicação binária nos dados em questão. Para tanto, use as abordagens abaixo:
- Regressão logística (treinado com GD ou SGD);
- Análise do discriminante Gaussiano;
- Naive Bayes Gaussiano;

In [22]:
def k_fold_split(array, k = int):
    shuffled_data = np.random.permutation(array)
    folds = np.array_split(shuffled_data, k)
    return folds

folds = k_fold_split(data, 10)

In [23]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))

def GD(X, y, learning_rate, epochs):
    n = y.shape[0]
    # w = np.random.randn(X.shape[1])
    w = np.zeros(X.shape[1])

    for t in range(epochs):
        ei_sum = np.zeros(X.shape[1])
        for i in range(n):
            ei = y[i] - sigmoid(np.dot(w.T, X[i]))
            ei_sum += np.dot(ei, X[i])
        w = w + learning_rate * (ei_sum/n)
        # w = w + learning_rate * np.mean(np.dot(e, X))
    return w

def log_reg_GD(train_set, test_set, learning_rate, epochs):
    n_train = train_set.shape[0]
    x_train = train_set[:, np.arange(30)]
    x_train = stats.zscore(x_train)
    X_train = np.c_[np.ones(n_train), x_train]
    y_train = train_set[:, 30]

    w_train = GD(X_train, y_train, learning_rate, epochs)

    n_test = test_set.shape[0]
    x_test = test_set[:, np.arange(30)]
    x_test = stats.zscore(x_test)
    X_test = np.c_[np.ones(n_test), x_test]
    y_test = test_set[:, 30]

    y_pred = sigmoid(np.dot(w_train, X_test.T))
    y_pred = np.around(y_pred)

    return (y_test, y_pred)


In [24]:
# todo analise de discriminante gaussiano

In [25]:
# todo naive bayes

b) Para cada modelo criado, reporte valor médio e desvio padrão das métricas de **acurácia**, **revocação**, **precisão** e **F1-score**

In [26]:
def is_true_positive(y, y_pred):
    return y_pred >= 1 and y >= 1

def is_false_positive(y, y_pred):
    return y_pred >= 1 and y <= 0

def is_true_negative(y, y_pred):
    return y_pred <= 0 and y <= 0

def is_false_negative(y, y_pred):
    return y_pred <= 0 and y >= 1

def classify(y, y_pred):
    """ returns (tp, fp, tn, fn) """

    tp, fp, tn, fn = 0, 0, 0, 0
    for i, pred in enumerate(y_pred):
        tp += 1 if is_true_positive(y[i], pred) else 0
        fp += 1 if is_false_positive(y[i], pred) else 0
        tn += 1 if is_true_negative(y[i], pred) else 0
        fn += 1 if is_false_negative(y[i], pred) else 0
    return (tp, fp, tn, fn)

def accuracy(y, y_pred):
    tp, fp, tn, fn = classify(y, y_pred)
    return (tp + tn) / (tp + fp + tn + fn)

def precision(y, y_pred):
    tp, fp, tn, fn = classify(y, y_pred)
    return tp / (tp + fp)

def recall(y, y_pred):
    tp, fp, tn, fn = classify(y, y_pred)
    return tp / (tp + fn)

def f1_score(y, y_pred):
    precision_ = precision(y, y_pred)
    recall_ = recall(y, y_pred)
    return 2 * (precision_ * recall_) / (precision_ + recall_)
    
accuracy_arr, precision_arr, recall_arr, f1_score_arr = [], [], [], []

for i, fold in enumerate(folds):
    test_set = fold
    train_sets = [x for j, x in enumerate(folds) if j != i]
    train_sets = np.vstack(train_sets)

    y_test, y_pred = log_reg_GD(train_sets, test_set, 0.01, 100)

    accuracy_arr.append(accuracy(y_test, y_pred))
    # accuracy_arr.append(metrics.accuracy_score(y_test, y_pred))
    precision_arr.append(precision(y_test, y_pred))
    recall_arr.append(recall(y_test, y_pred))
    f1_score_arr.append(f1_score(y_test, y_pred))

print("10-fold cross validation com regressão logistica (GD)")
print("acurácia: %.8f +/- %.8f" % (np.mean(accuracy_arr), np.std(accuracy_arr)))
print("revocação: %.8f +/- %.8f" % (np.mean(precision_arr), np.std(precision_arr)))
print("precisão: %.8f +/- %.8f" % (np.mean(recall_arr), np.std(recall_arr)))
print("f1-score: %.8f +/- %.8f" % (np.mean(f1_score_arr), np.std(f1_score_arr)))


10-fold cross validation com regressão logistica (GD)
acurácia: 0.94382832 +/- 0.02910468
revocação: 0.95011939 +/- 0.05285517
precisão: 0.95954060 +/- 0.03079356
f1-score: 0.95348712 +/- 0.02728494
