In [853]:
import numpy as np
import numpy.typing as npt
import matplotlib.pyplot as plt  # Biblioteca para gerar gráficos
import pandas as pd
from sklearn import metrics, model_selection
from scipy import stats
import math


In [854]:
data = np.genfromtxt('breastcancer.csv', delimiter=',')

## Questão 1

Considere o conjunto de dados disponível em `breastcancer.csv`, organizado
em 31 colunas, sendo as 30 primeiras colunas os atributos e a última coluna a
saída. Os 30 atributos coletados de exames médicos são usados no diagnóstico
do câncer de mama, sendo 1 a classe positiva e 0 a classe negativa. Maiores 
detalhes sobre os dados podem ser conferidos em 
https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-dataset.

a) Considerando uma validação cruzada em 10 folds, avalie modelos de classicação binária nos dados em questão. Para tanto, use as abordagens abaixo:
- Regressão logística (treinado com GD ou SGD);
- Análise do discriminante Gaussiano;
- Naive Bayes Gaussiano;

In [855]:

class Set:
    def __init__(self, dataset, features, output):
        self.dataset = dataset
        self.features = features
        self.output = output

    def get_n(self):
        return self.dataset.shape[0]

    def get_x(self):
        return self.dataset[:, self.features]

    def set_x(self, new_x):
        self.x = new_x

    def get_y(self):
        return self.dataset[:, self.output]

    def get_X(self, normal_fun=None):
        if (normal_fun):
            return np.c_[np.ones(self.get_n()), normal_fun(self.get_x())]
        else:
            return np.c_[np.ones(self.get_n()), self.get_x()]


In [856]:
def k_fold_split(array, k = int):
    """realiza o split dos dados em k-folds"""
    shuffled_data = np.random.permutation(array)
    folds = np.array_split(shuffled_data, k)
    return folds

def k_fold_train_test(folds):
    """retorna um vetor com as configurações de treino e teste definidas pelo k-fold split"""
    results = []
    for i, fold in enumerate(folds):
        train = np.vstack([x for j, x in enumerate(folds) if j != i])
        test = fold
        results.append((i, train, test))
    return results
    

folds = k_fold_split(data, 10)

In [857]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))

def GD(X, y, learning_rate, epochs):
    # inicia valores
    n = y.shape[0]
    # w = np.random.randn(X.shape[1])
    w = np.zeros(X.shape[1])

    # atualiza pesos (w) em cada época
    for t in range(epochs):
        ei_sum = np.zeros(X.shape[1])
        for i in range(n):
            ei = y[i] - sigmoid(np.dot(w.T, X[i]))
            ei_sum += np.dot(ei, X[i])
        w = w + learning_rate * (ei_sum/n)
        # w = w + learning_rate * np.mean(np.dot(e, X))
    return w

def log_reg_GD_predict(train_set, test_set, learning_rate, epochs):
    features = np.arange(30)
    output = 30

    # pega colunas de treino
    train_set = Set(train_set, features, output)
    X_train = train_set.get_X(normal_fun=stats.zscore)
    y_train = train_set.get_y()

    # treina modelo
    w_train = GD(X_train, y_train, learning_rate, epochs)

    # pega colunas de teste
    test_set = Set(test_set, features, output)
    X_test = test_set.get_X(normal_fun=stats.zscore)
    y_test = test_set.get_y()

    # testa modelo
    y_pred = sigmoid(np.dot(w_train, X_test.T))
    y_pred = np.around(y_pred)

    return (y_test, y_pred)


In [858]:
# todo analise de discriminante gaussiano
def GDA(X, y):
    pCk_ = []
    mu_k_ = []
    sigma_k_ = []

    N = y.shape[0]
    for k in np.unique(y):
        # calcula p(Ck)
        n = y[y==k].shape[0]
        pCk = n/N
        pCk_.append(pCk)

        # calcula mu_k
        Xk = X[y==k]
        mu_k = Xk.mean()
        mu_k_.append(mu_k)
        
        # calcula sigma_k
        sigma_k = (Xk - mu_k).T.dot(Xk - mu_k) / (Xk.shape[0] - 1)
        sigma_k_.append(sigma_k)

    return (pCk_, mu_k_, sigma_k_)

def GDA_predict(train_set, test_set):
    features = np.arange(30)
    output = 30

    # pega colunas de treino
    train_set = Set(train_set, features, output)
    X_train = train_set.get_X()
    y_train = train_set.get_y()

    # treina modelo
    pC, mu, sigma = GDA(X_train, y_train)
    
    # pega colunas de teste
    test_set = Set(test_set, features, output)
    X_test = test_set.get_X()
    y_test = test_set.get_y()

    y_pred = []
    for i, y in enumerate(y_test):
        y_probabilities = []
        # para cada classe, verifica as probabilidades
        for k, classification in enumerate(np.unique(y_test)):
            eq1 = (-0.5 * np.log(np.linalg.det(sigma[k])))
            aux = X_test[i] - mu[k]
            eq2 = (-0.5 * aux) @ np.linalg.pinv(sigma[k]) @ aux.T
            eq3 = np.log(pC[k])

            prob = eq1 + eq2 + eq3

            y_probabilities.append(prob)

        # escolhe a mais provável
        y_pred.append(np.argmax(y_probabilities))
        # print(y_probabilities)

    return (y_test, y_pred)


In [859]:
# todo naive bayes
def naive_bayes():
    pass

b) Para cada modelo criado, reporte valor médio e desvio padrão das métricas de **acurácia**, **revocação**, **precisão** e **F1-score**

In [860]:
def is_true_positive(y, y_pred):
    return y_pred >= 1 and y >= 1

def is_false_positive(y, y_pred):
    return y_pred >= 1 and y <= 0

def is_true_negative(y, y_pred):
    return y_pred <= 0 and y <= 0

def is_false_negative(y, y_pred):
    return y_pred <= 0 and y >= 1

def verify_predictions(y, y_pred):
    """ returns (tp, fp, tn, fn) """

    tp, fp, tn, fn = 0, 0, 0, 0
    for i, pred in enumerate(y_pred):
        tp += 1 if is_true_positive(y[i], pred) else 0
        fp += 1 if is_false_positive(y[i], pred) else 0
        tn += 1 if is_true_negative(y[i], pred) else 0
        fn += 1 if is_false_negative(y[i], pred) else 0
    return (tp, fp, tn, fn)

def accuracy(y, y_pred):
    tp, fp, tn, fn = verify_predictions(y, y_pred)
    return (tp + tn) / (tp + fp + tn + fn)

def precision(y, y_pred):
    tp, fp, tn, fn = verify_predictions(y, y_pred)
    return tp / (tp + fp)

def recall(y, y_pred):
    tp, fp, tn, fn = verify_predictions(y, y_pred)
    return tp / (tp + fn)

def f1_score(y, y_pred):
    precision_ = precision(y, y_pred)
    recall_ = recall(y, y_pred)
    return 2 * (precision_ * recall_) / (precision_ + recall_)


In [861]:
gd_accuracy_arr, gd_precision_arr, gd_recall_arr, gd_f1_score_arr = [], [], [], []
for i, train, test in k_fold_train_test(folds):
      y_test, y_pred = log_reg_GD_predict(train, test, 0.01, 100)
      
      gd_accuracy_arr.append(accuracy(y_test, y_pred))
      # gd_accuracy_arr.append(metrics.accuracy_score(y_test, y_pred))
      gd_precision_arr.append(precision(y_test, y_pred))
      gd_recall_arr.append(recall(y_test, y_pred))
      gd_f1_score_arr.append(f1_score(y_test, y_pred))

print("10-fold cross validation com regressão logistica (GD)")
print("acurácia: %.8f +/- %.8f" %
      (np.mean(gd_accuracy_arr), np.std(gd_accuracy_arr)))
print("revocação: %.8f +/- %.8f" %
      (np.mean(gd_precision_arr), np.std(gd_precision_arr)))
print("precisão: %.8f +/- %.8f" % 
      (np.mean(gd_recall_arr), np.std(gd_recall_arr)))
print("f1-score: %.8f +/- %.8f" %
      (np.mean(gd_f1_score_arr), np.std(gd_f1_score_arr)))


10-fold cross validation com regressão logistica (GD)
acurácia: 0.93671679 +/- 0.02250935
revocação: 0.95675528 +/- 0.02937758
precisão: 0.94437170 +/- 0.04157466
f1-score: 0.94940196 +/- 0.01628936


In [862]:
gda_accuracy_arr, gda_precision_arr, gda_recall_arr, gda_f1_score_arr = [], [], [], []
for i, train, test in k_fold_train_test(folds):
      y_test, y_pred = GDA_predict(train, test)
      
      gda_accuracy_arr.append(accuracy(y_test, y_pred))
      gda_precision_arr.append(precision(y_test, y_pred))
      gda_recall_arr.append(recall(y_test, y_pred))
      gda_f1_score_arr.append(f1_score(y_test, y_pred))

print("10-fold cross validation com GDA")
print("acurácia: %.8f +/- %.8f" %
      (np.mean(gda_accuracy_arr), np.std(gda_accuracy_arr)))
print("revocação: %.8f +/- %.8f" %
      (np.mean(gda_precision_arr), np.std(gda_precision_arr)))
print("precisão: %.8f +/- %.8f" % 
      (np.mean(gda_recall_arr), np.std(gda_recall_arr)))
print("f1-score: %.8f +/- %.8f" %
      (np.mean(gda_f1_score_arr), np.std(gda_f1_score_arr)))

10-fold cross validation com GDA
acurácia: 0.95949248 +/- 0.02636797
revocação: 0.96743928 +/- 0.03143540
precisão: 0.96947485 +/- 0.02300408
f1-score: 0.96804941 +/- 0.01931077
