# Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics

In [None]:
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast-cancer dataset in pandas form.
data = load_breast_cancer(as_frame=True)
# `frame` is the DataFrame used by the rest of the notebook; it includes the
# "target" column that later cells read.
df_cancer = data.frame
# Preview the first rows (head() defaults to 5).
df_cancer.head()

# Análisis del Dataset

In [None]:
# Pairwise correlation between every column of the dataset (features and target).
correlation = df_cancer.corr()
threshold = 0.75
# Select the columns whose absolute correlation with the target exceeds the
# threshold. "target" itself always qualifies (its self-correlation is 1),
# which is what allows it to be used as the hue below.
# The mask is deliberately not called `filter`, so the builtin of that name
# stays usable in the rest of the notebook.
strong_mask = correlation["target"].abs() > threshold
correlation_features = correlation.columns[strong_mask].tolist()
sns.pairplot(df_cancer[correlation_features], diag_kind="kde", hue="target")
plt.show()

# Preprocesamiento de datos

In [None]:
def preparacion_de_datos(df_input, target, escalar_valores=False):
    """Split a DataFrame into a feature matrix and a target column.

    Args:
        df_input: DataFrame holding the feature columns plus the target column.
            It is not modified.
        target: Name of the target column.
        escalar_valores: When True, every feature column is standardized with
            ``StandardScaler(with_mean=True, with_std=True)`` fitted on all
            rows of ``df_input``.

    Returns:
        Tuple ``(x, y)`` where ``x`` is a DataFrame with the (optionally
        scaled) features and ``y`` is ``df_input[target]``.
    """
    features = df_input.drop(target, axis=1)
    y = df_input[target]
    if not escalar_valores:
        return features, y

    scaler_X = StandardScaler(with_mean=True, with_std=True)
    # Carry the original index over to the scaled frame: without it the result
    # would be re-indexed 0..n-1 and would no longer line up row-by-row with
    # `y` whenever df_input has a non-default index.
    x = pd.DataFrame(
        scaler_X.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    return x, y

In [None]:
# Features and labels without scaling, used for the first round of models.
X_cancer, y_cancer = preparacion_de_datos(df_cancer, "target", escalar_valores=False)

In [None]:
# 70/30 train/test split, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer,
    y_cancer,
    test_size=0.3,
    stratify=y_cancer,
    random_state=0,
)

# Entrenamos los modelos

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# ---- Models trained on the unscaled features ----

# Logistic regression with the iteration cap raised well above the default.
logreg = LogisticRegression(max_iter=3000)
logreg.fit(X_train, y_train)
y_pred_lg = logreg.predict(X_test)

# Decision tree, depth-limited and seeded.
treeclf = DecisionTreeClassifier(max_depth=10, random_state=1)
treeclf.fit(X_train, y_train)
y_pred_tc = treeclf.predict(X_test)

# Multinomial naive Bayes (only trained on the unscaled data).
bayes_multi = MultinomialNB()
bayes_multi.fit(X_train, y_train)
y_pred_nb = bayes_multi.predict(X_test)

# Gaussian naive Bayes.
bayes_gauss = GaussianNB()
bayes_gauss.fit(X_train, y_train)
y_pred_gauss = bayes_gauss.predict(X_test)

# k-nearest neighbours. This used to instantiate a second
# DecisionTreeClassifier, so the "knn" results merely duplicated the tree.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# ---- Same models on the standardized dataset ----
# X_train / X_test / y_train / y_test are rebound to the scaled split from here
# on; the split arguments are the same as for the unscaled split.
# NOTE(review): preparacion_de_datos fits the scaler on the whole dataset
# before the split, so test rows influence the scaling of the training rows.
X_cancer, y_cancer = preparacion_de_datos(df_cancer, "target", escalar_valores=True)
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer,
    y_cancer,
    test_size=0.3,
    stratify=y_cancer,
    random_state=0,
)

logreg_s = LogisticRegression(max_iter=1000)
logreg_s.fit(X_train, y_train)
y_pred_lg_s = logreg_s.predict(X_test)

bayes_gauss_s = GaussianNB()
bayes_gauss_s.fit(X_train, y_train)
y_pred_gauss_s = bayes_gauss_s.predict(X_test)

# k-nearest neighbours on the scaled features (same fix as for `knn`).
knn_s = KNeighborsClassifier()
knn_s.fit(X_train, y_train)
y_pred_knn_s = knn_s.predict(X_test)

# Comparamos por Curva ROC

In [None]:
def graficarCurvaRoc(y_pred, model, y_true=None):
    """Plot one ROC curve on the current matplotlib figure and return its AUC.

    Args:
        y_pred: Predictions (labels or scores) for the evaluation samples,
            in the same order as the true labels.
        model: Name of the model, used as the legend label.
        y_true: True labels to evaluate against. When omitted, the
            notebook-level ``y_test`` is used, as before.

    Returns:
        The area under the ROC curve as computed by ``metrics.roc_auc_score``.
    """
    if y_true is None:
        # Fall back to the global test labels of the notebook.
        y_true = y_test
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_pred)
    plt.plot(fpr, tpr, label=f"{model} AUC={round(auc, 4)}")
    # Refresh the legend on every call so each new curve shows up (loc=4 is
    # matplotlib's code for "lower right").
    plt.legend(loc=4, fontsize=12)
    return auc

In [None]:
# Create the figure and set the axis labels.
plt.figure(figsize=(20, 10))
plt.xlabel('% 1 – Specificity (falsos positivos)', fontsize=14)
plt.ylabel('% Sensitivity (positivos)', fontsize=14)

# Chance line: the full diagonal from (0, 0) to (1, 1). It was previously built
# from range(100)/100 and therefore stopped at 0.99.
plt.plot([0, 1], [0, 1], label="AZAR AUC=0.5", color="black")

# Legend name -> test-set predictions of each trained model.
# NOTE(review): these are the outputs of .predict(), i.e. class labels rather
# than scores, so each curve is drawn from very few points.
modelos = {
    'bayesMulti': y_pred_nb,
    'bayesGauss': y_pred_gauss,
    'arbol': y_pred_tc,
    'reglog': y_pred_lg,
    'knn': y_pred_knn,
    'bayesGauss_s': y_pred_gauss_s,
    'reglog_s': y_pred_lg_s,
    'knn_s': y_pred_knn_s,
}

# Draw one curve per model and collect (name, AUC) pairs for the summary table.
areas = []
for nombre, predicciones in modelos.items():
    areas.append((nombre, graficarCurvaRoc(predicciones, nombre)))
areas = pd.DataFrame(areas, columns=['model', 'auc'])

# Title and tick font size.
plt.title("Curva ROC", fontsize=14)
plt.tick_params(labelsize=12)
plt.show()

In [None]:
# Rank the models from highest to lowest AUC.
areas.sort_values(by='auc', ascending=False)

# Mejoramos el modelo elegido con GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid: 13 log-spaced values of C from 1e-3 to 1e3, both penalty types
# and three iteration caps.
parameters = {
    "C": np.logspace(-3, 3, 13),
    "penalty": ["l1", "l2"],
    "max_iter": [100, 500, 3000],
}
# The grid includes the l1 penalty, which the default lbfgs solver rejects, so
# every l1 candidate would fail to fit. liblinear supports both l1 and l2.
clf = GridSearchCV(
    LogisticRegression(solver="liblinear"),
    parameters,
    scoring="roc_auc",
    cv=5,  # 5-fold cross-validation
)
clf.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameter combination and its cross-validated score.
for etiqueta, valor in (
    ("Best Parameters:", clf.best_params_),
    ("Best Score:", clf.best_score_),
):
    print(etiqueta, valor)

# Ejercicio
### Análisis
- Averiguar la distribución de la variable target.
- Averiguar cuáles de las variables son numéricas.
- Graficar un Heatmap de la correlación entre las variables numéricas y el target.
### Evaluación de Modelos
- Incluir el modelo obtenido por gridsearch en el gráfico de la curva ROC
- Comparar Accuracy de los modelos RegLog, Arbol y Naive Bayes (Gaussiano)