In [3]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

#División datos
from sklearn.model_selection import train_test_split
#Cross-Validation
from sklearn.model_selection import cross_validate

#Metricas
import sklearn
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score, recall_score, precision_score, confusion_matrix

#ROC y AUC
from sklearn.metrics import roc_curve, roc_auc_score


#Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


UCI ML Repo: Breast Cancer Wisconsin (Diagnostic) Data Set

http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

In [4]:
breast_c = pd.read_csv('/Users/sebaguerraty-macbook/Github/Aux-IN6531-/Aux Clasification/data/breast-cancer-wisconsin.data')

In [5]:
breast_c.columns  = ['ID number', 'Clump_thickness', 'Uniformity_cell_size', 'Uniformity_cell_shape', 'Marginal_adhesion', 'Single_e_cell_size', 'Bare_nuclei', 'Bland_chromatin', 'Normal_nucleoli', 'Mitoses', 'Class']

In [6]:
breast_c.head()

Unnamed: 0,ID number,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [7]:
## EDA + transformaciones de variable y Dummies (no applica en este caso)

In [8]:
breast_c[ breast_c['Bare_nuclei'] =='?'].shape

(16, 11)

In [9]:
breast_c.dtypes

ID number                 int64
Clump_thickness           int64
Uniformity_cell_size      int64
Uniformity_cell_shape     int64
Marginal_adhesion         int64
Single_e_cell_size        int64
Bare_nuclei              object
Bland_chromatin           int64
Normal_nucleoli           int64
Mitoses                   int64
Class                     int64
dtype: object

In [10]:
breast_c['Bare_nuclei'] = pd.to_numeric(breast_c['Bare_nuclei'], errors='coerce')

In [11]:
breast_c['Bare_nuclei'].fillna(method='ffill', inplace=True)

7. Attribute Information: (class attribute has been moved to last column)

   #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

In [12]:
breast_c['Class'].replace(4, 1, inplace=True)
breast_c['Class'].replace(2, 0, inplace=True)

In [13]:
breast_c.head()

Unnamed: 0,ID number,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10.0,3,2,1,0
1,1015425,3,1,1,1,2,2.0,3,1,1,0
2,1016277,6,8,8,1,3,4.0,3,7,1,0
3,1017023,4,1,1,3,2,1.0,3,1,1,0
4,1017122,8,10,10,8,7,10.0,9,7,1,1


In [14]:
X_train, X_test, y_train, y_test = train_test_split(breast_c.drop(columns=['ID number', 'Class']), breast_c['Class'], test_size=0.2, random_state=123, stratify=breast_c.Class)

In [15]:
models = []
models.append(('LOGIT  ', LogisticRegression()))
models.append(('KNN    ', KNeighborsClassifier()))
models.append(('D_TREE ', DecisionTreeClassifier()))
models.append(('RandomForest ', RandomForestClassifier()))
models.append(('SVM    ', SVC()))

In [19]:

for name, model in models:
    #Ajuste del modelo
    model.fit(X_train, y_train)
    
    
    #Predecir
    y_pred = model.predict(X_test)
    
    #Metricas de testeo
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall  = recall_score(y_test, y_pred)
    
    #metricas de conjunto de entrenamiento
    train_pred = model.predict(X_train)

    train_acc = accuracy_score(y_train, train_pred)
    train_precision = precision_score(y_train, train_pred)
    train_recall = recall_score(y_train, train_pred)
    #Print de datos
    
    print("Modelo: {}".format(name))
    print('train accuracy: {:.2f}, train precision: {:.2f}, train recall: {:.2f}'.format(train_acc, train_precision, train_recall))
    print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}".format(accuracy, precision, recall))
    print("")
    

Modelo: LOGIT  
train accuracy: 0.97, train precision: 0.96, train recall: 0.96
accuracy: 0.94, precision: 0.93, recall: 0.88

Modelo: KNN    
train accuracy: 0.98, train precision: 0.97, train recall: 0.98
accuracy: 0.94, precision: 0.93, recall: 0.90

Modelo: D_TREE 
train accuracy: 1.00, train precision: 1.00, train recall: 1.00
accuracy: 0.91, precision: 0.93, recall: 0.79

Modelo: RandomForest 
train accuracy: 1.00, train precision: 1.00, train recall: 1.00
accuracy: 0.94, precision: 0.93, recall: 0.88

Modelo: SVM    
train accuracy: 0.98, train precision: 0.96, train recall: 0.97
accuracy: 0.95, precision: 0.94, recall: 0.92



In [25]:
help(LogisticRegression.predict_proba)

Help on function predict_proba in module sklearn.linear_model._logistic:

predict_proba(self, X)
    Probability estimates.
    
    The returned estimates for all classes are ordered by the
    label of classes.
    
    For a multi_class problem, if multi_class is set to be "multinomial"
    the softmax function is used to find the predicted probability of
    each class.
    Else use a one-vs-rest approach, i.e calculate the probability
    of each class assuming it to be positive using the logistic function.
    and normalize these values across all the classes.
    
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Vector to be scored, where `n_samples` is the number of samples and
        `n_features` is the number of features.
    
    Returns
    -------
    T : array-like of shape (n_samples, n_classes)
        Returns the probability of the sample for each class in the model,
        where classes are ordered as they are in ``self.class

ROC, AUC Precision y Recall

<img src="https://upload.wikimedia.org/wikipedia/commons/2/26/Precisionrecall.svg" alt="precision and recall" width=400 height=500 >

F1 score = 2 · 
Precision · Recall
/
Precision + Recall

Accuracy (ACC) = 
Σ True positive + Σ True negative
/
Σ Total population

<img src="https://miro.medium.com/max/1400/1*ESe25wSUx7vE-v3w2-v77A.png" alt="balanced dataset" width="500" height="400">

<img src="https://miro.medium.com/max/1400/1*Ij5JIzexbW8JiJSPuDa01g.png" alt="Imbalanced dataset" width=500 height=400>