In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
from sys import maxsize #para imprimir arrays completos
import numpy as np
import pandas as pd

from sklearn import preprocessing #para normalizar datos
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import roc_curve, auc, precision_score, recall_score

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Métodos particulares del data frame

In [3]:
# Human-readable labels for the single-letter codes of each mushroom attribute.
# 'class' and 'bruises' are binary attributes and are encoded as 0/1 flags.
_MUSHROOM_VALUE_LABELS = {
    # classes: edible=e, poisonous=p
    'class': {'e': 1, 'p': 0},
    'cap-shape': {
        'b': 'bell', 'c': 'conical', 'x': 'convex',
        'f': 'flat', 'k': 'knobbed', 's': 'sunken',
    },
    'cap-surface': {'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'},
    'cap-color': {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'r': 'green',
        'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    # bruises: bruises=t, no=f
    'bruises': {'t': 1, 'f': 0},
    'odor': {
        'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul',
        'm': 'musty', 'n': 'none', 'p': 'pungent', 's': 'spicy',
    },
    'gill-attachment': {'a': 'attached', 'd': 'descending', 'f': 'free', 'n': 'notched'},
    'gill-spacing': {'c': 'close', 'w': 'crowded', 'd': 'distant'},
    'gill-size': {'b': 'broad', 'n': 'narrow'},
    'gill-color': {
        'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray',
        'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red',
        'w': 'white', 'y': 'yellow',
    },
    'stalk-shape': {'e': 'enlarging', 't': 'tapering'},
    'stalk-root': {
        'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal',
        'z': 'rhizomorphs', 'r': 'rooted', '?': 'missing',
    },
    'stalk-surface-above-ring': {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    'stalk-surface-below-ring': {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    'stalk-color-above-ring': {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
        'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    'stalk-color-below-ring': {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
        'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    'veil-type': {'p': 'partial', 'u': 'universal'},
    'veil-color': {'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
    'ring-number': {'n': 'none', 'o': 'one', 't': 'two'},
    'ring-type': {
        'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large',
        'n': 'none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone',
    },
    # green is coded 'r' for this attribute; the previous implementation
    # replaced 'g' instead, which left every 'r' value untranslated.
    'spore-print-color': {
        'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green',
        'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow',
    },
    'population': {
        'a': 'abundant', 'c': 'clustered', 'n': 'numerous',
        's': 'scattered', 'v': 'several', 'y': 'solitary',
    },
    'habitat': {
        'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths',
        'u': 'urban', 'w': 'waste', 'd': 'woods',
    },
}


def getDescriptForEachColumns(df):
    """Replace the single-letter codes of ``df`` with readable labels, in place.

    Every column that has an entry in ``_MUSHROOM_VALUE_LABELS`` gets its codes
    replaced by the corresponding label (for example ``'b'`` becomes ``'bell'``
    in ``cap-shape``). Values without a mapping and columns without an entry
    are left untouched.

    The binary columns are turned into flags: ``class`` becomes 1 for ``'e'``
    and 0 for ``'p'`` and is renamed to ``edible``; ``bruises`` becomes 1 for
    ``'t'`` and 0 for ``'f'``.

    Parameters:
        df: pandas DataFrame to modify. It is mutated in place.

    Returns:
        None.
    """
    for column in list(df.columns):
        labels = _MUSHROOM_VALUE_LABELS.get(column)
        if labels is None:
            continue
        # infer_objects() gives the 0/1 columns a numeric dtype on pandas
        # versions where replace() no longer downcasts object results.
        df[column] = df[column].replace(labels).infer_objects()

    # Rename only after all columns have been processed.
    if 'class' in df.columns:
        df.rename(columns={'class': 'edible'}, inplace=True)

In [4]:
def getInfoByColumn(df):
    """Print basic information about every column of ``df``.

    For each column this prints the result of ``describe()`` followed by the
    column's distinct values. When a column has fewer than 10 distinct values
    all of them are printed; otherwise only the first 30 are printed, followed
    by ``,etc...`` to keep the output readable.

    Parameters:
        df: pandas DataFrame to inspect.

    Returns:
        None. Everything is written to standard output.
    """
    for name in df:
        series = df[name]
        summary = series.describe()
        distinct = series.unique()

        if len(distinct) < 10:
            shown, suffix = distinct, ''
        else:
            shown, suffix = distinct[0:30], ',etc...'
        unique_line = 'Show Unique  ' + str(shown).strip('[]') + suffix

        print('Información columna: {} \n''---------------\n{}'.format(name, summary))
        print('{}''\n'.format(unique_line))

In [5]:
def getHistogramByColumn(df):
    """Draw and show one histogram per column of ``df``.

    Each figure gets the column name as x label and as part of the title.
    After showing the figure, the object returned by ``Series.hist`` is printed.

    Parameters:
        df: pandas DataFrame whose columns are plotted one by one.

    Returns:
        None.
    """
    axis_label_style = {'fontsize': 13, 'color': 'green'}

    for name in df:
        # Histogram of the current column.
        axes = df[name].hist(grid=False, color='indigo', bins=10, xlabelsize=10, xrot=45)

        # Axis names and title.
        plt.xlabel(name, **axis_label_style)
        plt.ylabel('Freq.', **axis_label_style)
        plt.title('Columna: ' + name, fontsize=20, color='mediumslateblue')

        # NOTE(review): the legend labels are the column's raw values, row by
        # row -- confirm this is the intended legend content.
        plt.legend(labels=df[name], loc='upper right', fontsize='small', bbox_to_anchor=(1.3, 1))
        plt.show()
        print(axes)

In [6]:
def getStatisticForEachColumn(df):
    """Print the standard deviation, variance and mean of every column of ``df``.

    Parameters:
        df: pandas DataFrame; ``std()``, ``mean()`` and ``var()`` are called on
            each of its columns.

    Returns:
        None. One report per column is written to standard output.
    """
    report = 'Statistics mesures from:{}\n-----------------------------\nSTD:{}\nVAR: {}\nMean: {}\n'

    for name in df:
        series = df[name]
        deviation = series.std()
        average = series.mean()
        variance = series.var()
        print(report.format(name, deviation, variance, average))

In [7]:
def plot_corr(df, size=10):
    '''Plot a DataFrame as a matrix image, with its column names as tick labels.

    The frame is handed to ``matshow`` exactly as received; no correlation is
    computed inside this function.
    NOTE(review): presumably callers pass an already computed correlation
    matrix (e.g. the result of ``DataFrame.corr()``) -- confirm.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot'''
    _, ax = plt.subplots(figsize=(size, size),)
    ax.matshow(df)

    # The column names label both axes.
    tick_positions = range(len(df.columns))
    plt.xticks(tick_positions, df.columns)
    plt.yticks(tick_positions, df.columns)

# Métodos relacionados al modelo KNN

In [8]:
def getScoresForHyperparameterK(quantityK, stepK, model_X_train, model_y_train, kFold_N_Splits=5, KFold_shuffle=True, useStandarization=False):
    """Cross-validate a KNN classifier for a range of ``n_neighbors`` values.

    Parameters:
        quantityK: largest number of neighbours to try (inclusive).
        stepK: step between consecutive values, starting at 1.
        model_X_train: training features.
        model_y_train: training target.
        kFold_N_Splits: number of cross-validation folds.
        KFold_shuffle: whether the folds are shuffled (with a fixed seed).
        useStandarization: if True, the features are standardized with
            ``StandardScaler`` before cross-validation.

    Returns:
        A DataFrame with one row per value tried and the columns
        ``score_medio`` (mean fold score), ``score_std`` (standard deviation of
        the fold scores), ``n_neighbours``, ``limite_inferior`` (mean - std)
        and ``limite_superior`` (mean + std).
    """
    # KFold rejects a random_state when shuffle is False, so the seed is only
    # passed when the folds are actually shuffled.
    kf = KFold(n_splits=kFold_N_Splits, shuffle=KFold_shuffle,
               random_state=12 if KFold_shuffle else None)

    # NOTE(review): the scaler is fitted on the whole training set before the
    # folds are made, so each validation fold also contributes to the scaling.
    if useStandarization:
        scaler = StandardScaler()
        model_X_train = scaler.fit_transform(model_X_train)

    scores_para_df = []
    for n_neighbors in range(1, quantityK + 1, stepK):
        # A fresh model per hyperparameter value.
        model = KNeighborsClassifier(n_neighbors=n_neighbors)

        # One score per fold.
        cv_scores = cross_val_score(model, model_X_train, model_y_train, cv=kf)

        scores_para_df.append({'score_medio': np.mean(cv_scores),
                               'score_std': np.std(cv_scores),
                               'n_neighbours': n_neighbors})

    # Explicit columns keep the result well-formed when no value was tried.
    dfResult = pd.DataFrame(scores_para_df, columns=['score_medio', 'score_std', 'n_neighbours'])
    dfResult['limite_inferior'] = dfResult['score_medio'] - dfResult['score_std']
    dfResult['limite_superior'] = dfResult['score_medio'] + dfResult['score_std']

    return dfResult
    

In [9]:
def getKNNPredictions(X_train, y_train, X_test, y_test, K, useStandarization = False):
    """Fit a KNN classifier with ``K`` neighbours and predict the test set.

    Parameters:
        X_train, y_train: training features and target.
        X_test, y_test: test features and target.
        K: number of neighbours.
        useStandarization: if True, a ``StandardScaler`` is fitted on
            ``X_train`` and applied to both ``X_train`` and ``X_test``.

    Returns:
        A tuple ``(y_pred, score)`` with the predictions for ``X_test`` and the
        value of ``model.score(X_test, y_test)``.
    """
    train_features, test_features = X_train, X_test
    if useStandarization:
        scaler = StandardScaler()
        train_features = scaler.fit_transform(X_train)
        test_features = scaler.transform(X_test)

    knn = KNeighborsClassifier(n_neighbors=K)
    print(knn)

    knn.fit(train_features, y_train)
    predictions = knn.predict(test_features)
    test_score = knn.score(test_features, y_test)

    return predictions, test_score

# Metodos relacionados al modelo Regresion Logistica

In [10]:
def getScoresForHyperparameterC(valoresPosiblesC, model_X_train, model_y_train, kFold_N_Splits=5, KFold_shuffle=True, useStandarization=False):
    """Cross-validate a logistic regression for each candidate value of ``C``.

    Parameters:
        valoresPosiblesC: iterable with the values of ``C`` to try.
        model_X_train: training features.
        model_y_train: training target.
        kFold_N_Splits: number of cross-validation folds.
        KFold_shuffle: whether the folds are shuffled (with a fixed seed).
        useStandarization: if True, the features are standardized with
            ``StandardScaler`` before cross-validation.

    Returns:
        A DataFrame with one row per value tried and the columns
        ``score_medio`` (mean fold score), ``score_std`` (standard deviation of
        the fold scores), ``C``, ``limite_inferior`` (mean - std) and
        ``limite_superior`` (mean + std).
    """
    # KFold rejects a random_state when shuffle is False, so the seed is only
    # passed when the folds are actually shuffled.
    kf = KFold(n_splits=kFold_N_Splits, shuffle=KFold_shuffle,
               random_state=12 if KFold_shuffle else None)

    # NOTE(review): the scaler is fitted on the whole training set before the
    # folds are made, so each validation fold also contributes to the scaling.
    if useStandarization:
        scaler = StandardScaler()
        model_X_train = scaler.fit_transform(model_X_train)

    scores_para_df = []
    for c_value in valoresPosiblesC:
        # The solver is set explicitly to avoid the FutureWarning about the
        # default solver changing to 'lbfgs'.
        model = linear_model.LogisticRegression(C=c_value, solver='lbfgs', class_weight='balanced')
        cv_scores = cross_val_score(model, model_X_train, model_y_train, cv=kf)

        scores_para_df.append({'score_medio': np.mean(cv_scores),
                               'score_std': np.std(cv_scores),
                               'C': c_value})

    # Explicit columns keep the result well-formed when no value was tried.
    dfResult = pd.DataFrame(scores_para_df, columns=['score_medio', 'score_std', 'C'])
    dfResult['limite_inferior'] = dfResult['score_medio'] - dfResult['score_std']
    dfResult['limite_superior'] = dfResult['score_medio'] + dfResult['score_std']

    return dfResult

In [11]:
def getLogisticRegressionPredictions(X_train, y_train, X_test, y_test, C, useStandarization = False):
    """Fit a class-balanced logistic regression and predict the test set.

    Parameters:
        X_train, y_train: training features and target.
        X_test, y_test: test features and target.
        C: value passed as the ``C`` parameter of ``LogisticRegression``.
        useStandarization: if True, a ``StandardScaler`` is fitted on
            ``X_train`` and applied to both ``X_train`` and ``X_test``.

    Returns:
        A tuple ``(y_pred, score, coef)`` with the predictions for ``X_test``,
        the value of ``model.score(X_test, y_test)`` and the fitted
        ``model.coef_``.
    """
    train_features, test_features = X_train, X_test
    if useStandarization:
        scaler = StandardScaler()
        train_features = scaler.fit_transform(X_train)
        test_features = scaler.transform(X_test)

    # The solver is set explicitly to avoid the FutureWarning about the
    # default solver changing to 'lbfgs'.
    classifier = linear_model.LogisticRegression(C=C, solver='lbfgs', class_weight='balanced')
    print(classifier)

    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)
    test_score = classifier.score(test_features, y_test)

    return predictions, test_score, classifier.coef_

# Métodos para Bernoulli Naive Bayes

In [12]:
def getScoresForHypermarameterAlphaNB(valoresPosiblesAlpha, model_X_train, model_y_train, kFold_N_Splits=5, KFold_shuffle=True, useStandarization=False):
    """Cross-validate a Bernoulli Naive Bayes model for each candidate ``alpha``.

    Parameters:
        valoresPosiblesAlpha: iterable with the values of ``alpha`` to try.
        model_X_train: training features.
        model_y_train: training target.
        kFold_N_Splits: number of cross-validation folds.
        KFold_shuffle: whether the folds are shuffled (with a fixed seed).
        useStandarization: if True, the features are standardized with
            ``StandardScaler`` before cross-validation.

    Returns:
        A DataFrame with one row per value tried and the columns
        ``score_medio`` (mean fold score), ``score_std`` (standard deviation of
        the fold scores), ``Alpha``, ``limite_inferior`` (mean - std) and
        ``limite_superior`` (mean + std).
    """
    # KFold rejects a random_state when shuffle is False, so the seed is only
    # passed when the folds are actually shuffled.
    kf = KFold(n_splits=kFold_N_Splits, shuffle=KFold_shuffle,
               random_state=12 if KFold_shuffle else None)

    # NOTE(review): the scaler is fitted on the whole training set before the
    # folds are made, so each validation fold also contributes to the scaling.
    if useStandarization:
        scaler = StandardScaler()
        model_X_train = scaler.fit_transform(model_X_train)

    scores_para_df = []
    for alpha in valoresPosiblesAlpha:
        model = BernoulliNB(alpha=alpha)
        cv_scores = cross_val_score(model, model_X_train, model_y_train, cv=kf)

        scores_para_df.append({'score_medio': np.mean(cv_scores),
                               'score_std': np.std(cv_scores),
                               'Alpha': alpha})

    # Explicit columns keep the result well-formed when no value was tried.
    dfResult = pd.DataFrame(scores_para_df, columns=['score_medio', 'score_std', 'Alpha'])
    dfResult['limite_inferior'] = dfResult['score_medio'] - dfResult['score_std']
    dfResult['limite_superior'] = dfResult['score_medio'] + dfResult['score_std']

    return dfResult

In [13]:
def getBernoulliNaiveBayesPredictions(bestAlpha, X_train, y_train, X_test, y_test):
    """Fit a Bernoulli Naive Bayes model and predict the test set.

    Parameters:
        bestAlpha: value passed as the ``alpha`` parameter of ``BernoulliNB``.
        X_train, y_train: training features and target.
        X_test, y_test: test features and target.

    Returns:
        A tuple ``(y_pred, score)`` with the predictions for ``X_test`` and the
        value of ``model.score(X_test, y_test)``.
    """
    classifier = BernoulliNB(alpha = bestAlpha)
    classifier.fit(X_train, y_train)

    print(classifier)

    predictions = classifier.predict(X_test)
    test_score = classifier.score(X_test, y_test)

    return predictions, test_score

# Metodos para Multinomial Naive Bayes

In [14]:
def getScoresForHypermarameterAlphaMNB(valoresPosiblesAlpha, model_X_train, model_y_train, kFold_N_Splits=5, KFold_shuffle=True, useStandarization=False):
    """Cross-validate a Multinomial Naive Bayes model for each candidate ``alpha``.

    Parameters:
        valoresPosiblesAlpha: iterable with the values of ``alpha`` to try.
        model_X_train: training features.
        model_y_train: training target.
        kFold_N_Splits: number of cross-validation folds.
        KFold_shuffle: whether the folds are shuffled (with a fixed seed).
        useStandarization: if True, the features are standardized with
            ``StandardScaler`` before cross-validation.
            NOTE(review): standardized features are centred and so normally
            contain negative values, which MultinomialNB is documented to
            reject -- confirm before enabling this option.

    Returns:
        A DataFrame with one row per value tried and the columns
        ``score_medio`` (mean fold score), ``score_std`` (standard deviation of
        the fold scores), ``Alpha``, ``limite_inferior`` (mean - std) and
        ``limite_superior`` (mean + std).
    """
    # KFold rejects a random_state when shuffle is False, so the seed is only
    # passed when the folds are actually shuffled.
    kf = KFold(n_splits=kFold_N_Splits, shuffle=KFold_shuffle,
               random_state=12 if KFold_shuffle else None)

    # NOTE(review): the scaler is fitted on the whole training set before the
    # folds are made, so each validation fold also contributes to the scaling.
    if useStandarization:
        scaler = StandardScaler()
        model_X_train = scaler.fit_transform(model_X_train)

    scores_para_df = []
    for alpha in valoresPosiblesAlpha:
        model = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)
        cv_scores = cross_val_score(model, model_X_train, model_y_train, cv=kf)

        scores_para_df.append({'score_medio': np.mean(cv_scores),
                               'score_std': np.std(cv_scores),
                               'Alpha': alpha})

    # Explicit columns keep the result well-formed when no value was tried.
    dfResult = pd.DataFrame(scores_para_df, columns=['score_medio', 'score_std', 'Alpha'])
    dfResult['limite_inferior'] = dfResult['score_medio'] - dfResult['score_std']
    dfResult['limite_superior'] = dfResult['score_medio'] + dfResult['score_std']

    return dfResult

In [15]:
def getMultinomialNaiveBayesPredictions(bestAlpha, X_train, y_train, X_test, y_test):
    """Fit a Multinomial Naive Bayes model and predict the test set.

    Parameters:
        bestAlpha: value passed as the ``alpha`` parameter of ``MultinomialNB``.
        X_train, y_train: training features and target.
        X_test, y_test: test features and target.

    Returns:
        A tuple ``(y_pred, score)`` with the predictions for ``X_test`` and the
        value of ``model.score(X_test, y_test)``.
    """
    classifier = MultinomialNB(alpha = bestAlpha, fit_prior=True, class_prior=None)
    classifier.fit(X_train, y_train)

    print(classifier)

    predictions = classifier.predict(X_test)
    test_score = classifier.score(X_test, y_test)

    return predictions, test_score

# Matriz de confusion

<table>
  <thead>
    <tr>
      <th>PREDICHOS</th>
      <th>0 (F)</th>
      <th>1 (V)</th>
    </tr>
      <tr>
      <th>REALES</th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>0 (F)</td>
      <td>TN</td>
      <td>FP</td>
    </tr>
    <tr>
      <td>1 (V)</td>
      <td>FN</td>
      <td>TP</td>
    </tr>
  </tbody>
</table>

TN: True Negative
FN: False Negative
FP: False Positive
TP: True Positive

In [16]:
def getConfusionMatrix(y_test, y_pred, size=5):
    """Plot the confusion matrix as a heatmap and return its flattened cells.

    Parameters:
        y_test: true labels.
        y_pred: predicted labels.
        size: width and height of the figure.

    Returns:
        The confusion matrix flattened with ``ravel()``; for a binary problem
        the values can be unpacked as ``tn, fp, fn, tp``.
    """
    matrix = confusion_matrix(y_test, y_pred)

    # New figure of the requested size; the heatmap is drawn on its axes.
    plt.subplots(figsize=(size, size))
    sns.heatmap(matrix, annot=True, fmt='d', linewidths=.5, cmap="Blues")
    plt.ylabel('Valores verdaderos')
    plt.xlabel('Valores predichos')

    return matrix.ravel()

# Métodos relacionados a métricas

##### Accuracy, para problemas que estén equilibrados y no sesgados

In [17]:
def getModelAccuracy(tn, fp, fn, tp):
    """Return the accuracy: correct predictions divided by all predictions.

    Parameters:
        tn, fp, fn, tp: confusion-matrix counts (true negatives, false
            positives, false negatives, true positives).
    """
    correct = tp + tn
    total = tn + fp + fn + tp
    return correct / total

##### Para tratar de capturar la mayor cantidad de positivos posibles (cuando para el modelo es necesario capturar también los falsos positivos)

In [18]:
def getModelRecall(tn, fp, fn, tp):
    """Return the recall: true positives divided by all actual positives.

    Parameters:
        tn, fp, fn, tp: confusion-matrix counts; only ``tp`` and ``fn`` are used.
    """
    actual_positives = tp + fn
    return tp / actual_positives

In [19]:
def getModelRecall(y_test, y_pred, *confusion_counts):
    """Return the recall, from labels or from confusion-matrix counts.

    ``getModelRecall`` is defined twice in this notebook and this later
    definition replaces the earlier count-based one, so both call styles are
    accepted here:

    * ``getModelRecall(y_test, y_pred)`` returns ``recall_score(y_test, y_pred)``.
    * ``getModelRecall(tn, fp, fn, tp)`` returns ``tp / (tp + fn)``.

    Raises:
        TypeError: if called with a number of positional arguments other than
            2 or 4.
    """
    if not confusion_counts:
        return recall_score(y_test, y_pred)

    if len(confusion_counts) != 2:
        raise TypeError(
            'getModelRecall() expects (y_test, y_pred) or (tn, fp, fn, tp); got {} positional arguments'.format(
                2 + len(confusion_counts)))

    # Four-argument call: the first two positions hold tn and fp, unused here.
    fn, tp = confusion_counts
    return tp / (tp + fn)

##### Para cuando se quiere estar muy seguro de una predicción positiva. Mide la capacidad del clasificador de no etiquetar como positiva una muestra que es negativa (1 es el mejor valor)

In [20]:
def getModelPrecision(tn, fp, fn, tp):
    """Return the precision: true positives divided by all predicted positives.

    Parameters:
        tn, fp, fn, tp: confusion-matrix counts; only ``tp`` and ``fp`` are used.
    """
    predicted_positives = tp + fp
    return tp / predicted_positives

In [21]:
def getModelPrecision(y_test, y_pred, *confusion_counts):
    """Return the precision, from labels or from confusion-matrix counts.

    ``getModelPrecision`` is defined twice in this notebook and this later
    definition replaces the earlier count-based one, so both call styles are
    accepted here:

    * ``getModelPrecision(y_test, y_pred)`` returns
      ``precision_score(y_test, y_pred)``.
    * ``getModelPrecision(tn, fp, fn, tp)`` returns ``tp / (tp + fp)``.

    Raises:
        TypeError: if called with a number of positional arguments other than
            2 or 4.
    """
    if not confusion_counts:
        return precision_score(y_test, y_pred)

    if len(confusion_counts) != 2:
        raise TypeError(
            'getModelPrecision() expects (y_test, y_pred) or (tn, fp, fn, tp); got {} positional arguments'.format(
                2 + len(confusion_counts)))

    # Four-argument call: the second position holds fp, the last one tp.
    fp = y_pred
    tp = confusion_counts[1]
    return tp / (tp + fp)

##### Metrica F1, cuanto mayor es su valor, mejor es el modelo

In [22]:
def getModelMetricF1(tn, fp, fn, tp):
    """Return the F1 score (harmonic mean of precision and recall).

    Precision and recall are computed here directly from the counts: the
    notebook's ``getModelPrecision`` / ``getModelRecall`` names end up bound to
    the two-argument label-based versions, so calling them with the four
    counts raised a TypeError.

    Parameters:
        tn, fp, fn, tp: confusion-matrix counts (``tn`` is not used).
    """
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    return (2 * precision * recall) / (precision + recall)

##### Métrica F beta, similar a F1, pero puede regularse la importancia de cada término mediante el valor de beta. Si beta > 1 => favorece al recall. Si beta < 1 => favorece a la precisión

In [23]:
def getModelMetricFBeta(betaCoeficient, tn, fp, fn, tp):
    """Return the F-beta score for the given confusion-matrix counts.

    ``betaCoeficient`` weights recall against precision: values above 1 favour
    recall, values below 1 favour precision.

    Precision and recall are computed here directly from the counts: the
    notebook's ``getModelPrecision`` / ``getModelRecall`` names end up bound to
    the two-argument label-based versions, so calling them with the four
    counts raised a TypeError.

    Parameters:
        betaCoeficient: the beta weight.
        tn, fp, fn, tp: confusion-matrix counts (``tn`` is not used).
    """
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    beta_squared = betaCoeficient * betaCoeficient

    return (1 + beta_squared) * (precision * recall) / (beta_squared * precision + recall)

### Curva ROC

##### Sensitivity (True Positive Rate)

In [24]:
def getTruePositiveRatio(tn, fp, fn, tp):
    """Return the true positive rate (sensitivity): ``tp / (tp + fn)``.

    Parameters:
        tn, fp, fn, tp: confusion-matrix counts; only ``tp`` and ``fn`` are used.
    """
    actual_positives = tp + fn
    return tp / actual_positives

##### False Positive Rate (1 - Specificity)

In [25]:
def getFalsePositiveRatio(tn, fp, fn, tp):
    """Return the false positive rate: ``fp / (fp + tn)``.

    Parameters:
        tn, fp, fn, tp: confusion-matrix counts; only ``fp`` and ``tn`` are used.
    """
    actual_negatives = fp + tn
    return fp / actual_negatives

##### Ploteo de curvas ROC

In [26]:
def plotSingleROC_Curve(modelName, y_test, y_prob):
    """Plot the ROC curve of a single model, with its AUC in the legend.

    Parameters:
        modelName: text placed at the start of the plot title; it is joined to
            the rest of the title with no separator in between.
        y_test: true labels.
        y_prob: scores passed to ``roc_curve`` together with ``y_test``.

    Returns:
        None.
    """
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    area = auc(fpr, tpr)
    auc_label = 'AUC = %0.2f' % area

    plt.figure(figsize=(5, 5))
    plt.title(modelName + 'Receiver Operating Characteristic Curve')
    plt.plot(fpr, tpr, color='red', label=auc_label)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.axis('tight')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

In [27]:
def plotMultipleROC_Curve(y_test, modelsDataDictionaryArray):
    """Plot the ROC curves of several models on one figure.

    Parameters:
        y_test: true labels, shared by all models.
        modelsDataDictionaryArray: iterable of dictionaries shaped like
            ``{"y_prob": <scores predicted by the model>,
               "modelLabel": <text identifying the model in the legend>}``.

    Returns:
        None.
    """
    plt.figure(figsize=(10, 10))
    plt.title('Receiver Operating Characteristic Curve')

    for model_data in modelsDataDictionaryArray:
        fpr, tpr, _ = roc_curve(y_test, model_data["y_prob"])
        roc_auc = auc(fpr, tpr)
        # Only the AUC suffix is %-formatted, so a model label that itself
        # contains '%' (e.g. "KNN 95%") no longer breaks the formatting.
        label = model_data["modelLabel"] + ' - AUC = %0.2f ' % roc_auc
        plt.plot(fpr, tpr, label=label)

    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.axis('tight')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')