In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for carpeta, _, archivos in os.walk('/kaggle/input'):
    for nombre in archivos:
        print(os.path.join(carpeta, nombre))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Pre-Funciones

In [None]:
import graphviz, IPython
import matplotlib.lines as lines
from matplotlib.ticker import FuncFormatter
from sklearn.tree import export_graphviz

def draw_tree(tree, df):
    """Render a fitted decision tree as a Graphviz diagram.

    Parameters
    ----------
    tree : fitted scikit-learn decision tree estimator
        The estimator handed to ``export_graphviz``.
    df : pandas.DataFrame
        Frame whose column names are used to label the split features.

    Returns
    -------
    graphviz.Source
        The diagram object built from the exported DOT text.
    """
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # strictly and reject a pandas Index.
    feature_names = list(df.columns)
    s = export_graphviz(tree, out_file=None, feature_names=feature_names, filled=True)
    return graphviz.Source(s)

In [None]:
from sklearn import metrics as metrics
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix

def _dibujar_matriz_confusion(y_real, y_pred, etiquetas, titulo):
    """Compute one confusion matrix and show it as an annotated heatmap.

    Rows and columns of the matrix follow the order given by ``etiquetas``.
    """
    cm = metrics.confusion_matrix(y_real, y_pred, labels=etiquetas)
    df_cm = pd.DataFrame(cm, index=etiquetas, columns=etiquetas)
    plt.figure(figsize=(8, 5))
    # fmt='d' prints the integer counts as-is; the default annotation format
    # switches to scientific notation for counts of 100 or more.
    sns.heatmap(df_cm, annot=True, fmt='d', cmap="YlGnBu")
    plt.title(titulo)
    plt.xlabel('Predicción')
    plt.ylabel('Valores Reales')
    plt.show()


def metricas(y_train, y_pred_train, y_test, y_pred_test, average='micro'):
    """Plot confusion matrices and a score summary for train and test sets.

    Shows, in order: the train confusion matrix, the test confusion matrix,
    and a bar chart with accuracy, precision and recall (train vs. test) plus
    the test F1 score, all as percentages.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the train and test partitions.
    y_pred_train, y_pred_test : array-like
        Labels predicted by the model for each partition.
    average : str, default 'micro'
        Averaging mode forwarded to precision, recall and F1. Note that with
        single-label targets the micro average of these scores coincides with
        accuracy; pass e.g. 'macro' to see class-level differences.

    Returns
    -------
    None
    """
    # Derive the label set from the arguments instead of a notebook-level
    # variable, so the function works regardless of what else is defined.
    # value_counts() orders the labels from most to least frequent.
    observados = pd.Series(np.concatenate([np.ravel(y_train), np.ravel(y_test)]))
    valores = observados.value_counts().index.to_list()

    # Confusion matrices: train first, then test.
    _dibujar_matriz_confusion(y_train, y_pred_train, valores, 'Matriz de Confusión: Train')
    _dibujar_matriz_confusion(y_test, y_pred_test, valores, 'Matriz de Confusión: Test')

    accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
    accuracy_test = metrics.accuracy_score(y_test, y_pred_test)
    precision_train = metrics.precision_score(y_train, y_pred_train, average=average)
    precision_test = metrics.precision_score(y_test, y_pred_test, average=average)
    recall_train = metrics.recall_score(y_train, y_pred_train, average=average)
    recall_test = metrics.recall_score(y_test, y_pred_test, average=average)
    # F1 is reported for the test partition only.
    f_score = f1_score(y_test, y_pred_test, average=average)

    # Scores are plotted as percentages.
    train = (accuracy_train * 100, precision_train * 100, recall_train * 100)
    test = (accuracy_test * 100, precision_test * 100, recall_test * 100)

    ind = np.arange(3)    # x locations of the train/test bar pairs
    ind_n = np.arange(4)  # x locations of the four tick labels
    width = 0.3           # width of each bar

    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)

    rects1 = ax.bar(ind, train, width, color='r')
    rects2 = ax.bar(ind + width, test, width, color='g')
    # Centre the single F1 bar on its tick, which sits at 3 + width/2.
    rects3 = ax.bar(3 + width / 2, f_score * 100, width, color='b')

    ax.set_ylabel('Scores')
    ax.set_xticks(ind_n + width / 2)
    ax.set_xticklabels(('Accuracy', 'Precisión', 'Recall', 'F1 Score'))
    ax.legend((rects1[0], rects2[0]), ('Train', 'Test'))

    def autolabel(rects):
        """Write each bar's height just above it."""
        for rect in rects:
            h = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2., 1.00 * h, '%.3f' % round(h, 3),
                    ha='center', va='bottom')

    autolabel(rects1)
    autolabel(rects2)
    autolabel(rects3)
    plt.title('Puntajes')
    # Leave headroom above 100 % for the value labels.
    plt.ylim(0, 120)
    plt.show()

    return

# Cargando Datos

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px

In [None]:
# Read the mushroom dataset from the Kaggle input directory and preview it.
ruta_datos = '/kaggle/input/mushroom-classification/mushrooms.csv'
df = pd.read_csv(ruta_datos)
df.head()

# 1. Análisis Exploratorio de Datos

In [None]:
# Report how many columns the dataset has, then list their names.
columnas = df.columns.to_list()
print("Tenemos ", len(columnas), " características")
print()
print(columnas)

Todas nuestras variables son categóricas; esto es favorable para algoritmos como Random Forest y CatBoost.

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# One count plot per feature, with bars split by the 'class' column.
for columna in df.columns.drop('class'):
    sns.countplot(data=df, x=columna, hue='class')
    plt.show()

# 2. Limpieza de Datos

## 2.1. Datos Nulos

El DataFrame no contiene datos nulos.

In [None]:
# Number of missing values in each column.
df.isna().sum()

## 2.2. Datos irrelevantes

### veil-type

Vemos que la variable solo contiene un valor; como todas las filas están categorizadas de esta forma, no aporta ninguna información.

In [None]:
# List the distinct values present in the 'veil-type' column.
df['veil-type'].unique()

In [None]:
# Work on an independent copy of the data without the 'veil-type' column.
df_clean = df.drop(columns=['veil-type']).copy()

# 3. Ingeniería de Datos

## 3.1. Selección de Características

### Filter Method -> Chi-Squared

In [None]:
from scipy.stats import chi2_contingency

# Significance level used to decide whether to reject H0 (independence).
alpha = 0.05

# Chi-squared test of independence between each feature and the target.
# The target itself is skipped: testing 'class' against 'class' says nothing.
for i in df_clean.columns.drop('class'):
    # Contingency table: one row per class value, one column per feature value.
    crossTab = pd.crosstab(index=df_clean['class'], columns=df_clean[i])

    print(i)

    # The contingency table is passed to the test directly as an array.
    stat, p, dof, expected = chi2_contingency(crossTab.to_numpy())
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Las variables están asociadas(Se rechaza H0)')
    else:
        print('Las variables no están asociadas(No se rechaza H0)')

## 3.2. Conversión de Características a numéricas

In [None]:
from sklearn.preprocessing import LabelEncoder
# A single encoder is refit on each column in turn, so once this cell has run
# `le` holds the mapping of the last column only.
le = LabelEncoder()

# Replace every column (features and target) with its integer codes.
df_final_label = pd.DataFrame(
    {columna: le.fit_transform(df_clean[columna]) for columna in df_clean.columns}
)

df_final_label.head()

# 4. Entrenamiento y Validación

In [None]:
# Target vector: the encoded 'class' column; feature matrix: everything else.
y = df_final_label['class']
X = df_final_label.drop(columns=['class'])

## 4.1. Partición Muestral

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 70/30 split. The fixed random_state makes the partition, and
# therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

## 4.2. Algoritmos de Machine Learning

### Árbol de Decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default hyperparameters. The fixed random_state
# makes the fitted tree reproducible across runs.
arbolDecision = DecisionTreeClassifier(random_state=42)
arbolDecision.fit(X_train, y_train)

# Predict on both partitions and plot the confusion matrices and scores.
y_pred_train = arbolDecision.predict(X_train)
y_pred_test = arbolDecision.predict(X_test)
metricas(y_train, y_pred_train, y_test, y_pred_test)

In [None]:
# Render the fitted decision tree, labelling splits with the column names of X_train.
draw_tree(arbolDecision, X_train)

In [None]:
from catboost import CatBoostClassifier
# Fit CatBoost with default hyperparameters. verbose=0 silences the
# per-iteration training log, which would otherwise flood the cell output.
catBoost = CatBoostClassifier(verbose=0)
catBoost.fit(X_train, y_train)

# Predict on both partitions and plot the confusion matrices and scores.
y_pred_train = catBoost.predict(X_train)
y_pred_test = catBoost.predict(X_test)
metricas(y_train, y_pred_train, y_test, y_pred_test)

# Conclusiones

Tenemos dos algoritmos potentes, pero ¿cuál debemos elegir? Si bien ambos nos brindan puntajes altos, con CatBoost, al ser un algoritmo complejo, perdemos interpretabilidad. Con el árbol de decisión, en cambio, la conservamos: o sea, podemos saber cómo se llegó al resultado de clasificar un hongo como venenoso o comestible.