In [1]:
#### Paquetes

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os

from sklearn import tree
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

# Random Forests

A continuación vamos a ver uno de los algoritmos de clasificación/regresión más populares, que sin ser de los más avanzados, como redes neuronales/xgboost, permite buenos resultados, basados en la aleatorización de árboles de decisión binarios, tanto por filas (registros) como por columnas (variables). El conjunto de árboles, llamado bosque, permite un poder predictivo muy alto a la vez que evita el sobreajuste.

In [3]:
dfcomplete = pd.read_csv(os.getcwd() + "\\data\\2008_small.csv", nrows = 200000)

dfcomplete = dfcomplete.dropna(subset = ['AirTime','Distance','TaxiIn','TaxiOut',"DepDelay","ArrDelay"])
df = dfcomplete.sample(frac=1).head(30000)

X = df[['AirTime','Distance','TaxiIn','TaxiOut',"DepDelay"]]
Y = df["ArrDelay"] > 0

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=1)

np.mean(y_test) # 43.33 % vuelos tarde // 56.67 % vuelos pronto

0.42916666666666664

#### Caso sencillo. Árbol de clasificación
Un solo árbol, determinista para hacer la predicción

In [12]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
np.mean(predictions == y_test)

0.7986666666666666

#### Random forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [13]:
clf = RandomForestClassifier(n_estimators=1000, max_depth = None, n_jobs= -1)
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
np.mean(predictions == y_test)

0.85

Clasificar la importancia de las variables

In [14]:
X.columns

Index(['AirTime', 'Distance', 'TaxiIn', 'TaxiOut', 'DepDelay'], dtype='object')

In [15]:
clf.feature_importances_

array([0.14062443, 0.156222  , 0.08850011, 0.17407964, 0.44057383])

In [16]:
clf.feature_importances_.sum()

0.9999999999999998

Ranking de variables

In [17]:
X.columns[np.argsort(-clf.feature_importances_)]

Index(['DepDelay', 'TaxiOut', 'Distance', 'AirTime', 'TaxiIn'], dtype='object')

In [18]:
pd.DataFrame({"Col":X.columns,"Imp":clf.feature_importances_}).sort_values("Imp", ascending = False)

Unnamed: 0,Col,Imp
4,DepDelay,0.440574
3,TaxiOut,0.17408
1,Distance,0.156222
0,AirTime,0.140624
2,TaxiIn,0.0885


In [10]:
clf.predict_proba(X_test).round(3)

array([[0.802, 0.198],
       [0.04 , 0.96 ],
       [0.678, 0.322],
       ...,
       [0.816, 0.184],
       [0.899, 0.101],
       [0.995, 0.005]])

### Ajuste automático del random forest

Una optimización de parámetros más eficiente que en los ejemplos anteriores

In [22]:
# EJECUTAR MEJOR SIN n_estimators = 200 y max_depth = 50
parameters = {'bootstrap':(True,False), 
              'n_estimators':[50,200, 500],
              'max_depth':[5,10,20,None], 
              'max_features':[1,3] #Más útil con datasets grandes
             }

clf = RandomForestClassifier(n_jobs= -1)
clfcv = GridSearchCV(clf, parameters,verbose=3,scoring = "accuracy")
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
clfcv.fit(X, Y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=50;, score=0.830 total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=50;, score=0.837 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=50;, score=0.829 total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=50;, score=0.828 total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=50;, score=0.838 total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=200;, score=0.830 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=200;, score=0.842 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=200;, score=0.830 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=5, max_features=1, n_estimators=200;, sc

[CV 2/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=50;, score=0.849 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=50;, score=0.839 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=50;, score=0.837 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=50;, score=0.843 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=200;, score=0.847 total time=   0.4s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=200;, score=0.852 total time=   0.4s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=200;, score=0.840 total time=   0.4s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=200;, score=0.840 total time=   0.4s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=3, n_estimators=200;, score=0.840 total time=   0.5s
[CV 1/5] END bootstrap=

[CV 3/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=50;, score=0.837 total time=   0.0s
[CV 4/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=50;, score=0.834 total time=   0.0s
[CV 5/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=50;, score=0.842 total time=   0.0s
[CV 1/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=200;, score=0.839 total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=200;, score=0.848 total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=200;, score=0.838 total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=200;, score=0.835 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=200;, score=0.844 total time=   0.2s
[CV 1/5] END bootstrap=False, max_depth=10, max_features=1, n_estimators=500;, score=0.839 total time=   0.6s
[CV 2/5] END 

[CV 3/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=50;, score=0.829 total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=50;, score=0.829 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=50;, score=0.830 total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=200;, score=0.832 total time=   0.8s
[CV 2/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=200;, score=0.839 total time=   0.7s
[CV 3/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=200;, score=0.826 total time=   0.6s
[CV 4/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=200;, score=0.830 total time=   0.7s
[CV 5/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=200;, score=0.830 total time=   0.6s
[CV 1/5] END bootstrap=False, max_depth=None, max_features=3, n_estimators=500;, score=0.831 total time=   

In [20]:
clfcv.cv_results_

{'mean_fit_time': array([0.31371508, 0.07367129, 0.19698901, 0.0436398 , 0.06926322,
        0.21119227, 0.04143782, 0.07867193, 0.26013861, 0.03303022,
        0.06626015, 0.19267578, 0.04163756, 0.08059993, 0.26778669,
        0.05224729, 0.11150146, 0.33390431, 0.03763423, 0.07606912,
        0.23106875, 0.05605097, 0.11229658, 0.37207232, 0.0763103 ,
        0.16214743, 0.51476932, 0.04844379, 0.09828944, 0.35830131,
        0.07215953, 0.14352102, 0.49935718, 0.09808917, 0.1989656 ,
        0.69221401, 0.04664259, 0.09528666, 0.39455957, 0.07687016,
        0.15845008, 0.54415174, 0.09709358, 0.20378566, 0.73423457,
        0.03182907, 0.06549873, 0.19311929, 0.03643332, 0.07786903,
        0.2381011 , 0.04744182, 0.09308462, 0.30467706, 0.03122835,
        0.06365786, 0.19184456, 0.04644256, 0.08998356, 0.3063961 ,
        0.06325784, 0.12451329, 0.43484821, 0.04444051, 0.0786716 ,
        0.26286449, 0.07186041, 0.14717684, 0.48731971, 0.10489531,
        0.21099181, 0.70575438,

In [23]:
clfcv.best_params_

{'bootstrap': True, 'max_depth': None, 'max_features': 1, 'n_estimators': 200}

Ajustar el mejor modelo, con todos los datos:

In [24]:
X = dfcomplete[['AirTime','Distance','TaxiIn','TaxiOut',"DepDelay"]]
Y = dfcomplete["ArrDelay"] > 0

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2)


clf = RandomForestClassifier(n_estimators = clfcv.best_params_["n_estimators"], 
                             bootstrap = clfcv.best_params_["bootstrap"], 
                             max_depth = clfcv.best_params_["max_depth"], 
                             max_features= clfcv.best_params_["max_features"],
                             n_jobs = -1)

clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
np.mean(predictions == y_test)

0.8526081733841082

Ahora estoy convencido que el modelo funciona.

#### Exportar un modelo al disco

In [25]:
# Entrenar con TODOS los datos

clf = RandomForestClassifier(n_estimators = clfcv.best_params_["n_estimators"], 
                             bootstrap = clfcv.best_params_["bootstrap"], 
                             max_depth = clfcv.best_params_["max_depth"],
                             max_features= clfcv.best_params_["max_features"],
                             n_jobs = -1)

clf.fit(X,Y)

# Guardar el modelo en disco
filename = 'mirandomforest.sav'
pickle.dump(clf, open(filename, 'wb'))
 
print("Modelo Guardado")

del(clf)
    
print("Modelo Borrado del Jupyter Notebook")

Modelo Guardado
Modelo Borrado del Jupyter Notebook


#### Cargar el modelo
Y lo usamos con nuevos datos que entramos directamente

In [28]:
modelo = pickle.load(open('mirandomforest.sav', 'rb'))

Crearemos un objeto vacío aprovechando el objeto X

#### Opción 1. Input

In [29]:
newflight = pd.DataFrame(columns=X.columns, index=range(1))
newflight

Unnamed: 0,AirTime,Distance,TaxiIn,TaxiOut,DepDelay
0,,,,,


In [30]:
newflight = pd.DataFrame(columns=['AirTime','Distance','TaxiIn','TaxiOut',"DepDelay"], index=range(1))

newflight["AirTime"] = int(input("Airtime"))
newflight["Distance"] = int(input("Distance"))
newflight["TaxiIn"] = int(input("TaxiIn"))
newflight["TaxiOut"] = int(input("TaxiOut"))
newflight["DepDelay"] = int(input("DepDelay"))

print("Prediccion:",modelo.predict_proba(newflight)[0])

Airtime60
Distance200
TaxiIn4
TaxiOut0
DepDelay30
Prediccion: [0.045 0.955]


#### Opción 2. GUI

In [19]:
# ! pip install tk

In [31]:
from tkinter import *
from tkinter.ttk import *
from tkinter import messagebox

In [32]:
def validate():
    global t, d, ti, to, r
    
    newflight = pd.DataFrame(columns=['AirTime','Distance','TaxiIn','TaxiOut',"DepDelay"], index=range(1))

    t, d, ti, to, r = tiempo_value.get(), distancia_value.get(), taxiin_value.get(), taxiout_value.get(), retraso_value.get()
    newflight["AirTime"] = int(t)
    newflight["Distance"] = int(d)
    newflight["TaxiIn"] = int(ti)
    newflight["TaxiOut"] = int(to)
    newflight["DepDelay"] = int(r)
    
    print("prediccion",modelo.predict(newflight)[0])
    
    if modelo.predict(newflight)[0]:
        messagebox.showinfo("PREDICCION","Vuelo con retraso esperado")
    else:
        messagebox.showinfo("PREDICCION","Vuelo sin retraso esperado")

    return ""

In [33]:
window = Tk()

window.title("RandomForest")
window.geometry('350x700')

tiempo = Label(window, text="Tiempo de vuelo",font=("Arial Bold", 30))
tiempo.grid(column=0, row=6)

tiempo_value = Entry(window,width=30)
tiempo_value.grid(column=0, row=7,pady=5)

distancia = Label(window, text="Distancia",font=("Arial Bold", 30))
distancia.grid(column=0, row=8)

distancia_value = Entry(window,width=30)
distancia_value.grid(column=0, row=9,pady=5)


taxiin = Label(window, text="Taxi In",font=("Arial Bold", 30))
taxiin.grid(column=0, row=10)

taxiin_value = Entry(window,width=30)
taxiin_value.grid(column=0, row=11,pady=5)


taxiout = Label(window, text="Taxi Out",font=("Arial Bold", 30))
taxiout.grid(column=0, row=12)

taxiout_value = Entry(window,width=30)
taxiout_value.grid(column=0, row=13,pady=5)

retraso = Label(window, text="Retraso de Salida",font=("Arial Bold", 30))
retraso.grid(column=0, row=14)

retraso_value = Entry(window,width=30)
retraso_value.grid(column=0, row=15,pady=5)

btn = Button(window, text="Predecir",command = validate)
btn.grid(column=0, row=18)

window.mainloop()

prediccion True


# Ejercicios

Explora el dataframe breastCancer.csv, el de cars o el de iris (lo importante aquí es la técnica, no los datos concretos)

https://www.kaggle.com/datasets/nancyalaswad90/breast-cancer-dataset?resource=download&select=data.csv

In [2]:
df = pd.read_csv("data\\breastCancer.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [24]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

Añade algunas variables adicionales que puedan servirnos posteriormente. ¿Con qué criterio lo podemos hacer, si desconocemos el campo?

**Ajusta un modelo predictivo para anticipar el diagnóstico.** 

Utiliza una estrategia de validación que te permita estar seguro que estamos haciendo unas buenas predicciones. Prueba primero algunos ejemplos simples de modelos, que te permitan seleccionar cuales pueden funcionar mejor.

Mejora tanto como puedas el modelo, usando varias de las técnicas que hemos trabajado (Transformaciones, ingeniería de variables, gridsearch...)

¿Qué variables son las más útiles para hacer un buen diagnóstico?

Crea una aplicación con GUI que, usando las variables más relevantes, te de una predicción. ¿Qué harías con las menos relevantes?

##### © Netmind S.L.

Todos los derechos reservados. Este documento (v1.00) ha sido diseñado para el uso exclusivo del cliente que atiende a esta formación.

Ninguna parte de este documento puede ser reproducida, distribuida o transmitida en cualquier forma o por cualquier medio sin el permiso previo por escrito de Netmind.