In [239]:
# Standard Imports
import pandas as pd
import seaborn as sns
from seaborn import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import pickle
from IPython.display import display, Markdown
import scipy.stats as st

# Transformers
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler,Normalizer,MaxAbsScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer

# Modeling Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, confusion_matrix, classification_report,roc_auc_score

# Pipelines
from sklearn import set_config
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# Machine Learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
# Notebook-wide pandas settings: 4 digits of display precision, and no
# warning/exception on chained assignment.
pd.set_option("display.precision", 4)
pd.set_option("mode.chained_assignment", None)

In [12]:
def calculate_roc_auc(model_pipe, X, y):
    """Return the ROC AUC of a fitted model on the given data.

    The score is computed from the second column of ``predict_proba``.

    Parameters:
    ===========
    model_pipe: sklearn model or pipeline exposing ``predict_proba``
    X: features
    y: true target
    """
    class_probabilities = model_pipe.predict_proba(X)
    positive_scores = class_probabilities[:, 1]
    return roc_auc_score(y, positive_scores)

# Evaluation helper: prints recall, accuracy, precision and F1, plots the
# confusion matrix and returns the four scores.
def evaluation(y, y_hat, title = 'Confusion Matrix'):
    """Report classification metrics and plot the confusion matrix.

    Parameters:
    ===========
    y: true target
    y_hat: predicted target
    title: title drawn above the confusion-matrix heatmap

    Returns:
    ========
    dict with the keys 'recall', 'accuracy', 'precision' and 'f1'.
    """
    metrics = {
        'recall': recall_score(y, y_hat),
        'accuracy': accuracy_score(y, y_hat),
        'precision': precision_score(y, y_hat),
        'f1': f1_score(y, y_hat),
    }
    print('Recall: ', metrics['recall'])
    print('Accuracy: ', metrics['accuracy'])
    print('Precision: ', metrics['precision'])
    print('F1: ', metrics['f1'])

    cm = confusion_matrix(y, y_hat)
    sns.heatmap(cm, cmap='PuBu', annot=True, fmt='g', annot_kws={'size': 20})
    plt.xlabel('predicted', fontsize=18)
    plt.ylabel('actual', fontsize=18)
    plt.title(title, fontsize=18)
    plt.show()

    # Returning the scores lets callers store or compare them instead of
    # only reading the printed text.
    return metrics

In [15]:
# Ask scikit-learn to display estimators/pipelines as diagrams in notebook output.
set_config(display="diagram")

In [218]:
# Load the seaborn Titanic dataset, drop the columns listed below, store
# `deck` as plain objects and keep only rows without missing values.
columns = ['alive', 'class', 'embarked', 'who', 'alone', 'adult_male']
df = (
    load_dataset('titanic')
    .drop(columns=columns)
    .astype({'deck': 'object'})
    .dropna()
)
print(df.shape)
df.head()

(182, 9)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,deck,embark_town
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg
3,1,1,female,35.0,1,0,53.1,C,Southampton
6,0,1,male,54.0,0,0,51.8625,E,Southampton
10,1,3,female,4.0,1,1,16.7,G,Southampton
11,1,1,female,58.0,0,0,26.55,C,Southampton


In [219]:
SEED = 42
TARGET = 'survived'

# Every column except the target is a feature. Numeric features are detected
# by dtype; whatever is left over is treated as categorical.
FEATURES = df.columns.drop(TARGET)
NUMERICAL = df[FEATURES].select_dtypes(include='number').columns
CATEGORICAL = pd.Index(np.setdiff1d(FEATURES, NUMERICAL))

print(f"Numerical features: {', '.join(NUMERICAL)}")
print(f"Categorical features: {', '.join(CATEGORICAL)}")

Numerical features: pclass, age, sibsp, parch, fare
Categorical features: deck, embark_town, sex


In [220]:
# Separate the feature matrix from the target, using the TARGET constant
# rather than a repeated string literal. The train/test split is performed
# once, in the following cell.
X = df.drop(columns=[TARGET])
y = LabelEncoder().fit_transform(df[TARGET])

In [221]:
# Hold out one third of the rows as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

for part in (X_train, X_test):
    print(part.shape)

(121, 8)
(61, 8)


Particionaremos los datos primero

In [222]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,deck,embark_town
645,1,male,48.0,1,0,76.7292,D,Cherbourg
712,1,male,48.0,1,0,52.0,C,Southampton
462,1,male,47.0,0,0,38.5,E,Southampton
679,1,male,36.0,0,1,512.3292,B,Cherbourg
170,1,male,61.0,0,0,33.5,B,Southampton


In [223]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,deck,embark_town
484,1,male,25.0,1,0,91.0792,B,Cherbourg
110,1,male,47.0,0,0,52.0,C,Southampton
195,1,female,58.0,0,0,146.5208,B,Cherbourg
496,1,female,54.0,1,0,78.2667,D,Cherbourg
889,1,male,26.0,0,0,30.0,C,Cherbourg


A continuación preprocesaremos los datos con los transformadores de Scikit-learn dentro de un pipeline, de modo que se ajusten solo con los datos de entrenamiento y se evite la fuga de datos:

In [224]:
# Numeric branch. Imputation runs first so that whichever scaler the search
# plugs into the 'scaler' step always receives complete data (not every
# scaler candidate accepts NaN). Step names are unchanged, so the
# `preprocessors__num__*` parameter paths keep working.
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures()),
])

# Categorical branch: fill gaps with a constant label, then one-hot encode.
# The encoder keeps its default output format; dense output is enforced on
# the ColumnTransformer below, which avoids the `sparse` / `sparse_output`
# keyword that differs between scikit-learn versions.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group to its branch and concatenate the results.
# sparse_threshold=0 makes the combined output always a dense array.
preprocessors = ColumnTransformer(
    transformers=[
        ('num', numerical_pipe, NUMERICAL),
        ('cat', categorical_pipe, CATEGORICAL),
    ],
    sparse_threshold=0,
)

# The classifier here is only a placeholder: the hyper-parameter search
# replaces the 'classifier' step with each candidate model.
pipe = Pipeline([
    ('preprocessors', preprocessors),
    ('classifier', DecisionTreeClassifier(random_state=SEED)),
])

In [225]:
# pipe.get_params()  # handy for looking up the exact parameter names used in the search grids

In [226]:
# Preprocessing hyper-parameters, addressed by their step path inside `pipe`
# (<pipeline step>__<transformer name>__<inner step>[__<parameter>]).
other_params = dict(
    preprocessors__num__scaler=[StandardScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler()],
    preprocessors__num__polynomial__degree=[1, 2, 3],
    preprocessors__num__imputer__strategy=['mean', 'most_frequent', 'median'],
    preprocessors__cat__imputer__strategy=['most_frequent', 'constant'],
    preprocessors__num__polynomial__include_bias=[True, False],
)

In [227]:
# This parameter dictionary is meant to be shared by XGBoost and LightGBM.
# The randomly drawn candidates come from a generator seeded with SEED, so the
# search space is the same on every run; drawing without replacement avoids
# listing the same candidate twice.
_params_rng = np.random.default_rng(SEED)
params = {
    # number of trees
    "classifier__n_estimators": _params_rng.choice(np.arange(20, 150), size=2, replace=False).tolist(),
    # maximum tree depth
    "classifier__max_depth": _params_rng.choice(np.arange(3, 12), size=2, replace=False).tolist(),
    # learning rate (xgb's "eta")
    "classifier__learning_rate": [0.1, 0.5, 0.9],
    # fraction of columns sampled when building each tree
    "classifier__colsample_bytree": [1, 0.8],
    # fraction of rows sampled
    "classifier__subsample": [1, 0.8],
    # minimum sum of instance weight (hessian) required in a child
    "classifier__min_child_weight": [0.5, 5, 10, 20],
}

In [243]:
# One sub-space per candidate model. Each dict replaces the pipeline's
# 'classifier' step and lists the hyper-parameters explored with it.
# Stochastic models and the random gamma draw are seeded with SEED so that
# repeated runs explore the same space and give comparable scores.
_gamma_rng = np.random.default_rng(SEED)

search_space = [
    {
        "classifier": [XGBClassifier(n_jobs=3, eval_metric="logloss",
                                     use_label_encoder=False, random_state=SEED)],
        **params,
        **other_params,
        # minimum loss reduction required to make a split
        "classifier__gamma": _gamma_rng.uniform(0, 10, 2),
    },
    {
        "classifier": [DecisionTreeClassifier(random_state=SEED)],
        **other_params,
    },
    {
        "classifier": [RandomForestClassifier(n_jobs=3, random_state=SEED)],
        **other_params,
    },
    {
        "classifier": [AdaBoostClassifier(random_state=SEED)],
        **other_params,
    },
    {
        "classifier": [GradientBoostingClassifier(random_state=SEED)],
        **other_params,
    },
    {
        # Logistic regression only tunes its regularisation strength; the
        # preprocessing steps keep the defaults defined in `pipe`.
        "classifier": [LogisticRegression(max_iter=1000)],
        "classifier__C": [10**x for x in range(-5, 3)],
    },
]

In [244]:
# Randomized search: 100 sampled candidates, 10-fold CV, ranked by ROC AUC.
# random_state fixes which candidates are sampled so the run is repeatable;
# error_score='raise' surfaces failing candidates instead of scoring them NaN.
grid_ramdon = RandomizedSearchCV(
    pipe,
    search_space,
    n_iter=100,
    cv=10,
    error_score='raise',
    scoring='roc_auc',
    random_state=SEED,
)

In [245]:
# Run the randomized search on the training split; the value returned by fit() is kept in grid_ramdon_fit.
grid_ramdon_fit = grid_ramdon.fit(X_train, y_train)

In [246]:
# Collect the randomized search's cv_results_ into a DataFrame for inspection.
results = pd.DataFrame(grid_ramdon.cv_results_)

In [248]:
# Show up to the first 100 result rows, in stored order (not sorted by rank_test_score).
results.head(100)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessors__num__scaler,param_preprocessors__num__polynomial__include_bias,param_preprocessors__num__polynomial__degree,param_preprocessors__num__imputer__strategy,param_preprocessors__cat__imputer__strategy,param_classifier__subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0189,0.0045,0.0041,1.1566e-03,MaxAbsScaler(),True,3,median,most_frequent,0.8,...,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.0000,66
1,0.0202,0.0017,0.0041,1.3776e-03,Normalizer(),False,2,median,constant,0.8,...,0.8438,0.6875,0.7188,0.7188,0.7812,0.6250,0.8125,0.7756,0.0942,18
2,0.0229,0.0018,0.0035,1.0291e-04,StandardScaler(),True,1,most_frequent,most_frequent,0.8,...,0.9375,0.9219,0.5938,0.5000,0.5938,0.8594,0.8750,0.7781,0.1631,14
3,0.0225,0.0011,0.0036,2.3734e-04,MinMaxScaler(),False,3,most_frequent,constant,0.8,...,0.9375,0.9219,0.5469,0.5312,0.6562,0.8125,0.7969,0.7612,0.1425,32
4,0.0192,0.0024,0.0036,2.7908e-04,MinMaxScaler(),True,2,most_frequent,most_frequent,1,...,0.9062,0.9062,0.6094,0.5000,0.5938,0.8594,0.9219,0.7797,0.1608,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0142,0.0017,0.0043,1.8842e-03,MaxAbsScaler(),False,2,mean,constant,0.8,...,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.0000,66
96,0.0159,0.0017,0.0037,7.3809e-04,MinMaxScaler(),True,2,mean,most_frequent,0.8,...,0.9688,0.8906,0.5156,0.5156,0.5000,0.8594,0.9219,0.7609,0.1798,33
97,0.0182,0.0017,0.0034,5.3963e-05,MinMaxScaler(),True,2,mean,constant,1,...,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.5000,0.0000,66
98,0.0644,0.0053,0.0038,5.1889e-04,Normalizer(),True,3,mean,most_frequent,1,...,0.9219,0.8906,0.6250,0.4219,0.5000,0.7188,0.8750,0.7453,0.1772,49


In [249]:
# Exhaustive grid search over every combination in `search_space`, using 3-fold CV and accuracy.
# NOTE(review): the recorded output reports 55880 candidates (167640 fits), so this cell is very
# expensive to run — confirm the full grid is intended, given the randomized search above.
grid_search = GridSearchCV(pipe, search_space, verbose=1, cv=3, scoring='accuracy')
grid_fit=grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 55880 candidates, totalling 167640 fits


La canalización:

+ Divide los datos de entrada en grupos numéricos y categóricos

+ Preprocesa ambos grupos en paralelo

+ Concatena los datos preprocesados de ambos grupos

+ Pasa los datos preprocesados al modelo

Necesitamos preprocesar el conjunto de datos de prueba de la misma manera antes de evaluar:

In [None]:
# Score the refit best pipeline of the exhaustive search (`grid_search`) on
# both splits; score() uses the metric the search was configured with.
print(f'Training set score: {grid_search.score(X_train, y_train)}')
print(f'Test set score: {grid_search.score(X_test, y_test)}')

In [None]:
# Access the best set of parameters
best_params = grid.best_params_
print(best_params)
# Stores the optimum model in best_pipe
best_pipe = grid.best_estimator_
print(best_pipe)

In [None]:
# Tabulate the cross-validation results of the exhaustive search and list the
# available columns (useful for picking what to plot).
result_df = pd.DataFrame(grid_search.cv_results_)
print(result_df.columns)

In [None]:
# Mean CV score against polynomial degree, one line per numeric scaler and one
# panel per numeric imputation strategy. These are parameters that the search
# space actually contains.
# NOTE(review): the choice of which parameters to plot is a suggestion — adjust as needed.
degree_col = 'param_preprocessors__num__polynomial__degree'
scaler_col = 'param_preprocessors__num__scaler'
num_imputer_col = 'param_preprocessors__num__imputer__strategy'

# Sub-spaces that do not tune the preprocessing have no value in these columns.
plot_df = result_df.dropna(subset=[degree_col, scaler_col, num_imputer_col]).copy()
plot_df[degree_col] = plot_df[degree_col].astype(int)
# Scaler candidates are estimator objects; use their text form as the legend label.
plot_df[scaler_col] = plot_df[scaler_col].astype(str)

sns.relplot(data=plot_df,
            kind='line',
            x=degree_col,
            y='mean_test_score',
            hue=scaler_col,
            col=num_imputer_col)
plt.show()

In [None]:
# Mean CV score against polynomial degree, one line per numeric scaler and one
# panel per categorical imputation strategy. These are parameters that the
# search space actually contains.
# NOTE(review): the choice of which parameters to plot is a suggestion — adjust as needed.
degree_col = 'param_preprocessors__num__polynomial__degree'
scaler_col = 'param_preprocessors__num__scaler'
cat_imputer_col = 'param_preprocessors__cat__imputer__strategy'

# Sub-spaces that do not tune the preprocessing have no value in these columns.
plot_df = result_df.dropna(subset=[degree_col, scaler_col, cat_imputer_col]).copy()
plot_df[degree_col] = plot_df[degree_col].astype(int)
# Scaler candidates are estimator objects; use their text form as the legend label.
plot_df[scaler_col] = plot_df[scaler_col].astype(str)

sns.relplot(data=plot_df,
            kind='line',
            x=degree_col,
            y='mean_test_score',
            hue=scaler_col,
            col=cat_imputer_col)
plt.show()

In [None]:
# Feature importances live on the classifier inside the refit best pipeline,
# not on the search object, and they refer to the columns produced by the
# preprocessing step rather than to the raw input columns.
fitted_pipe = grid_fit.best_estimator_
fitted_clf = fitted_pipe.named_steps['classifier']
feature_names = fitted_pipe.named_steps['preprocessors'].get_feature_names_out()

if hasattr(fitted_clf, 'feature_importances_'):
    feat_importances = pd.Series(fitted_clf.feature_importances_, index=feature_names)
    plt.figure(figsize=(10, 7))
    feat_importances.nlargest(7).plot(kind='barh');
else:
    # Not every candidate model (e.g. logistic regression) exposes importances.
    print(f'{type(fitted_clf).__name__} does not expose feature_importances_')

In [None]:
# Tune a random forest on top of the shared preprocessing (`preprocessors`)
# and evaluate the refit best model on both splits.
# NOTE(review): no `rand_forest_parms` grid is defined earlier in the notebook;
# the small grid below is a placeholder — replace it with the intended values.
rand_forest_parms = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 5, 10],
}
final_pipeline = GridSearchCV(
    Pipeline(steps=[
        ('preprocess', preprocessors),
        ('classifier', RandomForestClassifier(random_state=SEED)),
    ]),
    rand_forest_parms,
    cv=KFold(),
)

# Fit and predict on train data
final_pipeline.fit(X_train, y_train)
train_pred = final_pipeline.best_estimator_.predict(X_train)
print('Evaluation on training data \n')
# evaluation() prints the metrics and draws the plot itself, so it is called
# directly rather than wrapped in print().
evaluation(y_train, train_pred)
print('\n')

# Predict on test data
test_pred = final_pipeline.best_estimator_.predict(X_test)
print('Evaluation on testing data \n')
evaluation(y_test, test_pred);