# **Ejercicios de selección de candidatos**

## 2. Clasificación 
Para el dataset del titanic (data/titanic)
- Selecciona los 3 mejores modelos candidatos para este problema de clasificación basándote en su rendimiento (usando las métricas de precisión y F1-score).

In [100]:
# Formato de los prints
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

def headr(text):
    return ('\n'+color.UNDERLINE + text + color.END+'\n')

In [101]:
# Carga de datos
import pandas as pd

titanic = pd.read_csv("../data/titanic/train.csv")
target_column = "Survived"

titanic.shape

(891, 12)


## 1. Explorar

In [None]:
titanic.describe()

In [None]:
titanic.describe(include=object)

In [None]:
titanic.info()

In [None]:
from ydata_profiling import ProfileReport
profile = ProfileReport(titanic, title="Titanic Profiling Report")

In [None]:
profile.to_notebook_iframe()

- Datos desbalanceados
- Datos no normales
- Datos no lineales

## 2. Limpiar


### Eliminar columnas irrelevantes

In [102]:
# Eliminar columnas irrelevantes

discarded_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
titanic_cleaned = titanic.drop(discarded_columns, axis=1)
titanic_cleaned.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [None]:
titanic_cleaned.describe()

In [None]:
titanic_cleaned.describe(include=object)

### Valores faltantes

In [104]:
# Valores faltantes

print(headr("Valores faltantes - original: "), titanic_cleaned.isna().sum())


[4mValores faltantes - original: [0m
 Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


In [105]:
# Imputar valores faltantes
titanic_cleaned["Age"].fillna(titanic_cleaned["Age"].median(), inplace=True)
titanic_cleaned["Embarked"].fillna(titanic_cleaned["Embarked"].mode()[0], inplace=True)

print(headr("Valores faltantes - imputados: "), titanic_cleaned.isna().sum())


[4mValores faltantes - imputados: [0m
 Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_cleaned["Age"].fillna(titanic_cleaned["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_cleaned["Embarked"].fillna(titanic_cleaned["Embarked"].mode()[0], inplace=True)


### Outliers

In [None]:
# Extraer columnas numéricas y categóricas
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(titanic_cleaned)
categorical_columns = categorical_columns_selector(titanic_cleaned)

print(headr("Numerical columns"), numerical_columns)
print(headr("Categorical columns"), categorical_columns)


[4mNumerical columns[0m
 ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

[4mCategorical columns[0m
 ['Sex', 'Embarked']


In [None]:
# Outliers

# identificación
IQR = titanic_cleaned[numerical_columns].quantile(0.75) - titanic_cleaned[numerical_columns].quantile(0.25)
lower_bound = titanic_cleaned[numerical_columns].quantile(0.25) - (IQR * 3)
upper_bound = titanic_cleaned[numerical_columns].quantile(0.75) + (IQR * 3)

print(headr('lower_bound'),lower_bound)
print(headr('upper_bound'),upper_bound)

outliers = titanic_cleaned[numerical_columns][(titanic_cleaned[numerical_columns] < lower_bound) | (titanic_cleaned[numerical_columns] > upper_bound)]

print(headr("Outliers:"))
outliers.count()


[4mlower_bound[0m
 Survived    -3.0000
Pclass      -1.0000
Age        -17.0000
SibSp       -3.0000
Parch        0.0000
Fare       -61.3584
dtype: float64

[4mupper_bound[0m
 Survived      4.0000
Pclass        6.0000
Age          74.0000
SibSp         4.0000
Parch         0.0000
Fare        100.2688
dtype: float64

[4mOutliers:[0m



Survived      0
Pclass        0
Age           1
SibSp        12
Parch       213
Fare         53
dtype: int64

In [107]:
# reemplazo

# columnas con outliers
columns_with_outliers = outliers.columns[outliers.count() > 0]
columns_with_outliers

Index(['Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [108]:
# Función para winsorizar una columna
def winsorize_column(column, lower_bound, upper_bound):
    return column.clip(lower=lower_bound, upper=upper_bound)

In [109]:
# procesamos todas las comumnas con outliers
for col_name in columns_with_outliers:
    titanic_cleaned[col_name] = winsorize_column(titanic_cleaned[col_name], lower_bound[col_name], upper_bound[col_name])

outliers = titanic_cleaned[numerical_columns][(titanic_cleaned[numerical_columns] < lower_bound) | (titanic_cleaned[numerical_columns] > upper_bound)]

print(headr("Outliers - winsorized:"))
outliers.count()


[4mOutliers - winsorized:[0m



Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [None]:
titanic_cleaned.head()

### Duplicados

In [110]:
# duplicados

duplicates = titanic.duplicated()
print(headr("Duplicadas originales:"),duplicates.sum(),'\n', titanic[duplicates])

duplicates = titanic_cleaned.duplicated()
print(headr("Duplicadas tras limpieza:"),duplicates.sum(),'\n', titanic_cleaned[duplicates])


[4mDuplicadas originales:[0m
 0 
 Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []

[4mDuplicadas tras limpieza:[0m
 117 
      Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
47          1       3  female  28.0      0      0   7.7500        Q
55          1       1    male  28.0      0      0  35.5000        S
76          0       3    male  28.0      0      0   7.8958        S
77          0       3    male  28.0      0      0   8.0500        S
87          0       3    male  28.0      0      0   8.0500        S
..        ...     ...     ...   ...    ...    ...      ...      ...
870         0       3    male  26.0      0      0   7.8958        S
877         0       3    male  19.0      0      0   7.8958        S
878         0       3    male  28.0      0      0   7.8958        S
884         0       3    male  25.0      0      0   7.0500        S
886         0       2    male  27.0      0   

- Los duplicados son significativos y los dejaremos

### Volvemos a explorar con columnas codificadas

In [None]:
from sklearn.preprocessing import LabelEncoder

titanic_cleaned_coded = titanic_cleaned.copy()
titanic_cleaned_coded[categorical_columns] = titanic_cleaned[categorical_columns].apply(LabelEncoder().fit_transform)   
titanic_cleaned_coded.head()

In [None]:
# Evaluamos la aportación de cada columna
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

X= titanic_cleaned_coded.drop(target_column, axis=1)
y = titanic_cleaned_coded[target_column]

fvalue_selector = SelectKBest(f_classif, k=2)

X_kbest = fvalue_selector.fit(X,y)

feature_scores = pd.DataFrame({"Feature": X.columns,"Score": X_kbest.scores_}).sort_values(by="Score", ascending=False)

print(headr("Feature scores"))
round(feature_scores,2)

In [None]:
# Ver proporciones nuevamente

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


features = X.columns
num_features = len(features)

for i, feature in enumerate(features):
    print(headr(f"Graficando: {feature}"))
    sns.barplot(x=feature, y=target_column, data=titanic_cleaned_coded)
    plt.title(f"Tasa de {target_column} por {feature}")
    plt.xlabel(feature)
    plt.ylabel(target_column)
    plt.show()


## 3. Preprocesar Datos

### Eliminar constantes

In [111]:
# titanic_cleaned=titanic_cleaned.drop('Parch', axis=1)

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0) # seleccionar características con varianza mayor a 0 o el valor que se desee
sel.fit(titanic_cleaned_coded)

no_constant_columns = sel.get_feature_names_out()
constant_columns = titanic_cleaned.columns.drop(no_constant_columns)

print(headr("Columnas constantes:"), constant_columns)

titanic_cleaned = titanic_cleaned.drop(constant_columns, axis=1)
print(headr("Columnas finales:"), titanic_cleaned.columns)



[4mColumnas constantes:[0m
 Index(['Parch'], dtype='object')

[4mColumnas finales:[0m
 Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'], dtype='object')


### Preparar para entrenamiento

In [None]:
# Separar características y target
X = titanic_cleaned.drop(target_column, axis=1)
y = titanic_cleaned[target_column]

In [None]:
# Dividir en entrenamiento y prueba
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Candidatos

A partir del análisis previo:

1. `LogisticRegression` # No coincide con perfil de datos, para comparar
2. `RandomForestClassifier` 
3. `GradientBoostingClassifier`

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.compose import ColumnTransformer


### Piplines

In [None]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

print(headr("Numerical columns"), numerical_columns)
print(headr("Categorical columns"), categorical_columns)

In [None]:
preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical_columns),
        ('categorical', OneHotEncoder(), categorical_columns)
    ])

pipelines = {
    "LogisticRegression": Pipeline([('preprocessor', preprocessor),('classifier', LogisticRegression())]),
    "RandomForestClassifier": Pipeline([('preprocessor', preprocessor),('classifier', RandomForestClassifier())]),
    "GradientBoostingClassifier": Pipeline([('preprocessor', preprocessor),('classifier', GradientBoostingClassifier())]),
}

### CV

In [None]:
def cv_train(name, pipeline, cv):
    print(headr(f"Entrenar {name}"))
    cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring="accuracy", return_estimator=True, return_train_score=True)
    trained_model = cv_results["estimator"][0]
    scores = pd.DataFrame(cv_results)

    print("test score (mean-std): {0:.2f} - {1:.2f}".format(scores["test_score"].mean(), scores["test_score"].std()))
    print("train score (mean-std): {0:.2f} - {1:.2f}".format(scores["train_score"].mean(), scores["train_score"].std()))
    print("params:", pipeline.named_steps.get("classifier").get_params())

    y_pred = trained_model.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    return {"acc": round(scores["test_score"].mean(), 2), "f1": round(f1, 2),}

In [None]:
cvss = ShuffleSplit(n_splits=40, test_size=0.2, random_state=0)

results = {}

for name, pipeline in pipelines.items():
    results[name] = cv_train(name, pipeline, cvss)

print(headr("Resultados:"))
results_df=pd.DataFrame(results)
results_df

### Curva de aprenizaje

##

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, validation_curve

%matplotlib inline

In [None]:
# Curvas de aprendizaje

train_sizes = np.linspace(0.1, 1.0, num=5, endpoint=True)

def generate_learning_curves(name, pipeline, X, y, train_sizes):
    results = learning_curve(pipeline, X, y, train_sizes=train_sizes,
                             cv=cvss, scoring='accuracy')
    
    train_size, train_scores, test_scores = results[:3]

    # graficar la curva.
    plt.errorbar(train_size, train_scores.mean(axis=1),
                 yerr=train_scores.std(axis=1), label="Error de entrenamiento")
    plt.errorbar(train_size, test_scores.mean(axis=1),
                 yerr=test_scores.std(axis=1), label="Error de prueba")
    plt.legend()

    plt.xscale("log")
    plt.xlabel("Número de muestras en el conjunto de entrenamiento")
    plt.ylabel("MSE")
    plt.title("Curva de aprendizaje para {name}".format(name=name))

    plt.show()

In [None]:
for pipeline_name, pipeline_obj in pipelines.items():
    generate_learning_curves(pipeline_name, pipeline_obj, X, y, train_sizes)

### Curva validación

In [None]:
# Curvas de validación

def generate_validation_curves(name, pipeline, X, y, param_name, param_range):
    train_scores, test_scores = validation_curve(
        pipeline, X, y, param_name=param_name, param_range=param_range,
        cv=cvss, scoring="accuracy")

    # graficar la curva.
    plt.plot(param_range, train_scores.mean(
        axis=1), label="Error de entrenamiento")
    plt.plot(param_range, test_scores.mean(axis=1), label="Error de prueba")
    plt.legend()

    plt.xlabel("Valor del ({param_name})".format(
        param_name=param_name))
    plt.ylabel("Accuracy")
    plt.title("Curva de validación para {name}".format(name=name))

    plt.show()


In [None]:
pname = 'LogisticRegression'
Cs = [0,0.1, 1, 5, 10, 12]
generate_validation_curves(pname, pipelines[pname], X, y, 'classifier__C', Cs)

In [None]:
pname = 'RandomForestClassifier'
n_estimators_ops = [1,2,3,4,5]
generate_validation_curves(pname, pipelines[pname], X, y, 'classifier__n_estimators', n_estimators_ops)

In [None]:
pname = 'GradientBoostingClassifier'
n_estimators_ops = [1,2,3,4,5,10]
generate_validation_curves(pname, pipelines[pname], X, y, 'classifier__n_estimators', n_estimators_ops)

### Afinar hiperparámentros

In [None]:
from sklearn.model_selection import GridSearchCV

pname = 'GradientBoostingClassifier'

param_grid = {
    'classifier__n_estimators': [1, 2, 3, 4, 5, 10],
    'classifier__max_depth': [1, 2, 3, 4, 5],
    'classifier__learning_rate': [0.1, 0.01, 0.001]
}

grid_search = GridSearchCV(
    pipelines[pname], param_grid, cv=cvss, scoring="accuracy")

grid_search.fit(X_train, y_train)

print(headr(pname))
print("Mejores hiperparámetros:", grid_search.best_params_)
print(f"Mejor accuracy: {grid_search.best_score_:.2f}")

### *Afinar hiperparámetros otros modelos*

### re-entrenar + re-evaluar

In [None]:
preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical_columns),
        ('categorical', OneHotEncoder(), categorical_columns)
    ])

pipelines = {
    "LogisticRegression": Pipeline([('preprocessor', preprocessor),('classifier', LogisticRegression())]),
    "RandomForestClassifier": Pipeline([('preprocessor', preprocessor),('classifier', RandomForestClassifier())]),
    "GradientBoostingClassifier": Pipeline([('preprocessor', preprocessor),('classifier', GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=10))]),
}

In [None]:
results_final = {}

for name, pipeline in pipelines.items():
    results_final[name] = cv_train(name, pipeline, cvss)

## 4. Comparar resultados

In [None]:

print(headr("Resultados:"))
results_df = pd.DataFrame(results_final)
results_df

## 5. Resultado

GradientBoostingClassifier

## 6. Mejorar el resultado

- Añadiendo más características: por ejemplo, procesando "Cabin" para generar una categoría
- Usando otro tipo de codificación: por ejemplo, objetivo/codificación de media (target encoding)