# **Ejercicios de selección de candidatos**

## 2. Clasificación 
Para el dataset del titanic (data/titanic)
- Selecciona los 3 mejores modelos candidatos para este problema de clasificación basándote en su rendimiento (usando las métricas de precisión y F1-score).

In [None]:
# Formato de los prints
class color:
    """ANSI escape sequences used to format text printed to the terminal/notebook."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset sequence appended after formatted text
    END = '\033[0m'

def headr(text):
    """Return `text` underlined, with a newline before and after, for use as a print header."""
    pieces = ['\n', color.UNDERLINE, text, color.END, '\n']
    return ''.join(pieces)

In [None]:
# Load the data
import pandas as pd

# The path is relative, so it resolves against the notebook's working directory.
titanic = pd.read_csv("../../data/titanic/train.csv")
# Name of the column used as the prediction target further down.
target_column = "Survived"

# (rows, columns) of the raw dataset
titanic.shape


## 1. Explorar

In [None]:
# Summary statistics; with no arguments pandas `describe()` reports on the numeric columns.
titanic.describe()

In [None]:
# Same summary restricted to the object-dtype columns.
titanic.describe(include=object)

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Build a profiling report of the raw dataset with ydata-profiling.
from ydata_profiling import ProfileReport
profile = ProfileReport(titanic, title="Titanic Profiling Report")

In [None]:
# Render the report inline in the notebook.
profile.to_notebook_iframe()

- Datos desbalanceados
- Datos no normales
- Datos no lineales

## 2. Limpiar


In [None]:
# Drop the columns that will not be used as features

discarded_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
titanic_cleaned = titanic.drop(columns=discarded_columns)
titanic_cleaned.head()

In [None]:
# Summary statistics of the remaining columns after the drop.
titanic_cleaned.describe()

In [None]:
# Summary of the remaining object-dtype columns.
titanic_cleaned.describe(include=object)

In [None]:
from sklearn.compose import make_column_selector as selector

# Split the columns by dtype: object dtype is treated as categorical,
# every other dtype as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Applied to the cleaned frame, which at this point still contains the target column.
numerical_columns = numerical_columns_selector(titanic_cleaned)
categorical_columns = categorical_columns_selector(titanic_cleaned)

print(headr("Numerical columns"), numerical_columns)
print(headr("Categorical columns"), categorical_columns)

In [None]:
# Missing values

# Number of NaN entries per column before any imputation.
print(headr("Valores faltantes - original: "), titanic_cleaned.isna().sum())

In [None]:
# Impute missing values
# The filled column is assigned back instead of calling
# `titanic_cleaned[col].fillna(..., inplace=True)`: that form is chained
# assignment, which raises a FutureWarning on pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is the default (pandas 3.0).
# Age (numeric) is filled with its median, Embarked (categorical) with its mode.
titanic_cleaned["Age"] = titanic_cleaned["Age"].fillna(titanic_cleaned["Age"].median())
titanic_cleaned["Embarked"] = titanic_cleaned["Embarked"].fillna(titanic_cleaned["Embarked"].mode()[0])

# Per-column NaN counts after imputation.
print(headr("Valores faltantes - imputados: "), titanic_cleaned.isna().sum())

In [None]:
# Outliers

# identification: a cell is flagged when it lies more than 3 * IQR below the
# first quartile or above the third quartile of its column
numeric_data = titanic_cleaned[numerical_columns]
first_quartile = numeric_data.quantile(0.25)
third_quartile = numeric_data.quantile(0.75)

IQR = third_quartile - first_quartile
lower_bound = first_quartile - (IQR * 3)
upper_bound = third_quartile + (IQR * 3)

print(headr('lower_bound'), lower_bound)
print(headr('upper_bound'), upper_bound)

# boolean-frame indexing keeps the flagged cells and turns the rest into NaN
is_outlier = (numeric_data < lower_bound) | (numeric_data > upper_bound)
outliers = numeric_data[is_outlier]

print(headr("Outliers:"))
outliers.count()

In [None]:
# replacement

# columns with outliers: those with at least one non-NaN cell in `outliers`
columns_with_outliers = outliers.columns[outliers.count() > 0]
columns_with_outliers

In [None]:
# Helper to winsorize a single column
def winsorize_column(column, lower_bound, upper_bound):
    """Return `column` with values limited to the range [lower_bound, upper_bound].

    The input is not modified; the result of `column.clip(...)` is returned.
    """
    limits = {"lower": lower_bound, "upper": upper_bound}
    return column.clip(**limits)

In [None]:
# process every column that has outliers
for col_name in columns_with_outliers:
    col_low, col_high = lower_bound[col_name], upper_bound[col_name]
    titanic_cleaned[col_name] = winsorize_column(titanic_cleaned[col_name], col_low, col_high)

# recompute the outlier table on the winsorized data
numeric_data = titanic_cleaned[numerical_columns]
is_outlier = (numeric_data < lower_bound) | (numeric_data > upper_bound)
outliers = numeric_data[is_outlier]

print(headr("Outliers - winsorized:"))
outliers.count()

In [None]:
# Display the cleaned dataset.
titanic_cleaned

## 3. Preprocesar Datos

In [None]:
# Separate the target vector from the feature matrix
y = titanic_cleaned[target_column]
X = titanic_cleaned.drop(columns=[target_column])

In [None]:
# Split into training and test sets
from sklearn.model_selection import train_test_split

# 80/20 split; the fixed random_state keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Candidatos

A partir del análisis previo:

1. `LogisticRegression`
2. `RandomForestClassifier` 
3. `GradientBoostingClassifier`

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.compose import ColumnTransformer


### Pipelines

In [None]:
# Rebuild the column lists from X (features only), so the target column that
# was dropped from X is no longer among them.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

print(headr("Numerical columns"), numerical_columns)
print(headr("Categorical columns"), categorical_columns)

In [None]:
# Shared preprocessing: standardise the numeric features and one-hot encode
# the categorical ones.
# `handle_unknown="ignore"` stops a split from failing when a category is
# absent from the rows used for fitting but present in the rows being scored,
# which can happen with the small training subsets drawn for learning curves.
preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical_columns),
        ('categorical', OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ])

# Candidate models, each behind the same preprocessing step.
# The tree ensembles get a fixed `random_state` so their scores are
# reproducible, in line with the fixed seeds used for the train/test split
# and the cross-validation splitter.
pipelines = {
    "LogisticRegression": Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression())]),
    "RandomForestClassifier": Pipeline([('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42))]),
    "GradientBoostingClassifier": Pipeline([('preprocessor', preprocessor), ('classifier', GradientBoostingClassifier(random_state=42))]),
}

### CV

In [None]:
def cv_train(name, pipeline, cv):
    """Cross-validate `pipeline` on the training split and summarise accuracy and F1.

    Both metrics come from the same cross-validation run over the module-level
    `X_train` / `y_train`, so they are measured on the same folds and are
    directly comparable. The held-out test split is not used here, which keeps
    it out of model selection.

    Args:
        name: Label shown in the printed header.
        pipeline: scikit-learn pipeline containing a step named "classifier".
        cv: Cross-validation splitter passed to `cross_validate`.

    Returns:
        dict with keys "acc" and "f1": the mean validation accuracy and mean
        validation F1 across the splits, each rounded to 2 decimals.
    """
    print(headr(f"Entrenar {name}"))
    # Multi-metric scoring yields `test_<metric>` / `train_<metric>` columns.
    cv_results = cross_validate(
        pipeline, X_train, y_train, cv=cv,
        scoring=("accuracy", "f1"), return_train_score=True)
    scores = pd.DataFrame(cv_results)

    print("test score (mean-std): {0:.2f} - {1:.2f}".format(scores["test_accuracy"].mean(), scores["test_accuracy"].std()))
    print("train score (mean-std): {0:.2f} - {1:.2f}".format(scores["train_accuracy"].mean(), scores["train_accuracy"].std()))
    print("test f1 (mean-std): {0:.2f} - {1:.2f}".format(scores["test_f1"].mean(), scores["test_f1"].std()))
    print("params:", pipeline.named_steps.get("classifier").get_params())

    return {
        "acc": round(scores["test_accuracy"].mean(), 2),
        "f1": round(scores["test_f1"].mean(), 2),
    }

In [None]:
# 40 random splits, each holding out 20% for validation; fixed seed for reproducibility
cvss = ShuffleSplit(n_splits=40, test_size=0.2, random_state=0)

# cross-validate every candidate pipeline, keyed by model name
results = {
    model_name: cv_train(model_name, candidate, cvss)
    for model_name, candidate in pipelines.items()
}

print(headr("Resultados:"))
results_df = pd.DataFrame(results)
results_df

### Curva de aprendizaje

##

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, validation_curve

%matplotlib inline

In [None]:
# Learning curves

# Five evenly spaced training-set fractions from 10% to 100%.
train_sizes = np.linspace(0.1, 1.0, num=5, endpoint=True)

def generate_learning_curves(name, pipeline, X, y, train_sizes):
    """Plot training and validation accuracy of `pipeline` against training-set size.

    Scores are computed with `learning_curve` using the module-level `cvss`
    splitter and accuracy as the metric; the figure is shown immediately.

    Args:
        name: Model label used in the plot title.
        pipeline: Estimator/pipeline to evaluate.
        X: Feature matrix.
        y: Target vector.
        train_sizes: Training-set sizes passed through to `learning_curve`.
    """
    results = learning_curve(pipeline, X, y, train_sizes=train_sizes,
                             cv=cvss, scoring='accuracy')

    train_size, train_scores, test_scores = results[:3]

    # Plot the mean score per training size, with the std across splits as error bars.
    # The curves are accuracies (scoring='accuracy'), so the legend and the
    # y-axis are labelled as accuracy rather than as an error / MSE.
    plt.errorbar(train_size, train_scores.mean(axis=1),
                 yerr=train_scores.std(axis=1), label="Accuracy de entrenamiento")
    plt.errorbar(train_size, test_scores.mean(axis=1),
                 yerr=test_scores.std(axis=1), label="Accuracy de validación")
    plt.legend()

    plt.xscale("log")
    plt.xlabel("Número de muestras en el conjunto de entrenamiento")
    plt.ylabel("Accuracy")
    plt.title(f"Curva de aprendizaje para {name}")

    plt.show()

In [None]:
# One learning-curve figure per candidate pipeline, computed on the full X / y.
for pipeline_name, pipeline_obj in pipelines.items():
    generate_learning_curves(pipeline_name, pipeline_obj, X, y, train_sizes)

### Curva de validación

In [None]:
# Curvas de validación

def generate_validation_curves(name, pipeline, X, y, param_name, param_range):
    """Plot training and validation accuracy of `pipeline` as one hyperparameter varies.

    Scores are computed with `validation_curve` using the module-level `cvss`
    splitter and accuracy as the metric; the figure is shown immediately.

    Args:
        name: Model label used in the plot title.
        pipeline: Estimator/pipeline to evaluate.
        X: Feature matrix.
        y: Target vector.
        param_name: Name of the parameter to vary (e.g. "classifier__C").
        param_range: Values of that parameter to evaluate.
    """
    train_scores, test_scores = validation_curve(
        pipeline, X, y, param_name=param_name, param_range=param_range,
        cv=cvss, scoring="accuracy")

    # Plot the mean score across splits for every parameter value.
    # The curves are accuracies, so the legend says so (it previously read
    # "Error ...", contradicting the y-axis label).
    plt.plot(param_range, train_scores.mean(axis=1),
             label="Accuracy de entrenamiento")
    plt.plot(param_range, test_scores.mean(axis=1),
             label="Accuracy de validación")
    plt.legend()

    plt.xlabel(f"Valor del ({param_name})")
    plt.ylabel("Accuracy")
    plt.title(f"Curva de validación para {name}")

    plt.show()


In [None]:
pname = 'LogisticRegression'
# Inverse regularisation strengths to scan. C must be strictly positive
# (scikit-learn rejects 0, so every fit at that value fails), hence the grid
# starts at a small positive value.
Cs = [0.01, 0.1, 1, 5, 10, 12]
generate_validation_curves(pname, pipelines[pname], X, y, 'classifier__C', Cs)

In [None]:
pname = 'RandomForestClassifier'
# Values of the classifier's `n_estimators` parameter to scan.
n_estimators_ops = [1,2,3,4,5]
generate_validation_curves(pname, pipelines[pname], X, y, 'classifier__n_estimators', n_estimators_ops)

In [None]:
pname = 'GradientBoostingClassifier'
# Values of the classifier's `n_estimators` parameter to scan.
n_estimators_ops = [1,2,3,4,5,10]
generate_validation_curves(pname, pipelines[pname], X, y, 'classifier__n_estimators', n_estimators_ops)

### Afinar hiperparámetros

In [None]:
from sklearn.model_selection import GridSearchCV

pname = 'GradientBoostingClassifier'

# Hyperparameter grid; the `classifier__` prefix routes each parameter to the
# pipeline step named "classifier".
param_grid = {
    'classifier__n_estimators': [1, 2, 3, 4, 5, 10],
    'classifier__max_depth': [1, 2, 3, 4, 5],
    'classifier__learning_rate': [0.1, 0.01, 0.001]
}

# Every combination in the grid is scored by accuracy with the `cvss` splitter.
grid_search = GridSearchCV(
    pipelines[pname], param_grid, cv=cvss, scoring="accuracy")

# The search only sees the training split.
grid_search.fit(X_train, y_train)

print(headr(pname))
print("Mejores hiperparámetros:", grid_search.best_params_)
print(f"Mejor accuracy: {grid_search.best_score_:.2f}")

### *Afinar hiperparámetros otros modelos*

### re-entrenar + re-evaluar

In [None]:
# Rebuild the preprocessing step and the candidate pipelines for the final run.
# `handle_unknown="ignore"` stops a split from failing when a category is
# absent from the rows used for fitting but present in the rows being scored.
preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical_columns),
        ('categorical', OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ])

# The tree ensembles get a fixed `random_state` so the final scores are
# reproducible, in line with the fixed seeds used for the train/test split
# and the cross-validation splitter.
# NOTE(review): the GradientBoosting hyperparameters are hard-coded; presumably
# they are the best ones reported by the grid search - confirm they still match
# `grid_search.best_params_`.
pipelines = {
    "LogisticRegression": Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression())]),
    "RandomForestClassifier": Pipeline([('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42))]),
    "GradientBoostingClassifier": Pipeline([('preprocessor', preprocessor), ('classifier', GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=10, random_state=42))]),
}

In [None]:
# cross-validate the re-built pipelines, keyed by model name
results_final = {
    model_name: cv_train(model_name, candidate, cvss)
    for model_name, candidate in pipelines.items()
}

## 4. Comparar resultados

In [None]:

# Final comparison table: one column per model, one row per metric returned by cv_train.
print(headr("Resultados:"))
results_df = pd.DataFrame(results_final)
results_df

## 5. Resultado

GradientBoostingClassifier

## 6. Mejorar el resultado

Añadiendo más características: por ejemplo, procesando "Cabin" para generar una categoría