In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
dataset = pd.read_csv("../data/raw/train.csv")
dataset.head()

In [3]:
# configuración del Pipeline
TARGET = 'Survived'
VARS_TO_DROP = ['PassengerId', 'Cabin', 'Name', 'Ticket', TARGET]
CONTINUE_VARS_TO_IMPUTATION = ['Age', 'Fare']
CATEGORICAL_VARS_TO_IMPUTATION = ['Embarked', 'Sex']
OHE_VAR_TO_ENCODE = ['Sex']
FREQENC_VARS_TO_ENCODE = ['Embarked']

In [4]:
x_features = dataset.drop(labels=VARS_TO_DROP, axis=1)
y_target = dataset[TARGET]
x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.3, shuffle=True, random_state=2025)

In [5]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import CountFrequencyEncoder 

from sklearn.preprocessing import StandardScaler

In [6]:
titanic_survived_predict_model = Pipeline([
    # imputación de variabels continuas.
    ('continues_var_mean_imputation', MeanMedianImputer(imputation_method='mean', variables=CONTINUE_VARS_TO_IMPUTATION)),

    # imputación de variabels categoricas
    ('categorical_var_freq_imputation', CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_TO_IMPUTATION)),

    #Codificación de variabales categóricas 
    ('categorical_encoding_ohe', OneHotEncoder(variables=OHE_VAR_TO_ENCODE, drop_last=True)),
    ('caregorical_encoding_freq_enc', CountFrequencyEncoder(encoding_method='count', variables=FREQENC_VARS_TO_ENCODE)),

    # estandarización de variables.
    ('feature_scaling', StandardScaler())
])

In [7]:
# configuramos pipeline
titanic_survived_predict_model.fit(x_train)

In [8]:
x_fatures_processed = titanic_survived_predict_model.transform(x_train)
df_fatures_process = pd.DataFrame(x_fatures_processed, columns=x_train.columns)
df_fatures_process[TARGET] = y_train
df_fatures_process[TARGET] = df_fatures_process[TARGET].fillna(0).astype(int)

df_fatures_process.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,-0.393652,-0.3466664,-0.474492,-0.485462,-0.38617,0.609696,0.767649,0
1,0.814452,2.768752e-16,-0.474492,-0.485462,-0.493229,-1.90115,0.767649,0
2,0.814452,-0.9701339,2.810045,1.934079,-0.489252,0.609696,-1.302678,1
3,0.814452,-0.7363336,-0.474492,-0.485462,-0.568721,-1.499415,0.767649,1
4,-1.601756,1.445803,0.346642,-0.485462,0.908274,-1.499415,0.767649,0


In [9]:
import pickle

# guardamos los datos para entrenar los modelos.
df_fatures_process.to_csv('../data/processed/features_for_model.csv', index=False)
x_test.to_csv('../data/processed/features_for_models.csv', index=False)

with open('../artifacts/pipeline.pkl', 'wb') as f:
    pickle.dump(titanic_survived_predict_model, f)