In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
# Load the raw training CSV and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="../data/raw/train.csv")
dataset.head(n=5)

In [3]:
# Pipeline configuration: the label column and the column groups used by the
# preprocessing steps.
TARGET = 'Survived'

# Columns removed from the feature matrix; the label itself is dropped too.
VARS_TO_DROP = [
    'PassengerId',
    'Cabin',
    'Name',
    'Ticket',
    TARGET,
]

# Columns whose missing values are imputed, split by variable type.
CONTINUE_VARS_TO_IMPUTATION = [
    'Age',
    'Fare',
]
CATEGORICAL_VARS_TO_IMPUTATION = [
    'Embarked',
    'Sex',
]

# Categorical columns, grouped by the encoding applied to them.
OHE_VAR_TO_ENCODE = [
    'Sex',
]
FREQENC_VARS_TO_ENCODE = [
    'Embarked',
]

In [4]:
# Separate the label from the predictors, then hold out part of the rows.
y_target = dataset[TARGET]
x_features = dataset.drop(columns=VARS_TO_DROP)

x_train, x_test, y_train, y_test = train_test_split(
    x_features,
    y_target,
    test_size=0.3,      # 30% of the rows go to the test split
    random_state=2025,  # fixed seed so the split is reproducible
    shuffle=True,
)

In [5]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import CountFrequencyEncoder 

from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing pipeline: imputation, then categorical encoding, then scaling.
# Steps are applied in the order listed.
titanic_survived_predict_model = Pipeline(steps=[
    # Impute missing values of the continuous variables with the mean.
    (
        'continues_var_mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=CONTINUE_VARS_TO_IMPUTATION,
        ),
    ),

    # Impute missing values of the categorical variables with the most
    # frequent category.
    (
        'categorical_var_freq_imputation',
        CategoricalImputer(
            imputation_method='frequent',
            variables=CATEGORICAL_VARS_TO_IMPUTATION,
        ),
    ),

    # Encode the categorical variables: one-hot (last dummy dropped) and
    # count encoding.
    (
        'categorical_encoding_ohe',
        OneHotEncoder(
            variables=OHE_VAR_TO_ENCODE,
            drop_last=True,
        ),
    ),
    (
        'caregorical_encoding_freq_enc',
        CountFrequencyEncoder(
            encoding_method='count',
            variables=FREQENC_VARS_TO_ENCODE,
        ),
    ),

    # Standardize all resulting features.
    (
        'feature_scaling',
        StandardScaler(),
    ),
])

In [7]:
# Fit every pipeline step on the training split only.
titanic_survived_predict_model.fit(X=x_train)

In [None]:
# Transform the training features with the fitted pipeline. The result is
# wrapped in a DataFrame below because the final scaling step does not, by
# default, return labelled columns.
x_fatures_processed = titanic_survived_predict_model.transform(x_train)

# Recover the column labels from the steps that precede the scaler. The
# one-hot encoder removes 'Sex' and appends its dummy column at the end of the
# frame, so x_train.columns has the right length but the wrong names/order and
# would silently mislabel the scaled values.
processed_columns = titanic_survived_predict_model[:-1].transform(x_train).columns
df_fatures_process = pd.DataFrame(x_fatures_processed, columns=processed_columns)

# Attach the label positionally: y_train rows are in the same order as x_train
# rows, while the new frame has a fresh 0..n-1 index.
df_fatures_process[TARGET] = y_train.to_numpy()

# Save the processed data used to train the models.
df_fatures_process.to_csv('../data/processed/features_for_model.csv', index=False)
df_fatures_process.head()

In [17]:
import pickle

# Build the test-set export as a new frame instead of adding the label to
# x_test in place: mutating x_test would leak the target into the held-out
# features and break any later pipeline.transform(x_test) call.
test_dataset = x_test.assign(**{TARGET: y_test})
test_dataset.to_csv('../data/processed/test_dataset.csv', index=False)

# Persist the fitted preprocessing pipeline so it can be reloaded later.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open('../artifacts/pipeline.pkl', 'wb') as f:
    pickle.dump(titanic_survived_predict_model, f)

In [16]:
# Inspect the held-out labels produced by the train/test split.
y_test

622    1
25     1
307    1
783    0
834    0
      ..
683    0
562    0
796    1
479    1
150    0
Name: Survived, Length: 268, dtype: int64

In [11]:
# Inspect the held-out feature frame produced by the train/test split.
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
622,3,male,20.0,1,1,15.7417,C
25,3,female,38.0,1,5,31.3875,S
307,1,female,17.0,1,0,108.9000,C
783,3,male,,1,2,23.4500,S
834,3,male,18.0,0,0,8.3000,S
...,...,...,...,...,...,...,...
683,3,male,14.0,5,2,46.9000,S
562,2,male,28.0,0,0,13.5000,S
796,1,female,49.0,0,0,25.9292,S
479,3,female,2.0,0,1,12.2875,S
