In [None]:
import pandas as pd
import numpy as np

import configparser

In [None]:
# Load the raw training data and preview its first rows.
raw_train_csv = '../data/raw/train.csv'
dataset = pd.read_csv(raw_train_csv)
dataset.head()

In [47]:
# Load the pipeline settings from the INI file next to the notebooks folder.
config = configparser.ConfigParser()
loaded_config_files = config.read('../pipeline.cfg')

# ConfigParser.read() silently skips files it cannot open and only reports the
# ones it parsed, so fail here instead of with a confusing NoSectionError later.
if not loaded_config_files:
    raise FileNotFoundError(
        "Could not read '../pipeline.cfg'; check the notebook's working directory."
    )

# Show which file(s) were actually parsed.
loaded_config_files

['../pipeline.cfg']

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import CountFrequencyEncoder

In [65]:
# Name of the label column, read once and reused below.
target_name = config.get('GENERAL', 'TARGET')

# Columns excluded from the feature matrix: the configured drop list plus the target.
# Split on ',' and strip each entry so "a,b", "a, b" and an empty option all parse
# cleanly (a plain split(', ') would yield [''] for an empty option and break drop()).
drop_vars = [
    column.strip()
    for column in config.get('GENERAL', 'VARS_TO_DROP').split(',')
    if column.strip()
]
drop_vars.append(target_name)

x_features = dataset.drop(labels=drop_vars, axis=1)
y_target = dataset[target_name]

# 70/30 hold-out split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_features,
    y_target,
    test_size=0.3,
    shuffle=True,
    random_state=2025,
)

In [66]:
def _config_list(section, option):
    """Return the comma-separated ``option`` of ``section`` as a list of stripped names."""
    return [
        name.strip()
        for name in config.get(section, option).split(',')
        if name.strip()
    ]


# Every `variables=` argument receives a list of column names. Passing the raw
# config string would make a multi-column setting such as "A, B" be treated as
# one (non-existent) column called "A, B".
titanic_survived_model = Pipeline([
    # Impute missing values in continuous variables with the mean.
    ('continues_var_imputation', MeanMedianImputer(
        imputation_method='mean',
        variables=_config_list('CONTINUES', 'VARS_TO_IMPUTE'),
    )),

    # Impute missing values in categorical variables with the most frequent category.
    ('categorical_var_imputation', CategoricalImputer(
        imputation_method='frequent',
        variables=_config_list('CATEGORICAL', 'VARS_TO_IMPUTE'),
    )),

    # Encode categorical variables: one-hot for some, category counts for others.
    ('categorical_encode_ohe', OneHotEncoder(
        drop_last=True,
        variables=_config_list('CATEGORICAL', 'OHE_VARS'),
    )),
    ('categorical_encode_frequency', CountFrequencyEncoder(
        encoding_method='count',
        variables=_config_list('CATEGORICAL', 'FREQUENCY_ENC_VARS'),
    )),

    # Feature scaling. The step name keeps its original spelling because it is a
    # runtime key of the pipeline (named_steps / pickled artifact).
    ('fueature_scaling', StandardScaler())
])

In [67]:
# Fit every pipeline step using the training split only.
titanic_survived_model.fit(X=x_train)

In [91]:
# Apply all steps before the scaler first: those steps output a DataFrame whose
# columns reflect the encoding (one-hot encoding removes the original columns and
# appends the dummy columns), so its header is the correct one for the scaled array.
# Reusing x_train.columns would mislabel the columns or fail on a shape mismatch.
x_train_encoded = titanic_survived_model[:-1].transform(x_train)
x_features_processed = titanic_survived_model[-1].transform(x_train_encoded)
df_features_processed = pd.DataFrame(x_features_processed, columns=x_train_encoded.columns)

# Attach the target positionally under its own name (the configured target column)
# instead of a hardcoded label; to_numpy() avoids index alignment with the shuffled split.
df_features_processed[y_train.name] = y_train.to_numpy()

df_features_processed.to_csv('../data/processed/features_for_models.csv', index=False)

In [88]:
# Apply all steps before the scaler first: those steps output a DataFrame whose
# columns reflect the encoding (one-hot encoding removes the original columns and
# appends the dummy columns), so its header is the correct one for the scaled array.
# Reusing x_test.columns would mislabel the columns or fail on a shape mismatch.
x_test_encoded = titanic_survived_model[:-1].transform(x_test)
x_features_processed_test = titanic_survived_model[-1].transform(x_test_encoded)
df_features_processed_test = pd.DataFrame(x_features_processed_test, columns=x_test_encoded.columns)

# Attach the target positionally under its own name (the configured target column)
# instead of a hardcoded label; to_numpy() avoids index alignment with the shuffled split.
df_features_processed_test[y_test.name] = y_test.to_numpy()

df_features_processed_test.to_csv('../data/processed/test_dataset.csv', index=False)

In [89]:
import pickle

# Persist the fitted preprocessing pipeline so other steps can reload it.
with open('../artifacts/pipeline.pkl', 'wb') as artifact_file:
    pickle.Pickler(artifact_file).dump(titanic_survived_model)