# Data Preparation

O objetivo aqui é preparar os dados para o treinamento, criando um pipeline ajustado (fit) apenas no conjunto de treinamento e reutilizado no conjunto de teste, evitando o vazamento de informações de teste para o treinamento.

#Imports

In [0]:
import pandas as pd
import numpy as np
from unidecode import unidecode
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

import joblib

#Funções

In [0]:
def create_imputer(col):
    """Build the ``SimpleImputer`` configured for ``col``.

    The configuration is looked up in ``IMPUTATION[col]``, which is either a
    strategy name or a tuple whose first item is the strategy and whose second
    item is the fill value.

    NOTE(review): ``IMPUTATION`` is not defined in this section of the file;
    it is assumed to be a module-level mapping declared elsewhere -- confirm.

    Raises:
        KeyError: if ``col`` has no entry in ``IMPUTATION``.
    """
    spec = IMPUTATION[col]
    if not isinstance(spec, tuple):
        return SimpleImputer(strategy=spec)
    return SimpleImputer(strategy=spec[0], fill_value=spec[1])
    
def prepare_data(X):
    """Return a copy of ``X`` with the coded columns cast to categorical.

    For each of 'fumante', 'regiao', 'facebook' and 'classe' that exists in
    ``X``, the values are converted to numbers (anything unparseable becomes
    NaN) and the result is stored with the pandas ``category`` dtype. Columns
    that are absent are skipped, and the input frame itself is left untouched.
    """
    prepared = X.copy()
    coded_columns = ('fumante', 'regiao', 'facebook', 'classe')
    present = [name for name in coded_columns if name in prepared.columns]
    for name in present:
        # Go through a numeric dtype first so the categories are numbers.
        as_number = pd.to_numeric(prepared[name], errors='coerce')
        prepared[name] = as_number.astype('category')
    return prepared


def is_binary(series):
    """Return True when ``series`` holds exactly two distinct non-null values."""
    distinct_count = series.nunique()
    return distinct_count == 2

In [0]:
class SafeColumnTransformer(BaseEstimator, TransformerMixin):
    """Apply ``transformer`` to a subset of DataFrame columns, tolerating absent ones.

    Only the requested columns that actually exist in the frame passed to
    ``fit`` are used. At ``transform`` time any of those columns missing from
    the input is supplied as an all-NaN column, and the columns are put in the
    order seen during ``fit`` before being handed to the wrapped transformer.

    Parameters:
        transformer: estimator exposing ``fit`` and ``transform``. It is fitted
            in place (not cloned).
        columns: names of the columns this wrapper is responsible for.
    """

    def __init__(self, transformer, columns):
        self.transformer = transformer
        self.columns = columns
        # Starts empty, so an instance that was never fitted behaves like one
        # fitted on a frame containing none of ``columns``.
        self.columns_present_fit_ = []

    def fit(self, X, y=None):
        """Record which requested columns exist in ``X`` and fit on those.

        When none of the requested columns exist, the wrapped transformer is
        left unfitted. Returns ``self``.
        """
        self.columns_present_fit_ = [col for col in self.columns if col in X.columns]
        if self.columns_present_fit_:
            self.transformer.fit(X[self.columns_present_fit_], y)
        return self

    def transform(self, X):
        """Transform the columns seen at fit time; ``X`` itself is not modified.

        Returns the wrapped transformer's output, or an array of shape
        ``(len(X), 0)`` when no column was available at fit time.
        """
        cols = self.columns_present_fit_
        if not cols:
            # The wrapped transformer was never fitted, so calling it would
            # fail; contribute zero features instead.
            return np.empty((len(X), 0))

        # ``reindex`` returns a new frame: columns come out in fit order and
        # any column absent from ``X`` is created filled with NaN, without
        # writing into the caller's DataFrame.
        X_ordered = X.reindex(columns=cols)

        return self.transformer.transform(X_ordered)

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names as an array.

        ``input_features`` is accepted for API compatibility and ignored.
        Falls back to the fitted column names when the wrapped transformer
        does not implement ``get_feature_names_out``.
        """
        if not self.columns_present_fit_:
            # Matches the zero-width output produced by ``transform``.
            return np.array([], dtype=object)
        try:
            return self.transformer.get_feature_names_out()
        except AttributeError:
            return np.array(self.columns_present_fit_)


#Lendo dados de treino

In [0]:
# Load the 'MODELAGEM' sheet of the modelling workbook into a DataFrame.
df = pd.read_excel(
    'data/Seguro Saúde - Modelagem.xlsx',
    sheet_name='MODELAGEM',
)

# Lower-cases the column names and transliterates them to ASCII.
def normalize_columns(df):
    """Rename the columns of ``df`` in place to lower-case ASCII and return it.

    Each column name is passed through ``unidecode`` and then lower-cased.
    The same DataFrame object that was passed in is returned.
    """
    cleaned_names = []
    for name in df.columns:
        cleaned_names.append(unidecode(name).lower())
    df.columns = cleaned_names
    return df

# Normalises the string values held in the text (object-dtype) columns.
def normalize_text_values(df):
    """Lower-case and transliterate every string cell of the object columns.

    The frame is modified in place and returned. Non-string cells (numbers,
    NaN, ...) inside object columns are left exactly as they are.
    """
    def _clean(value):
        # Only real strings are rewritten; everything else passes through.
        if not isinstance(value, str):
            return value
        return unidecode(value).lower()

    text_columns = df.select_dtypes(include=['object']).columns
    for name in text_columns:
        df[name] = df[name].apply(_clean)
    return df

# Clean the headers first, then the text cells.
df = normalize_text_values(normalize_columns(df))

# Feature frame: every column except 'nascimento' and 'valor'.
x_train = df.drop(columns=['nascimento', 'valor'])
# 'valor' kept side by side with the 'matricula' column.
y_train = df[['matricula','valor']]

# Preview both frames.
for preview in (x_train.head(), y_train.head()):
    display(preview)

#Criando Pipeline de data prep

In [0]:
# Columns treated as numeric features.
NUMERIC_COLS = ['idade', 'imc', 'filhos']
# Columns treated as categorical features.
CATEGORICAL_COLS = ['sexo', 'signo', 'fumante', 'regiao', 'facebook', 'classe']

# Categorical columns with exactly two distinct non-null values in the training data.
binary_cols = [col for col in CATEGORICAL_COLS if is_binary(x_train[col])]
# Remaining categorical columns, kept in CATEGORICAL_COLS order. A set
# difference would yield an order that depends on string hashing, which can
# differ between interpreter runs and would reorder the encoded features.
non_binary_cats = [col for col in CATEGORICAL_COLS if col not in binary_cols]

In [0]:
# Three parallel branches whose outputs are concatenated column-wise:
#   numeric       -> median imputation, then standardisation
#   binary_cat    -> most-frequent imputation, then one-hot with the first level dropped
#   nonbinary_cat -> most-frequent imputation, then full one-hot encoding
preprocessor = FeatureUnion(transformer_list=[
    ('numeric', SafeColumnTransformer(
        Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), NUMERIC_COLS)),

    # NOTE(review): this encoder keeps the default handle_unknown='error', so a
    # value unseen during fit still raises at transform time.
    ('binary_cat', SafeColumnTransformer(
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', sparse_output=False))
        ]), binary_cols)),

    # handle_unknown='ignore': a category that appears only in later data is
    # encoded as all zeros instead of aborting the transform.
    ('nonbinary_cat', SafeColumnTransformer(
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), non_binary_cats)),
])

In [0]:
# Cast the coded columns to categorical, then fit the preprocessor on the
# training features and transform them in the same call.
x_train_prepared = prepare_data(x_train)
x_train_transformed = preprocessor.fit_transform(x_train_prepared)

# Label the transformed matrix with the names reported by the preprocessor.
feature_names = preprocessor.get_feature_names_out()
x_train_df = pd.DataFrame(data=x_train_transformed, columns=feature_names)
train_preview = x_train_df.head()
display(train_preview)

Os dados agora estão preparados para serem processados por qualquer tipo de algoritmo de machine learning.

Agora queremos salvar este pipeline para que seja utilizado posteriormente nos dados de teste.

#Salvando o pipeline e dados preparados

In [0]:
# Wrap the dtype-casting helper so it can run as a pipeline step.
prepare_transformer = FunctionTransformer(func=prepare_data)

# Single pipeline: cast the coded columns first, then run the preprocessor.
pipeline_steps = [
    ('prepare', prepare_transformer),
    ('preprocess', preprocessor),
]
full_pipeline = Pipeline(steps=pipeline_steps)

In [0]:
# Fit the complete pipeline on the raw training features and keep its output.
# The result must be captured here; otherwise the frame below would be built
# from whatever an earlier cell left in ``x_train_transformed``.
x_train_transformed = full_pipeline.fit_transform(x_train)

# Name the columns after the preprocessing step's output features and reuse
# the training index so rows stay aligned with ``x_train``.
feature_names = full_pipeline.named_steps['preprocess'].get_feature_names_out()
x_train_df = pd.DataFrame(x_train_transformed, columns=feature_names, index=x_train.index)
display(x_train_df.head())

In [0]:
# Show the (rows, columns) dimensions of the prepared training frame.
x_train_df.shape

In [0]:
# Attach 'matricula' (aligned on the row index). The set_index/reset_index
# round trip moves it to the first column and leaves a fresh default index.
with_matricula = x_train_df.assign(matricula=x_train['matricula'])
x_train_df = with_matricula.set_index('matricula').reset_index()
display(x_train_df.head())
# Persist the prepared training features.
x_train_df.to_parquet('data/x_train_prepared.parquet')

In [0]:
# Fit the full pipeline on the training features.
# NOTE(review): an earlier cell already calls full_pipeline.fit_transform(x_train),
# so this appears to refit the same pipeline on the same data -- confirm it is needed.
full_pipeline.fit(x_train)

In [0]:
# Serialise the fitted pipeline to disk for later use.
# NOTE(review): pickle-based serialisation stores functions and classes by
# reference, so loading this file presumably requires `prepare_data` and
# `SafeColumnTransformer` to be importable under the same module path --
# confirm against the code that loads it. The 'artifacts/' directory is
# assumed to exist already.
joblib.dump(full_pipeline, 'artifacts/pipeline_completo.pkl')