# Tech Challenge FIAP - Etapa 6: Validação Estatística

In [None]:
# Importação das bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Global notebook configuration: seed NumPy's legacy global RNG and select the
# matplotlib style sheet used by the figures, then confirm the setup cell ran.
np.random.seed(42)
plt.style.use('seaborn-v0_8')

print("Bibliotecas importadas com sucesso!")

Bibliotecas importadas com sucesso!


## 1. Carregamento do Dataset

Vamos carregar o dataset pré-processado que já passou pelas etapas de limpeza, transformação e feature engineering.

In [2]:
# NOTE(review): this is an absolute, machine-specific path (one user's kagglehub
# cache). On any other machine it must be edited to point at the local copy of
# the pre-processed CSV before the notebook can run.
dataset_path = r"C:\Users\prado\.cache\kagglehub\datasets\awaiskaggler\insurance-csv\versions\1\insurance_preprocessed.csv"

try:
    df = pd.read_csv(dataset_path)
    print("Dataset carregado com sucesso!")
    print(f"Colunas: {list(df.columns)}")
    print("\nPrimeiras 5 linhas do dataset:")
    display(df.head())
    print("\nInformações do dataset:")
    # DataFrame.info() prints its report itself and returns None, so wrapping it
    # in display() only appended a stray "None" to the cell output.
    df.info()
except FileNotFoundError:
    # Only a friendly message is shown here: `df` stays undefined on this path,
    # so the modelling cells that read `df` cannot run until the path is fixed.
    print("Arquivo não encontrado. Verifique o caminho do dataset.")
    print("Caminho esperado:", dataset_path)

Dataset carregado com sucesso!
Colunas: ['age', 'bmi', 'children', 'expenses', 'log_expenses', 'smoker_encoded', 'sex_male', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest', 'age_bmi_interaction', 'age_squared', 'bmi_normal', 'bmi_obese', 'bmi_overweight', 'bmi_underweight', 'high_risk']

Primeiras 5 linhas do dataset:


Unnamed: 0,age,bmi,children,expenses,log_expenses,smoker_encoded,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,age_bmi_interaction,age_squared,bmi_normal,bmi_obese,bmi_overweight,bmi_underweight,high_risk
0,-1.440418,-0.453484,-0.909234,16884.92,9.734235,1,0,False,False,False,True,-1.305601,-1.221599,False,False,True,False,0
1,-1.511647,0.513986,-0.079442,1725.55,7.45388,0,1,False,False,True,False,-1.155713,-1.254484,False,True,False,False,0
2,-0.79935,0.382803,1.580143,4449.46,8.400763,0,1,False,False,True,False,-0.551569,-0.845647,False,True,False,False,0
3,-0.443201,-1.306169,-0.909234,21984.47,9.998137,0,1,False,True,False,False,-0.886375,-0.57457,True,False,False,False,0
4,-0.514431,-0.289506,-0.909234,3866.86,8.260457,0,1,False,True,False,False,-0.550037,-0.63234,False,False,True,False,0



Informações do dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1337 non-null   float64
 1   bmi                  1337 non-null   float64
 2   children             1337 non-null   float64
 3   expenses             1337 non-null   float64
 4   log_expenses         1337 non-null   float64
 5   smoker_encoded       1337 non-null   int64  
 6   sex_male             1337 non-null   int64  
 7   region_northeast     1337 non-null   bool   
 8   region_northwest     1337 non-null   bool   
 9   region_southeast     1337 non-null   bool   
 10  region_southwest     1337 non-null   bool   
 11  age_bmi_interaction  1337 non-null   float64
 12  age_squared          1337 non-null   float64
 13  bmi_normal           1337 non-null   bool   
 14  bmi_obese            1337 non-null   bool   
 15  bmi_overweigh

None

## 2. Pipeline de Modelagem para Múltiplos Targets

Vamos definir funções para executar todo o pipeline de modelagem para cada target, garantindo que não haja vazamento de informação entre features e target.

In [3]:
# Regression pipeline: prepare the data, train the models, cross-validate them
# and compute hold-out metrics.
def _regression_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE and R² of `y_pred` against `y_true` as a dict."""
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R²': r2_score(y_true, y_pred),
    }


def run_regression_pipeline(df, target_column, drop_columns, model_names=None,
                            test_size=0.2, random_state=42, cv_folds=5):
    """
    Run the regression pipeline for a single target column.

    Steps: build the feature matrix, split it into train/test, fit a linear
    regression, a random forest and a gradient boosting regressor, run K-fold
    cross-validation on the training split, and score every model on the test
    split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the target and the feature columns.
    target_column : str
        Name of the column to predict. It is always removed from the features.
    drop_columns : list of str, str or None
        Additional columns to remove from the features (e.g. the other target).
        A single column name or None is accepted as well.
    model_names : sequence of str, optional
        Display names replacing the default ones, in the order
        linear regression, random forest, gradient boosting. Must contain
        exactly one unique name per model.
    test_size : float, default 0.2
        Share of the rows held out as the test split.
    random_state : int, default 42
        Seed used for the split, the K-fold shuffling and the tree ensembles.
    cv_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    dict
        Keys: 'X_train', 'X_test', 'y_train', 'y_test', 'models' (fitted),
        'predictions' (test-split predictions per model), 'cv_results'
        (per-model R² / RMSE fold scores with mean and std), 'results_df'
        (test-split MAE, MSE, RMSE, R² per model), 'best_model_name',
        'best_predictions', 'residuals' (y_test minus best predictions) and
        'cv_summary' (DataFrame with the cross-validation means and stds).

    Raises
    ------
    ValueError
        If `model_names` does not hold exactly one unique name per model.
    KeyError
        Propagated from pandas when `target_column` or an entry of
        `drop_columns` is not a column of `df`.

    Notes
    -----
    The "best" model is the one with the highest R² on the test split, so the
    test metrics reported for it were also used to select it.
    """
    # Columns excluded from the features: the requested ones plus the target
    # itself (added only once if the caller already listed it).
    if drop_columns is None:
        excluded = []
    elif isinstance(drop_columns, str):
        excluded = [drop_columns]
    else:
        excluded = list(drop_columns)
    if target_column not in excluded:
        excluded.append(target_column)

    X = df.drop(columns=excluded)
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=-1),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=random_state)
    }
    if model_names:
        names = list(model_names)
        # zip() would silently drop models when too few names are given, and
        # duplicate names would overwrite each other as dict keys.
        if len(names) != len(models) or len(set(names)) != len(names):
            raise ValueError(
                f"model_names must contain {len(models)} unique names, got {names!r}"
            )
        models = dict(zip(names, models.values()))

    # Fit every model on the training split and predict the test split.
    predictions = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions[name] = model.predict(X_test)

    # K-fold cross-validation on the training split only; the seeded KFold
    # object is shared by both scoring runs.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    cv_results = {}
    for name, model in models.items():
        cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')
        # The scorer returns negated RMSE; flip the sign back to a plain RMSE.
        cv_rmse = -cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error')
        cv_results[name] = {
            'R2_scores': cv_scores,
            'R2_mean': cv_scores.mean(),
            'R2_std': cv_scores.std(),
            'RMSE_scores': cv_rmse,
            'RMSE_mean': cv_rmse.mean(),
            'RMSE_std': cv_rmse.std()
        }

    # Test-split metrics, one row per model.
    results = []
    for name in models:
        metrics = _regression_metrics(y_test, predictions[name])
        metrics['Modelo'] = name
        results.append(metrics)
    results_df = pd.DataFrame(results)

    # Best model = highest R² on the test split; residuals are computed for it.
    best_model_idx = results_df['R²'].idxmax()
    best_model_name = results_df.loc[best_model_idx, 'Modelo']
    best_predictions = predictions[best_model_name]
    residuals = y_test - best_predictions

    cv_summary = pd.DataFrame({
        'Modelo': list(cv_results),
        'R² Médio': [stats['R2_mean'] for stats in cv_results.values()],
        'R² Desvio': [stats['R2_std'] for stats in cv_results.values()],
        'RMSE Médio': [stats['RMSE_mean'] for stats in cv_results.values()],
        'RMSE Desvio': [stats['RMSE_std'] for stats in cv_results.values()]
    })

    return {
        'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test,
        'models': models, 'predictions': predictions, 'cv_results': cv_results,
        'results_df': results_df, 'best_model_name': best_model_name,
        'best_predictions': best_predictions, 'residuals': residuals,
        'cv_summary': cv_summary
    }

## 3. Modelagem para ambos os targets

Executaremos o pipeline para:
- Target 1: **expenses** (valor original)
- Target 2: **log_expenses** (transformação logarítmica)

Em cada caso, a coluna do outro target é removida das features.

In [None]:
# Run 1 - original target ('expenses'); the other target column is dropped
# from the features. model_names is left at its default (None).
print("=== Pipeline para target: expenses ===")
pipeline_expenses = run_regression_pipeline(df, 'expenses', drop_columns=['log_expenses'])
print("Concluído para expenses.\n")

# Run 2 - log-transformed target ('log_expenses'); here the original
# 'expenses' column is the one dropped from the features.
print("=== Pipeline para target: log_expenses ===")
pipeline_log = run_regression_pipeline(df, 'log_expenses', drop_columns=['expenses'])
print("Concluído para log_expenses.")

=== Pipeline para target: expenses ===
Concluído para expenses.

=== Pipeline para target: log_expenses ===
Concluído para log_expenses.
