In [1]:
# EDA
import pandas as pd
import plotly.express as px
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

### Carregar os dados

In [2]:
# Carregar os dados já tratados
df_costs = pd.read_csv('../datasets/healthcosts_cleaned.csv')

In [3]:
# Mostrar as primeiras linhas
df_costs.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [4]:
# Mostrar as últimas linhas
df_costs.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1333,50,male,30.97,3,0,northwest,10600.5483
1334,18,female,31.92,0,0,northeast,2205.9808
1335,18,female,36.85,0,0,southeast,1629.8335
1336,21,female,25.8,0,0,southwest,2007.945
1337,61,female,29.07,0,1,northwest,29141.3603


In [5]:
# Mostrar a estrutura
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


### Preparação dos dados

In [6]:
# Preparar dados - Separar X e y
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [7]:
# Carregar o preprocessor
import joblib
preprocessor = joblib.load('../preprocessor_dataset_healthcosts.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# Dividir o dataset entre treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [9]:
# Aplicar o preprocessor
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Mostrar os conjuntos
print(f'Dados de Treinamento: {X_train.shape}')
print(f'Dados de Teste: {X_test.shape}')

Dados de Treinamento: (1070, 10)
Dados de Teste: (268, 10)


### Treinamento do Modelo

In [11]:
# Criar o objeto de VotingRegressor

lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

voting_model = VotingRegressor(
  estimators=[
    ('linear regression', lr_model),
    ('elastic', elastic_model),
    ('decision tree', tree_model)
  ]
)

In [12]:
# Treinar o modelo
voting_model.fit(X_train, y_train)

### Análise dos Resultados

In [13]:
# Realizar predição
y_pred = voting_model.predict(X_test)

In [14]:
# Mostrar y_pred
y_pred

array([ 9217.84324756, 37008.214315  , 11374.79944306, 16485.61643585,
       33523.57959374, 10906.60829277, 10064.0759939 , 13477.43213162,
        6694.5196024 , 10752.12413053,  9771.71722238, 11907.44614343,
        9129.69381172,  4375.84816398,  6123.13714659, 12766.66608398,
        5319.70147364,  6695.69784269, 21274.99480695, 23775.14628505,
        8613.78215304,  8299.29664822, 39009.59959457, 12426.10054886,
        6734.49655255, 15296.52943612, 11179.1244833 ,  3443.09667158,
       25436.86646569, 13841.41995046,  4176.2838956 , 25488.93614715,
        5082.43165229,  4973.21922747,  8101.31767983, 17187.85388374,
       11101.76416801,  3225.96291261, 11995.47851548,  8504.09366477,
       10921.75504986,  2528.38099281,  5951.93808847,  3462.8632558 ,
        8573.32174675, 13874.14421414, 13674.59485872, 34620.28987138,
        9023.51974679, 12615.034391  ,  6281.90485212, 31672.17680155,
        8470.61413631, 39976.86061535, 11497.03043224, 28412.58005401,
      

In [15]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [16]:
# Mostrar o Erro e R2 do Modelo
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

RMSE: 6240.904728783782
R2: 0.7760046197139258


In [18]:
# Calcular a importância das variáveis

importances = []

for estimador in voting_model.estimators_:
  # Se é um modelo linear, obtenha coef
  if hasattr(estimador, 'coef_'):
    importances.append(np.abs(estimador.coef_))
  # Se é um modelo de árvore, obtenha feature_importance
  elif hasattr(estimador, 'feature_importances_'):
    importances.append(estimador.feature_importances_)
  else:
    print(f'Não foi possível carregar importância do modelo {type(estimador).__name__}')

In [21]:
importances[2].shape

(10,)

In [22]:
# Calcular a média das importâncias
importancia_media = np.mean(importances, axis=0)

In [23]:
# Normalizar as importâncias
feature_importance = importancia_media / np.sum(importancia_media)

In [24]:
# Obter os nomes das features
feature_names = preprocessor.get_feature_names_out()

In [26]:
# Criar um Dataframe com nomes e imporância das features
importance_df = pd.DataFrame({
  'feature': feature_names,
  'importance': feature_importance
})

In [27]:
# Ordenar o Dataframe pela importância
importance_df = importance_df.sort_values(by='importance', ascending=True)

In [29]:
# Criar um gráfico para mostrar a importância
fig = px.bar(
  importance_df,
  x='importance',
  y='feature',
  title='Importância das features - Voting Regressor',
  orientation='h',
)

fig.show()

In [31]:
# Mostrar evidência do Hard Voting (Média aritmética dos valores dos estimadores)

# Selecionar um registro para predição
X_sample = X_test[7].reshape(1, -1)

In [32]:
# Predições individuais dos estimadores
linear_pred = voting_model.named_estimators_['linear regression'].predict(X_sample)
elastic_pred = voting_model.named_estimators_['elastic'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision tree'].predict(X_sample)

In [33]:
# Predição final do Voting
voting_pred = voting_model.predict(X_sample)

In [34]:
# Média das predições dos estimadores
mean_pred = np.mean([linear_pred[0], elastic_pred[0], tree_pred[0]])

In [35]:
# Exibir os resultados
print(f'Predição da Regressão Linear: {linear_pred[0]}')
print(f'Predição do ElasticNet: {elastic_pred[0]}')
print(f'Predição da Árvore de Decisão: {tree_pred[0]}')

print(f'Média das Predições: {mean_pred}')
print(f'Predição final do Hard Voting: {voting_pred[0]}')

Predição da Regressão Linear: 14793.194288532937
Predição do ElasticNet: 13782.690606341423
Predição da Árvore de Decisão: 11856.4115
Média das Predições: 13477.432131624788
Predição final do Hard Voting: 13477.432131624788
