In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global RNG once, up front, so a fresh-kernel "Run All" repeats the same random draws.
# NOTE(review): estimators created later without `random_state` presumably draw from this
# global state, which would make their results depend on cell execution order - confirm.
np.random.seed(42)

In [2]:
# Load the raw data; the path is relative, so the CSV must sit in the notebook's working directory.
dataset = pd.read_csv("insurance.csv")
# Preview the first rows.
dataset.head()

Unnamed: 0,idade,gênero,imc,filhos,fumante,região,encargos
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(1338, 7)

In [4]:
# Column dtypes and non-null counts - a quick check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   idade     1338 non-null   int64  
 1   gênero    1338 non-null   object 
 2   imc       1338 non-null   float64
 3   filhos    1338 non-null   int64  
 4   fumante   1338 non-null   object 
 5   região    1338 non-null   object 
 6   encargos  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Summary statistics of the numeric columns, one row per column.
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
idade,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
imc,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
filhos,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
encargos,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


## Label Encoder

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# New integer-coded column -> source text column.
# The single encoder is refit on every iteration, so once the loop ends it only
# holds the classes of the last column it saw ('região').
colunas_codificadas = {
    'genero_tipo': 'gênero',
    'fumante_tipo': 'fumante',
    'regiao_tipo': 'região',
}
for coluna_nova, coluna_origem in colunas_codificadas.items():
    dataset[coluna_nova] = label_encoder.fit_transform(dataset[coluna_origem])

dataset.head()

Unnamed: 0,idade,gênero,imc,filhos,fumante,região,encargos,genero_tipo,fumante_tipo,regiao_tipo
0,19,female,27.9,0,yes,southwest,16884.924,0,1,3
1,18,male,33.77,1,no,southeast,1725.5523,1,0,2
2,28,male,33.0,3,no,southeast,4449.462,1,0,2
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1


In [7]:
# Drop the original text columns; their integer-coded *_tipo counterparts stay in the frame.
colunas_texto = ["fumante", "região", "gênero"]
dataset_tratado = dataset.drop(colunas_texto, axis=1).copy()

dataset_tratado.head()

Unnamed: 0,idade,imc,filhos,encargos,genero_tipo,fumante_tipo,regiao_tipo
0,19,27.9,0,16884.924,0,1,3
1,18,33.77,1,1725.5523,1,0,2
2,28,33.0,3,4449.462,1,0,2
3,33,22.705,0,21984.47061,1,0,1
4,32,28.88,0,3866.8552,1,0,1


In [8]:
# Pairwise correlations, then the target's column ranked from most positive to most negative.
corr_matrix = dataset_tratado.corr()
corr_matrix.loc[:, "encargos"].sort_values(ascending=False)

encargos        1.000000
fumante_tipo    0.787251
idade           0.299008
imc             0.198341
filhos          0.067998
genero_tipo     0.057292
regiao_tipo    -0.006208
Name: encargos, dtype: float64

## Preparando os dados para colocar no algoritmo

In [9]:
# Features: every column except the target.
X = dataset_tratado.drop(columns=["encargos"])
# Target: an independent copy of the charges column.
y = dataset_tratado["encargos"].copy()

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Peek at the training features (the index shows the original row labels after shuffling).
X_train.head()

Unnamed: 0,idade,imc,filhos,genero_tipo,fumante_tipo,regiao_tipo
560,46,19.95,2,0,0,1
1285,47,24.32,0,0,0,0
1142,52,24.86,0,0,0,2
969,39,34.32,5,0,0,2
486,54,21.47,3,0,0,1


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score

# Baseline: fit each regressor on the training split and score it on the held-out split.
# A single loop replaces three copy-pasted stanzas, so every model is scored by exactly
# the same code and a fix to one metric cannot be forgotten for another model.
model_results = []

# The tree-based models are created without `random_state`.
# NOTE(review): their scores presumably depend on the global NumPy RNG state at the time
# this cell runs - pass `random_state=...` if order-independent results are needed.
for modelo in (LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()):
    modelo.fit(X_train, y_train)
    predictions = modelo.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    # MAPE by hand: mean of |error| / |actual|, expressed as a percentage.
    errors = np.abs(y_test - predictions)
    relative_errors = errors / np.abs(y_test)
    mape = np.mean(relative_errors) * 100

    # Built-in round() accepts both NumPy scalars and plain Python floats; the
    # `.round()` method exists only on NumPy scalars and raises AttributeError
    # when a metric returns a built-in float.
    model_results.append({
        'model': modelo.__class__.__name__,
        'mae': round(mae, 2),
        'mse': round(mse, 2),
        'rmse': round(rmse, 2),
        'r2': round(r2, 4),
        'mape': round(mape, 2)
    })

# One row per model, in the order they were fitted.
model_results_df = pd.DataFrame(model_results)
model_results_df.head()

Unnamed: 0,model,mae,mse,rmse,r2,mape
0,LinearRegression,4186.51,33635210.43,5799.59,0.7833,47.09
1,DecisionTreeRegressor,3153.78,49002087.99,7000.15,0.6844,34.94
2,RandomForestRegressor,2533.31,21021760.31,4584.95,0.8646,30.27


***Estratégia 1: Observaremos o comportamento dos modelos retirando-se os 25% da base cujo encargo é maior que o percentil 75 (terceiro quartil)***

In [13]:
def recuperar_valor_quartil(data, quartil):
    """
    Return the value at the requested percentile rank of `data`.

    Thin wrapper around ``np.percentile``; both arguments are forwarded unchanged.

    Parameters:
        data (array-like): Values to summarise.
        quartil (int | float): Percentile rank on a 0-100 scale
            (25, 50 and 75 give the three quartiles).

    Returns:
        The result of ``np.percentile(data, quartil)``.
    """
    valor_no_percentil = np.percentile(a=data, q=quartil)
    return valor_no_percentil

# Third quartile of the target: rows above it are discarded for this strategy.
Q3 = recuperar_valor_quartil(dataset_tratado['encargos'], 75)

# Filter and renumber in one chain; the result is a new frame and `dataset_tratado` is untouched.
# The filter runs before the split, so both train and test exclude the high-charge rows.
dataset_tratado_ajustado = (
    dataset_tratado
    .loc[dataset_tratado['encargos'] <= Q3]
    .reset_index(drop=True)
)

# Features: every column except the target. Target: an independent copy of the charges column.
X = dataset_tratado_ajustado.drop(columns=["encargos"])
y = dataset_tratado_ajustado["encargos"].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Summary statistics of the training features after the Q3 filter.
X_train.describe()

Unnamed: 0,idade,imc,filhos,genero_tipo,fumante_tipo,regiao_tipo
count,802.0,802.0,802.0,802.0,802.0,802.0
mean,38.754364,30.483635,1.053616,0.492519,0.016209,1.537406
std,14.13077,6.093927,1.212036,0.500256,0.126359,1.119221
min,18.0,15.96,0.0,0.0,0.0,0.0
25%,26.0,26.03,0.0,0.0,0.0,1.0
50%,39.0,30.2,1.0,0.0,0.0,2.0
75%,51.0,34.21,2.0,1.0,0.0,3.0
max,64.0,53.13,5.0,1.0,1.0,3.0


In [15]:
# Summary statistics of the test features after the Q3 filter, for comparison with the training split.
X_test.describe()

Unnamed: 0,idade,imc,filhos,genero_tipo,fumante_tipo,regiao_tipo
count,201.0,201.0,201.0,201.0,201.0,201.0
mean,39.378109,30.305348,1.119403,0.462687,0.029851,1.472637
std,14.092775,6.473094,1.24325,0.499851,0.1706,1.109278
min,18.0,16.815,0.0,0.0,0.0,0.0
25%,27.0,25.745,0.0,0.0,0.0,1.0
50%,38.0,30.02,1.0,0.0,0.0,1.0
75%,52.0,34.58,2.0,1.0,0.0,2.0
max,64.0,48.07,5.0,1.0,1.0,3.0


In [16]:
# Strategy 1: refit and rescore the same three regressors on the current train/test split.
# A single loop replaces three copy-pasted stanzas, so every model is scored by exactly
# the same code.
model_results = []

# The tree-based models are created without `random_state`.
# NOTE(review): their scores presumably depend on the global NumPy RNG state at the time
# this cell runs - pass `random_state=...` if order-independent results are needed.
for modelo in (LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()):
    modelo.fit(X_train, y_train)
    predictions = modelo.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    # MAPE by hand: mean of |error| / |actual|, expressed as a percentage.
    errors = np.abs(y_test - predictions)
    relative_errors = errors / np.abs(y_test)
    mape = np.mean(relative_errors) * 100

    # Built-in round() accepts both NumPy scalars and plain Python floats; the
    # `.round()` method exists only on NumPy scalars and raises AttributeError
    # when a metric returns a built-in float.
    model_results.append({
        'model': modelo.__class__.__name__,
        'mae': round(mae, 2),
        'mse': round(mse, 2),
        'rmse': round(rmse, 2),
        'r2': round(r2, 4),
        'mape': round(mape, 2)
    })

# One row per model, in the order they were fitted.
model_results_df = pd.DataFrame(model_results)
model_results_df.head()

***Estratégia 2: Utilizaremos upsampling para aumentar a nossa amostra original e verificar o comportamento dos modelos***

In [None]:
# Reference statistics of the treated dataset, one row per column.
dataset_tratado.describe().transpose()

In [None]:
# Draw 100,000 rows with replacement from the treated dataset; the result gets a fresh 0..n-1 index.
upsample_dataset_tratado = dataset_tratado.sample(
    n=100_000,
    replace=True,
    random_state=42,
    ignore_index=True,
)

# Same summary as for the source frame, to compare the resampled distribution against it.
upsample_dataset_tratado.describe().transpose()

In [None]:
# Features / target of the full upsampled frame (names kept so later cells that use them still work).
X = upsample_dataset_tratado.drop("encargos", axis=1)
y = upsample_dataset_tratado["encargos"].copy()

# The upsampled frame was drawn WITH replacement, so splitting it directly would place
# copies of the same original row in both train and test. The test score would then
# reward memorisation instead of generalisation (data leakage).
# Leak-free alternative: split the ORIGINAL rows first, upsample only the training part,
# and evaluate on held-out original rows the models have never seen.
base_train, base_test = train_test_split(dataset_tratado, test_size=0.2, random_state=42)

# Same training size an 80/20 split of the upsampled frame would have produced.
n_train_upsampled = round(len(upsample_dataset_tratado) * 0.8)
train_upsampled = base_train.sample(
    n=n_train_upsampled,
    replace=True,
    random_state=42,
    ignore_index=True,
)

X_train = train_upsampled.drop("encargos", axis=1)
y_train = train_upsampled["encargos"].copy()
X_test = base_test.drop("encargos", axis=1)
y_test = base_test["encargos"].copy()

In [None]:
# Strategy 2: refit and rescore the same three regressors on the current train/test split.
# A single loop replaces three copy-pasted stanzas, so every model is scored by exactly
# the same code.
model_results = []

# The tree-based models are created without `random_state`.
# NOTE(review): their scores presumably depend on the global NumPy RNG state at the time
# this cell runs - pass `random_state=...` if order-independent results are needed.
for modelo in (LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()):
    modelo.fit(X_train, y_train)
    predictions = modelo.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    # MAPE by hand: mean of |error| / |actual|, expressed as a percentage.
    errors = np.abs(y_test - predictions)
    relative_errors = errors / np.abs(y_test)
    mape = np.mean(relative_errors) * 100

    # Built-in round() accepts both NumPy scalars and plain Python floats; the
    # `.round()` method exists only on NumPy scalars and raises AttributeError
    # when a metric returns a built-in float.
    model_results.append({
        'model': modelo.__class__.__name__,
        'mae': round(mae, 2),
        'mse': round(mse, 2),
        'rmse': round(rmse, 2),
        'r2': round(r2, 4),
        'mape': round(mape, 2)
    })

# One row per model, in the order they were fitted.
model_results_df = pd.DataFrame(model_results)
model_results_df.head()