In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import scipy as sc
import warnings 
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Notebook-wide settings: ggplot look for figures, and silence all warnings.
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [None]:
# Load the insurance CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/insurance/insurance.csv')

In [None]:
# Print column names, dtypes and non-null counts.
data.info()

# *Check for missing or duplicate data*

In [None]:
# Number of missing values in each column.
data.isna().sum()

*Let's remember that missing data is not a problem for xgboost.*

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Drop repeated rows (first occurrence is kept); the original index labels are preserved.
data = data.drop_duplicates()

In [None]:
# Re-count duplicates to confirm none remain after the drop.
data.duplicated().sum()

# **LabelEncoder**

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One encoder instance, re-fitted on each categorical column in the following cells;
# after every fit it only remembers the classes of the most recent column.
encoder = LabelEncoder()

In [None]:
# Swap the smoker labels for integer codes, then print the learned labels
# (a label's position in `classes_` is the integer it was mapped to).
smoker_codes = encoder.fit_transform(data['smoker'])
data['smoker'] = smoker_codes
print(encoder.classes_)

In [None]:
# Swap the region labels for integer codes, then print the learned labels
# (a label's position in `classes_` is the integer it was mapped to).
region_codes = encoder.fit_transform(data['region'])
data['region'] = region_codes
print(encoder.classes_)

In [None]:
# Swap the sex labels for integer codes, then print the learned labels
# (a label's position in `classes_` is the integer it was mapped to).
sex_codes = encoder.fit_transform(data['sex'])
data['sex'] = sex_codes
print(encoder.classes_)

In [None]:
# Preview the frame now that smoker, region and sex hold integer codes.
data.head()

# **Correlation between variables**

In [None]:
# Annotated heatmap of the Spearman rank-correlation matrix of all columns.
fig, ax = plt.subplots(figsize=(9, 6), dpi=90)

spearman_matrix = data.corr(method='spearman')
sns.heatmap(spearman_matrix, annot=True, cmap='bwr', cbar=True, ax=ax)
ax.set_title('Corr Data')
plt.show()

***We use Spearman's rank correlation because it is robust to outliers and can also be applied to integer-encoded categorical variables.***

*Some of these correlation values may have arisen by chance, so let's use the pingouin library to check which correlations are statistically significant.*

In [None]:
!pip install pingouin

In [None]:
import pingouin as pg

*r --> correlation value; p-unc --> uncorrected p-value.
If p-unc is greater than 0.05, the correlation is not statistically significant.*

In [None]:
# Pairwise Spearman correlations, keeping the variable pair, r and the p-value.
pairwise = pg.pairwise_corr(data, method='spearman')
p_corr = pairwise[['X', 'Y', 'r', 'p-unc']]

# Retain only the pairs that are significant at the 5 % level.
is_significant = p_corr['p-unc'] <= 0.05
p_corr = p_corr[is_significant]

# Display the significant pairs whose Y variable is the target.
p_corr[p_corr['Y'] == 'charges']

> *We have these four extremely important variables to predict the medical cost of patients; however let's verify if the others influence or have any information.*

In [None]:
# Quick look at the encoded frame before plotting.
data.head()

In [None]:
# Distribution of charges for each smoker code, every violin split by sex.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(data=data, x='smoker', y='charges', hue='sex', split=True, ax=ax)
ax.set_title('charges by smoker (split by sex)')
plt.show()

In [None]:
# Distribution of charges for each region code, every violin split by sex.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(data=data, x='region', y='charges', hue='sex', split=True, ax=ax)
ax.set_title('charges by region (split by sex)')
plt.show()

In [None]:
# Distribution of charges for each number of children, every violin split by sex.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(data=data, x='children', y='charges', hue='sex', split=True, ax=ax)
ax.set_title('charges by children (split by sex)')
plt.show()

In [None]:
# Distribution of bmi for each smoker code, every violin split by sex.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(data=data, x='smoker', y='bmi', hue='sex', split=True, ax=ax)
ax.set_title('bmi by smoker (split by sex)')
plt.show()

In [None]:
# Distribution of bmi for each number of children, every violin split by sex.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(data=data, x='children', y='bmi', hue='sex', split=True, ax=ax)
ax.set_title('bmi by children (split by sex)')
plt.show()

In [None]:
# Distribution of bmi for each region code, every violin split by sex.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(data=data, x='region', y='bmi', hue='sex', split=True, ax=ax)
ax.set_title('bmi by region (split by sex)')
plt.show()

> *Apparently the variables sex and region do not contain important information for the prediction and even the number of children (but we will use this for this example)*

# **Data Leakage**

> *Some variables in the data set could be contaminating our data in some way. Take sex, for example: when quoting a medical cost, a person's sex should not matter, only their health condition. The same applies to region: these data come from a single country, so it should not be a problem, but if they came from different countries, region could carry valuable information. Even the number of children is arguably not important, but we will use it for this example.*

In [None]:
# Working frame: the four predictors we keep, with the target (charges) as the last column.
selected_columns = ['age', 'bmi', 'children', 'smoker', 'charges']
dta = data[selected_columns]

dta.head()

**Let's explore our important variables.**

In [None]:
# For each continuous variable, draw a box plot next to a density histogram
# overlaid with a KDE and the best-fitting normal curve.
for col in ['age', 'bmi', 'charges']:
    values = dta[col]
    fig, (box_ax, dist_ax) = plt.subplots(1, 2, figsize=(14, 5), dpi=70)

    # Pass the series as x= so the box is drawn horizontally no matter how the
    # installed seaborn version interprets a positional argument.
    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f"Box: {col}")

    # sns.distplot is deprecated; histplot plus an explicit normal fit replaces it.
    sns.histplot(x=values, stat='density', kde=True, ax=dist_ax)
    mu, sigma = sc.stats.norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    dist_ax.plot(grid, sc.stats.norm.pdf(grid, mu, sigma), color='black')
    dist_ax.set_title(f"Dist: {col}")

    # Render and release each figure inside the loop instead of accumulating them.
    plt.show()
    plt.close(fig)

*Let's check if these have a normal or skewed distribution.*

In [None]:
# Shapiro-Wilk normality test on each continuous variable; prints the test
# statistic and the p-value.
for i in ('age', 'bmi', 'charges'):
    column_values = dta[i]
    sta, pval = sc.stats.shapiro(column_values)
    print(f"{i.upper()}:\nStats: {sta:.3f}\tP-value: {pval:.5f}")

> Significance test to verify if our continuous variables have a normal distribution.

> * If the p-value is greater than 0.05, we cannot reject the hypothesis that the variable follows a normal distribution.



> * If the p-value is less than 0.05, we reject normality: the variable does not follow a normal distribution.

In [None]:
# Preview the reduced frame before the scatter plots.
dta.head()

**Let's explore our continuous variables.**

In [None]:
# Charges against age, coloured by the smoker code.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

ax.scatter(dta['age'], dta['charges'], c=dta['smoker'] + 2, alpha=0.7, s=15)
ax.set_xlabel('Age')
ax.set_ylabel('Charges')  # label previously misspelled as 'Charger'
ax.set_title('smoker')    # the title names the variable used for colouring
plt.show()


In [None]:
# Charges against bmi, coloured by the number of children.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

ax.scatter(dta['bmi'], dta['charges'], c=dta['children'] + 2, alpha=0.7, s=15)
ax.set_xlabel('bmi')
ax.set_ylabel('Charges')  # label previously misspelled as 'Charger'
ax.set_title('children')  # the title names the variable used for colouring
plt.show()

In [None]:
# bmi against age, coloured by the number of children (named in the title).
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

point_colors = dta['children'] + 2
ax.scatter(dta['age'], dta['bmi'], c=point_colors, alpha=0.7, s=15)
ax.set(xlabel='age', ylabel='bmi', title='children')
plt.show()

In [None]:
# bmi against age, coloured by the smoker code (named in the title).
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

point_colors = dta['smoker'] + 2
ax.scatter(dta['age'], dta['bmi'], c=point_colors, alpha=0.7, s=15)
ax.set(xlabel='age', ylabel='bmi', title='smoker')
plt.show()

> As we can see, smoker is the variable that provides the most information.

In [None]:
# Last look at the modelling frame before separating features and target.
dta.head()

In [None]:
# Feature matrix (four predictors) and target vector (the last column of dta).
feature_names = ['age', 'bmi', 'smoker', 'children']
X = dta.loc[:, feature_names]
y = dta.iloc[:, -1]

In [None]:
from sklearn.decomposition import PCA

# **PCA**

***Let's reduce the dimensionality to get an idea of how our data is distributed.***

In [None]:
# Keep a single principal component so it can be plotted against the target.
pca = PCA(n_components=1)

In [None]:
# Fit PCA on the feature matrix and project it; note the features are not scaled beforehand.
dta_pca = pca.fit_transform(X)

In [None]:
# Target against the single PCA component, coloured by the smoker code.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y are given by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=dta_pca[:, 0], y=y, hue=X['smoker'], alpha=0.7, ax=ax)
ax.set_xlabel('Comp PCA')
ax.set_title('PCA')
plt.show()

In [None]:
# Target against the single PCA component, coloured by the number of children.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# x/y are given by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=dta_pca[:, 0], y=y, hue=X['children'], alpha=0.7, ax=ax)
ax.set_xlabel('Comp PCA')
ax.set_title('PCA')
plt.show()

*We had already seen this distribution of our data further above.*

# **XGBoost --> Regressor**

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

***train test split***

In [None]:
# Hold out a test set (default 75/25 split). A fixed random_state makes the
# split, and therefore every metric below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Preview the training features produced by the split.
X_train.head()

*We create a function that prints the MAE (chosen because of the outliers), the model's score, and the cross-validated MAE.*

In [None]:
def score(model, X_, y_, cv):
    """Print evaluation metrics of a fitted regressor on the given data.

    Prints the mean absolute error of ``model.predict(X_)`` against ``y_``,
    the value of ``model.score(X_, y_)``, and the per-fold MAE obtained by
    ``cross_val_score`` on the same ``(X_, y_)`` with ``cv`` folds.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``
    X_ : features to evaluate on
    y_ : true target values for ``X_``
    cv : passed unchanged to ``cross_val_score`` and echoed in the printout

    Returns
    -------
    None
        The metrics are only printed.
    """
    predictions = model.predict(X_)
    mae = mean_absolute_error(y_, predictions)
    sco = model.score(X_, y_)

    # The scorer yields negated MAE values; flip the sign to report positive errors.
    negated_fold_mae = cross_val_score(
        model, X_, y_, cv=cv, scoring='neg_mean_absolute_error'
    )
    cross = -1 * negated_fold_mae

    print(f'MAE: {mae:.3f}\tSCORE: {sco:.3f}\nCross-val-score MAE\t-->\tcv: {cv}\t-->\t{cross}')

In [None]:
# Gradient-boosted regressor with a Tweedie objective; all hyperparameters are
# gathered in one dict so they are easy to find and tune.
xgb_params = dict(
    n_estimators=60,
    max_depth=3,
    learning_rate=0.1,
    colsample_bytree=1,
    base_score=1500,
    objective='reg:tweedie',
    tweedie_variance_power=1.5554,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Report the metrics on the held-out test split.
score(model, X_test, y_test, cv=3)


> If you want to modify the model or improve its accuracy, try tuning the hyperparameters passed to `XGBRegressor` in the cell above.

In [None]:
# Same metrics on the training split the model was fitted on (compare with the test split to spot overfitting).
score(model, X_train, y_train, cv=3)

In [None]:
# Metrics over the full data set with 6 folds; the MAE/SCORE line includes rows the model was trained on.
score(model, X, y, cv=6)

In [None]:
from xgboost import plot_tree,plot_importance

**Feature Importances**

In [None]:
# Feature-importance chart of the fitted model, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(9, 5), dpi=70)
plot_importance(model, ax=ax)

**Plot Trees**

In [None]:
# Draw every tree of the fitted booster, one figure per tree.
# The tree count is read from the booster instead of hard-coding 60, so the
# loop stays correct when n_estimators is changed.
n_trees = len(model.get_booster().get_dump())

for i in range(n_trees):
    fig, ax = plt.subplots(figsize=(9, 5), dpi=220)

    plot_tree(model, ax=ax, num_trees=i)
    ax.set_title(f"tree: {i+1}")

    # Render and release each high-dpi figure before drawing the next one,
    # so the figures do not pile up in memory.
    plt.show()
    plt.close(fig)