# Context
Medical Cost Insurance. Can you accurately predict insurance costs?

In [None]:
#Import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
# Load the insurance dataset into a DataFrame.
data = pd.read_csv("/kaggle/input/insurance/insurance.csv")

In [None]:
# Preview the first five rows (5 is the default for head()).
data.head()

In [None]:
# (number of rows, number of columns) of the dataset.
data.shape

In [None]:
# Column names, dtypes and non-null counts.
data.info()

**We have 1338 total rows and 7 columns.**
3 categorical:

*   sex
*   smoker
*   region

4 numerical:

*   age
*   bmi
*   children
*   charges





In [None]:
# Summary statistics for every column (numeric and categorical), one row per column.
summary = data.describe(include="all")
summary.transpose()

# Missing values




In [None]:
# Count missing values per column.
data.isna().sum()

---
**No missing values!**


In [None]:
# Show rows that are exact duplicates of an earlier row.
data[data.duplicated()]

# Pairplot

In [None]:
# Pairwise relationships between the columns of the dataset.
sns.pairplot(data)

In [None]:
# Age against charges.
sns.scatterplot(
    data=data,
    x="age",
    y="charges",
    color="b",
).set(title='AGE vs CHARGES')

In [None]:
# BMI against charges.
sns.scatterplot(
    data=data,
    x="bmi",
    y="charges",
    color="b",
).set(title='BMI vs CHARGES')

# Outliers

In [None]:
# Keep only the rows whose charges do not exceed 50,000.
data = data.query("charges <= 50000")

In [None]:
# Age against charges, using the current (filtered) data.
sns.scatterplot(
    data=data,
    x="age",
    y="charges",
    color="g",
).set(title='AGE vs CHARGES')

In [None]:
# BMI against charges, using the current (filtered) data.
sns.scatterplot(
    data=data,
    x="bmi",
    y="charges",
    color="g",
).set(title='BMI vs CHARGES')

# Encoding
One-hot encoding

In [None]:
# Sex: append the dummy column(s) (first level dropped) and remove the original column.
sexdummy = pd.get_dummies(data["sex"], prefix="sex", drop_first=True)
data = pd.concat([data, sexdummy], axis=1).drop(columns=["sex"])

In [None]:
# Smoker: append the dummy column(s) (first level dropped) and remove the original column.
smodummy = pd.get_dummies(data["smoker"], prefix="Smoker", drop_first=True)
data = pd.concat([data, smodummy], axis=1).drop(columns=["smoker"])

In [None]:
# Region: append the dummy columns (first level dropped) and remove the original column.
regdummy = pd.get_dummies(data["region"], prefix="region", drop_first=True)
data = pd.concat([data, regdummy], axis=1).drop(columns=["region"])

In [None]:
# Preview the first rows of the frame after encoding.
data.head()

# Correlation

In [None]:
# Correlation of every column with the target column.
data.corr().loc[:, "charges"]

In [None]:
# Enlarge the default figure size, then draw the annotated correlation matrix.
sns.set(rc={"figure.figsize": (15, 12)})
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)

**Observations:**

* There is a strong relationship between the variables Smoker and Age with our target


# EDA

In [None]:
# Target distribution: density histogram with a KDE overlay, mean marked in red.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(
    data.charges.values,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color="violet",
)
mean = data.charges.mean()
plt.axvline(mean, 0, 1, color="red")

Most people have low charges.

In [None]:
# Target distribution for smokers (left) and non-smokers (right).
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
# Each plot is bound to its subplot explicitly through `ax=`.
f = plt.figure(figsize=(12, 5))

ax = f.add_subplot(121)
sns.histplot(
    data[data.Smoker_yes == 1]["charges"],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color="red",
    ax=ax,
)
ax.set_title('Distribution of charges for smokers')

ax = f.add_subplot(122)
sns.histplot(
    data[data.Smoker_yes == 0]["charges"],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    ax=ax,
)
ax.set_title('Distribution of charges for not-smokers')

Smokers tend to have much higher charges (> 30,000), and their distribution is bimodal: many smokers have either relatively low charges (like everyone else) or very high ones.
Non-smokers have much lower charges.

In [None]:
# Smokers vs. non-smokers, split by sex.
sns.catplot(
    data=data,
    x="Smoker_yes",
    hue='sex_male',
    kind="count",
    palette="magma",
).set(title="SMOKERS")
print(data["Smoker_yes"].value_counts())
# Number of female smokers.
len(data[(data["sex_male"] == 0) & (data["Smoker_yes"] == 1)])

We can see that the majority are non-smokers, and that 58% of the smokers are men.
Given the impact of smoking, we can assume that the cost of treatment will be higher for men than for women.

In [None]:
# Charges by sex, split by smoking status.
sns.catplot(
    data=data,
    x="sex_male",
    y="charges",
    hue="Smoker_yes",
    kind="violin",
    palette='magma',
).set(title="Sex vs. Smoke, and how it affects the cost")

It is seen again that smokers cover much larger cost ranges

In [None]:
# Charges of women (sex_male == 0) by smoking status.
females = data[data.sex_male == 0]
sns.boxplot(
    data=females,
    x="charges",
    y="Smoker_yes",
    orient="h",
    palette="autumn",
).set(title="FEMALES")

In [None]:
# Charges of men (sex_male == 1) by smoking status.
males = data[data.sex_male == 1]
sns.boxplot(
    data=males,
    x="charges",
    y="Smoker_yes",
    orient="h",
    palette="cool",
).set(title="MALES")

In [None]:
# Age distribution: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
ax = sns.histplot(
    data["age"],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color='orange',
)

A lot of young people

In [None]:
# Smoking status of 18-year-olds, split by sex.
eighteen = data[data.age == 18]
sns.catplot(
    data=eighteen,
    x="Smoker_yes",
    hue="sex_male",
    kind="count",
).set(title="Do young people smoke?")

Yes, they do. Let's see how it affects their charges.

In [None]:
# Charges of 18-year-olds by smoking status.
sns.boxplot(
    data=data[data.age == 18],
    x="charges",
    y="Smoker_yes",
    orient="h",
    palette="pink",
).set(title="Smoking at 18yrs affects the cost of treatment")

In non-smokers, we can see outliers. What 18-year-old non-smoker would want to pay for expensive treatment? Maybe someone with a complicated illness or an accident.

In [None]:
# Age against charges, smokers only.
sns.scatterplot(
    data=data[data.Smoker_yes == 1],
    x="age",
    y="charges",
    color="r",
).set(title="How age of smokers affect the cost?")

As expected, the cost increases with age.
We can see two separate point clouds, so there must be another variable that separates them ...


Let's investigate BMI for a moment

In [None]:
# BMI distribution: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
ax = sns.histplot(
    data["bmi"],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color='brown',
)

First we need to understand BMI: obesity begins at a BMI of 30.
So we can compare the treatment costs of patients with BMI < 30 and BMI >= 30.

In [None]:
# Charges of people with BMI >= 30.
obese_charges = data.loc[data.bmi >= 30, 'charges']
sns.displot(obese_charges, color="black").set(title="Costs of obese people")

In [None]:
# Charges of everyone else (BMI < 30).
rest_charges = data.loc[data.bmi < 30, 'charges']
sns.displot(rest_charges, color='grey').set(title="Costs of people BMI<30")

Obese people spend much more on treatment!

In [None]:
# Number of people with BMI >= 30, by sex.
sns.catplot(
    data=data[data.bmi >= 30],
    x="sex_male",
    kind="count",
    palette="cividis",
).set(title="Obese sexs")

Now, Smoke and Obesity relationship...

In [None]:
# Linear fit of charges on BMI, one line per smoking status.
sns.lmplot(
    data=data,
    x='bmi',
    y='charges',
    hue='Smoker_yes',
    palette='Greys',
).set(title="Smoke and BMI vs Cost")

Smokers with a high BMI pay for much more expensive treatments.

Now, returning to the relationship of smokers and age, what if we consider the BMI feature?

In [None]:
# Age against charges for smokers; marker colour and size both encode BMI.
sns.scatterplot(
    data=data[data.Smoker_yes == 1],
    x='age',
    y='charges',
    hue="bmi",
    size="bmi",
    sizes=(20, 300),
    palette='bone',
).set(title="How age of smokers affect the cost?")

There is! High BMI smokers spend much more than low BMI smokers as they get older

Now let's look at the children

In [None]:
# Number of patients by number of children.
# `height` replaces the deprecated `size` argument of catplot.
sns.catplot(x="children", kind="count", palette="autumn", data=data, height=6)

Most of the patients do not have children.

In [None]:
# Smoking status of people with at least one child, split by sex.
# `height` replaces the deprecated `size` argument of catplot.
sns.catplot(
    x="Smoker_yes",
    kind="count",
    palette="hot",
    hue="sex_male",
    data=data[(data.children > 0)],
    height=6,
).set(title="Do people who have children smoke?")

Yes, but not so many

In [None]:
# Number of children among people with BMI >= 30, split by sex.
# `height` replaces the deprecated `size` argument of catplot.
sns.catplot(
    x="children",
    kind="count",
    palette="hot",
    hue="sex_male",
    data=data[(data.bmi >= 30)],
    height=6,
).set(title="What's about obesity?")

# Model

In [None]:
from sklearn.model_selection import train_test_split

# Seed value for random number generation (presumably meant for reproducible splits/models).
seed=14


from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Candidate regressors, mostly with default hyper-parameters.
# Estimators with a stochastic component receive the notebook-level `seed`
# so that scores are reproducible across runs.
lr = LinearRegression(n_jobs=-1)
lasso = Lasso()
ridge = Ridge()
dt = DecisionTreeRegressor(random_state=seed)
svr = SVR()
knn = KNeighborsRegressor(n_jobs=-1)
rf = RandomForestRegressor(random_state=seed)
ab = AdaBoostRegressor(random_state=seed)
gb = GradientBoostingRegressor(random_state=seed)
xgb = XGBRegressor(n_jobs=-1, random_state=seed)
lgb = LGBMRegressor(n_jobs=-1, random_state=seed)

In [None]:
# Separate features from the target and hold out 20% of the rows for testing.
# The notebook-level `seed` is passed so the split is identical on every run.
X = data.drop(columns=["charges"])
Y = data["charges"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)
# Preview the feature matrix instead of dumping the whole frame.
X.head()

In [None]:
from seaborn.external.husl import rgb_prepare
def entrenar(model):
    """Fit `model` on the training split and score it on both splits.

    Uses the notebook-level X_train / y_train / X_test / y_test.
    For regressors, `score` returns the coefficient of determination (R^2) by default.

    Returns:
        tuple: (score on the training data, score on the test data).
    """
    model.fit(X_train, y_train)
    train_r2 = model.score(X_train, y_train)
    test_r2 = model.score(X_test, y_test)
    return (train_r2, test_r2)

# Name -> estimator mapping for every model to evaluate.
# 'Lasso' now points at the Lasso estimator (it was mapped to `knn`, so Lasso
# was never evaluated and KNN was reported twice); the 'Ridge' label is spelled correctly.
models = {'LinearRegression': lr,
          'Lasso': lasso,
          'Ridge': ridge,
          'DecisionTreeRegressor': dt,
          'SVR': svr,
          'KNN': knn,
          'RandomForestRegressor': rf,
          'AdaBoostRegressor': ab,
          'XGB': xgb,
          'GradientBoostingRegressor': gb,
          'LGBMRegressor': lgb}
scores = {}
train_scores = {}
for nombre, modelo in models.items():
    # Fit each model once, so the train and test scores describe the same fit.
    train_score, test_score = entrenar(modelo)
    scores[nombre] = test_score
    train_scores[nombre] = train_score

# One row per model, sorted by ascending test score.
scores = pd.DataFrame(scores, index=['Score']).transpose()
scores = scores.sort_values('Score')

# One row per model, sorted by ascending training score.
train_scores = pd.DataFrame(train_scores, index=['Train Score']).transpose()
train_scores = train_scores.sort_values('Train Score')

In [None]:
# Colour-code the test scores.
# The Styler is displayed directly instead of being assigned back to `scores`:
# `scores` stays a DataFrame, so this cell can be re-run without failing on `.style`.
cm = sns.color_palette('flare', as_cmap=True)
scores.style.background_gradient(cmap=cm)

In [None]:
# Display the training scores with a colour gradient, using the `cm` colormap.
train_scores.style.background_gradient(cmap=cm)

Be careful with overfitting: compare the training scores with the test scores above!

In [None]:
# Inspect the hyper-parameters of the gradient boosting model.
gb.get_params()