In [None]:
!pip install comet_ml

In [None]:
# import comet_ml at the top of your file
# NOTE(review): presumably it must precede the ML library imports below so
# Comet can hook into them — confirm against the Comet docs.
from comet_ml import Experiment

# Read the Comet API key from the Kaggle secret store instead of hard-coding it.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
comet_api_key = user_secrets.get_secret("comet_api_key")


In [None]:
# List the installed packages and keep only the line(s) mentioning seaborn.
! pip freeze | grep seaborn

In [None]:
! pip install -U seaborn

In [None]:
import seaborn as sns
# Show the seaborn version the kernel actually imported.
sns.__version__

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection  import train_test_split

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
INPUT_ROOT = '/kaggle/input'
for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


In [None]:
# Load the insurance dataset and preview its first rows.
INSURANCE_CSV = '/kaggle/input/insurance/insurance.csv'
data = pd.read_csv(INSURANCE_CSV)
data.head()

In [None]:
# (rows, columns) of the loaded frame
data.shape

# EDA

In [None]:
# Number of missing values in each column.
data.isna().sum()

### Insight: no missing values

In [None]:
# Summary statistics of the numeric columns.
data.describe()

### Insights:
- charges seems to be right-skewed (median < mean)
- age, bmi and children seem to be roughly normally distributed (summary statistics alone cannot confirm this; check the plots below)

In [None]:
# How many rows fall into each category of `sex`.
data['sex'].value_counts()

In [None]:
# How many rows fall into each category of `smoker`.
data['smoker'].value_counts()

In [None]:
# How many rows fall into each category of `region`.
data['region'].value_counts()

### Insight:
- smoker is unbalanced: there are more non-smokers than smokers
- sex and region seem to be balanced

## relationship with target variable

In [None]:
# Distribution of the target variable. sns.distplot is deprecated, so this
# uses histplot with a KDE overlay and density normalisation, which draws the
# same kind of figure (histogram plus smoothed density curve).
sns.histplot(data=data, x='charges', kde=True, stat='density')

In [None]:
# Stacked, filled KDE of charges on a grid with one row per sex and one
# column per region, coloured by smoker status.
facet_kwargs = dict(row='sex', col='region', hue='smoker')
sns.displot(data, x='charges', kind='kde', multiple='stack', fill=True, **facet_kwargs)

Learn more about distribution plots [here](https://seaborn.pydata.org/tutorial/distributions.html).

In [None]:
# Mean charges per category of `var`, the difference between consecutive
# group means, and a violin plot of charges for each category.
var = 'sex'
mean_data = data.groupby(var)['charges'].mean()
print(mean_data, mean_data.diff(), sep='\n')
ax = sns.violinplot(x=var, y='charges', data=data)
ax.set_title(f'Distribution of target against {var}')
plt.show()

In [None]:
# Mean charges per category of `var`, the difference between consecutive
# group means, and a violin plot of charges for each category.
var = 'smoker'
mean_data = data.groupby(var)['charges'].mean()
print(mean_data, mean_data.diff(), sep='\n')
ax = sns.violinplot(x=var, y='charges', data=data)
ax.set_title(f'Distribution of target against {var}')
plt.show()

In [None]:
# Mean charges per category of `var` and a violin plot of charges for each
# category (differences between consecutive means are not printed here).
var = 'region'
mean_data = data.groupby(var)['charges'].mean()
print(mean_data)
ax = sns.violinplot(x=var, y='charges', data=data)
ax.set_title(f'Distribution of target against {var}')
plt.show()

### Insight:
- sex and region do not seem to have much impact on the target
- smoker does seem to have a huge impact

In [None]:
# Pairwise relationships between the numeric columns, coloured by smoker status.
sns.pairplot(data=data, hue='smoker')

## Hypothesis



We have already visualized the relationship of the variables to the charges. Now we will further investigate by looking at the relationships using multiple linear regression. Remember that the aim of this section is to quantify the relationship and not to create the prediction model. Let us first create a training and testing data set to proceed.

Based on the visualization, we can make several hypotheses about the relationships:

1. There is no real difference in charges between genders or regions.

2. Charges for smokers are much higher than for non-smokers.

3. Charges get higher as the individual gets older.

4. Charges get higher once the individual's BMI exceeds 30.

5. Lastly, charges are higher for those who have fewer children.



# Preprocessing

In [None]:
# categoricals to numerical
#
# One-hot encode the three categorical columns, dropping the first level of
# each so the dummies are not perfectly collinear with the intercept.
# dtype=int keeps the dummies numeric: recent pandas versions default to bool,
# and a frame mixing bool and numeric columns is cast to object dtype and
# rejected by the statsmodels OLS fit later in this notebook.
# NOTE: `data` is overwritten here, so this cell cannot be re-run without
# reloading the CSV first.
categorical_cols = ['sex', 'smoker', 'region']
data = pd.get_dummies(
    data,
    columns=categorical_cols,
    prefix=categorical_cols,
    drop_first=True,
    dtype=int,
)
data.head()

In [None]:
# split train-test
# Features are every column except the target; 30% of the rows are held out
# for testing, with a fixed seed so the split is reproducible.
TARGET = 'charges'
X = data.drop(TARGET, axis=1)
y = data[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# Quantify effects

In [None]:
import statsmodels.api as sm
from scipy import stats

# Fit ordinary least squares on the training split to read off coefficients
# and p-values. The constant column is added explicitly so the model has an
# intercept term.
X_train_const = sm.add_constant(X_train)
linearModel = sm.OLS(endog=y_train, exog=X_train_const)
linear = linearModel.fit()
print(linear.summary())


1. There is no real difference in charges between gender (p-value 0.907) or regions (p-value 0.342, 0.093, 0.173).
    * since all the p-values > 0.05, these variables are not statistically significant predictors of the target variable

  
2. Charges for smokers are much higher than for non-smokers (p-value 0.000)
    * since p-value < 0.05 this variable is statistically significant 
  
  
3. The charge gets higher as the individual gets older (p-value 0.000).
    * since p-value < 0.05 this variable is statistically significant
  
  
4. The charge gets higher once the individual's BMI exceeds 30 (p-value 0.000).
    * since p-value < 0.05 this variable is statistically significant
  
  
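5. Lastly, the charge is higher for those who have fewer children (p-value 0.005).
    * since p-value < 0.05 this variable is statistically significant, meaning there is evidence that charges differ with the number of children. The p-value alone does not give the direction of the effect: check the sign of the `children` coefficient in the summary above to confirm whether fewer children really goes with higher charges.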
      
      



# Build model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

See `Pipeline` and `GridSearchCV` examples [here](https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py) and [here](https://www.kaggle.com/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv).

In [None]:
# Both pipeline steps start as 'passthrough' placeholders; the grid search
# substitutes every scaler/regressor combination listed below.
scalers = [StandardScaler(), MinMaxScaler()]
regressors = [LinearRegression(), Ridge(), Lasso(), ElasticNet()]

pipeline = Pipeline(steps=[('scaling', 'passthrough'), ('model', 'passthrough')])
param_grid = dict(scaling=scalers, model=regressors)

# 5-fold cross-validated search scored by R^2, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search over all scaler/model combinations on the training split.
grid.fit(X_train, y_train)

In [None]:
# Pipeline (scaler + model) selected by the search.
grid.best_estimator_

In [None]:
# Score of the selected pipeline on the held-out test split
# (the search was configured with scoring='r2').
grid.score(X_test, y_test)

## track experiments on comet_ml 
learn more [here](https://www.comet.ml/docs/python-sdk/scikit/)

In [None]:

# Log every hyper-parameter combination tried by the grid search as its own
# Comet experiment: the `params` entry becomes the experiment's parameters
# and the remaining result columns (scores, timings, rank) become metrics.
cv_results = grid.cv_results_
for i, params in enumerate(cv_results['params']):
    exp = Experiment(workspace="maksteel",
        project_name="saturday-codealong-medical-insurance-costs-predict",
        api_key=comet_api_key)
    try:
        exp.log_parameters(params)
        for k, v in cv_results.items():
            # `params` is logged above. The per-parameter `param_*` columns
            # repeat the same values (here estimator objects, not numbers),
            # so they are not sent as metrics.
            if k == "params" or k.startswith("param_"):
                continue
            exp.log_metric(k, v[i])
    finally:
        # Close the experiment even if logging fails, so it is not left open.
        exp.end()

See the experiment [here](https://www.comet.ml/maksteel/saturday-codealong-medical-insurance-costs-predict/view/5hAlaM4RsxzDY632TAi7rIH9D).