In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance dataset from the read-only Kaggle input directory.
data = pd.read_csv('/kaggle/input/insurance/insurance.csv')

In [None]:
!pip install comet_ml

In [None]:
# import comet_ml at the top of your file
# (here it is imported before the scikit-learn imports further down the notebook)
from comet_ml import Experiment

# Read the Comet API key from Kaggle's secret store (stored under the label
# "comet-label") so the key is never hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
comet_api_key = user_secrets.get_secret("comet-label")

In [None]:
# Preview the first rows to check that the CSV parsed into the expected columns.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Shell pipeline: list the installed packages and keep only the seaborn line.
! pip freeze | grep seaborn

# Exploratory Analysis

In [None]:
pip install -U seaborn


In [None]:
import seaborn as sns
sns.__version__

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split

EDA¶

In [None]:
# Number of missing values in each column.
data.isna().sum()

Insight: the dataset has no missing values.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

## Insights
* Charges appear right-skewed: the median (about 9382) is below the mean.
* Age seems to be normally distributed.
* The other numeric variables (BMI, children) also appear to be normally distributed.

Categorical variables

In [None]:
# How many rows fall into each category of `sex`.
data['sex'].value_counts()

In [None]:
# How many rows fall into each category of `region`.
data['region'].value_counts()

In [None]:
# How many rows fall into each category of `smoker`.
data['smoker'].value_counts()

Insights
* `sex` and `region` are balanced.
* `smoker` is imbalanced.

See relationships with target variable

In [None]:
# Distribution of the target. sns.distplot is deprecated (since seaborn 0.11) and
# the notebook upgrades seaborn to the latest release, so use the documented
# replacement: a density-scaled histogram with a KDE overlay.
sns.histplot(data.charges, kde=True, stat='density')

In [None]:
# Stacked, filled KDEs of charges: one facet per (sex, region) pair,
# with smokers and non-smokers separated by colour inside each facet.
sns.displot(
    data=data,
    x='charges',
    kind='kde',
    fill=True,
    multiple='stack',
    hue='smoker',
    row='sex',
    col='region',
)

Learn more about distribution plots in the seaborn tutorial:
https://seaborn.pydata.org/tutorial/distributions.html

In [None]:
# Side-by-side distributions of charges for smokers and non-smokers.
# sns.distplot is deprecated (since seaborn 0.11), so each panel uses the
# documented replacement: histplot with a KDE overlay on a density scale.
f, axes = plt.subplots(1, 2, figsize=(12, 5))

# (value of `smoker`, plot colour, label used in the panel title)
panels = [("yes", 'c', "smokers"), ("no", 'b', "non-smokers")]
for ax, (smoker_flag, colour, label) in zip(axes, panels):
    sns.histplot(data.loc[data.smoker == smoker_flag, "charges"],
                 kde=True, stat='density', color=colour, ax=ax)
    ax.set_title(f'Distribution of charges for {label}')




In [None]:
# Violin plots

In [None]:
# Mean charges per category of `var`, the difference between consecutive group
# means, and a violin plot of the charges distribution per category.
var = 'sex'
mean_data = data.groupby(var)['charges'].mean()
print(mean_data, mean_data.diff(), sep='\n')
violin_ax = sns.violinplot(data=data, x=var, y='charges')
violin_ax.set_title(f'Distribution of target against {var}')
plt.show()

In [None]:
# smoker: mean charges per category, the difference between the two group means,
# and a violin plot of the charges distribution per category.
var = 'smoker'
mean_data = data.groupby(var)['charges'].mean()
print(mean_data, mean_data.diff(), sep='\n')
violin_ax = sns.violinplot(data=data, x=var, y='charges')
violin_ax.set_title(f'Distribution of target against {var}')
plt.show()

In [None]:
# region: mean charges per category and a violin plot of the charges
# distribution per category (consecutive differences are not printed here).
var = 'region'
mean_data = data.groupby(var)['charges'].mean()
print(mean_data)
violin_ax = sns.violinplot(data=data, x=var, y='charges')
violin_ax.set_title(f'Distribution of target against {var}')
plt.show()

Insights
* sex, region do not seem to have much impact on the target
* smoker has high  impact

In [None]:
# Grid of pairwise plots between the variables, to spot relationships with charges.
sns.pairplot(data)

In [None]:
# Same pairwise grid, coloured by smoker status to show how smoking separates the points.
sns.pairplot(data,hue='smoker')

Insights
* smokers have more charges
* higher charges with age
* bmi has influence on charges
* higher if no children?

Hypothesis¶

We have already visualized the relationship of the variables to the charges. Now we will further investigate by looking at the relationships using multiple linear regression. The aim of this section is to quantify the relationship and not to create the prediction model. 

Based on the visualizations, we can form several hypotheses about these relationships.


* Gender and region have no influence on charges.

* Charges for smokers are much higher than for non-smokers.

* Charges increase with age.

* Charges are higher once an individual reaches a BMI of 30 or above.

* Lastly, charges are higher for those who have fewer children.

Let us first create a training and testing data set to proceed.

# Preprocessing¶

In [None]:
# Prepare the data for the hypothesis test (OLS): convert the categorical columns
# to dummy variables first.
# - drop_first=True drops one level per column, which becomes the baseline and
#   avoids perfect collinearity with the intercept.
# - dtype=int keeps the dummies numeric. pandas >= 2.0 produces bool dummies by
#   default; mixed with the numeric columns they turn the design matrix into an
#   object array, which statsmodels' OLS refuses to fit.
# NOTE: this overwrites `data`, so the cell can only be run once per kernel session.
categorical_columns = ['sex', 'smoker', 'region']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype=int)
data.head()


In [None]:
# Separate the features from the target, then hold out 30% of the rows as a test
# set; the fixed random_state keeps the split reproducible.
y = data['charges']
X = data.drop(columns=['charges'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Quantify effects¶

In [None]:
import statsmodels.api as sm
from scipy import stats

In [None]:
# Fit an OLS model - the constant (intercept) column has to be passed explicitly,
# so it is added to the training features before the model is built.

X_train_const = sm.add_constant(X_train)
linearModel = sm.OLS(y_train, X_train_const)
linear = linearModel.fit()
# The summary table holds the coefficients and p-values discussed in the conclusions.
print(linear.summary())

# Conclusions:
    
There is no real difference in charges between genders (p-value 0.907) or between regions (p-values 0.342, 0.093, 0.173).
* Since all of these p-values are > 0.05, these variables do not have a statistically significant effect on the target variable.

Charges for smokers are much higher than for non-smokers (p-value 0.000).
* Since the p-value is < 0.05, this variable is statistically significant.

The charge gets higher as the individual gets older (p-value 0.000).
* since p-value < 0.05 this variable is statistically significant

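Charges get higher as BMI increases (p-value 0.000).
* Since the p-value is < 0.05, this variable is statistically significant. Note that BMI enters the model as a linear term, so this supports a general upward trend with BMI rather than a specific jump at a BMI of 30.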

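Lastly, the hypothesis was that charges are higher for those who have fewer children (p-value 0.005).
* Since the p-value is < 0.05, this variable is statistically significant, meaning there is evidence that charges differ with the number of children. The p-value alone does not give the direction of the effect, so check the sign of the `children` coefficient in the summary before concluding that fewer children means higher charges.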

# Build model¶

In [None]:
#Build pipeline and use GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

See Pipeline and GridSearchCV examples [here](https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py) and [here](https://www.kaggle.com/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv).
    

In [None]:
# Two-step pipeline whose steps are placeholders; the grid search fills them in.
pipeline = Pipeline(steps=[('scaling', 'passthrough'), ('model', 'passthrough')])

# Every scaler is tried with every linear model (2 x 4 = 8 candidates).
scalers = [StandardScaler(), MinMaxScaler()]
regressors = [LinearRegression(), Ridge(), Lasso(), ElasticNet()]
param_grid = dict(scaling=scalers, model=regressors)

# 5-fold cross-validation, scored by R^2, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search using the training split only.
grid.fit(X_train, y_train)

In [None]:
# Best-scoring pipeline (scaler + model) found by the search.
grid.best_estimator_

In [None]:
# Score of the selected pipeline on the held-out test split (the search was configured with scoring='r2').
grid.score(X_test, y_test)

# Track experiments on comet_ml¶
learn more [here](https://www.comet.ml/docs/python-sdk/scikit/) 

In [None]:
# Log every grid-search candidate to Comet as its own experiment:
# the candidate's parameter combination plus its cross-validation results.
cv_results = grid.cv_results_
for i, candidate_params in enumerate(cv_results['params']):
    exp = Experiment(workspace="yaligarp",
        project_name="saturday-codealong-medical-insurance-costs-predict",
        api_key=comet_api_key)
    try:
        exp.log_parameters(candidate_params)
        for k, v in cv_results.items():
            # 'params' is logged above. The 'param_*' columns repeat the same
            # settings as estimator objects, so they are not numeric metrics.
            if k == "params" or k.startswith("param_"):
                continue
            exp.log_metric(k, v[i])
    finally:
        # Always close the experiment, even if a logging call raises.
        exp.end()

1. See the experiment [here](https://www.comet.ml/yaligarp/saturday-codealong-medical-insurance-costs-predict/e961de32e76047cf8e487f252c24ae85?experiment-tab=chart&showOutliers=true&smoothing=0&transformY=smoothing&xAxis=wall)