In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print them one per line in walk order.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance dataset from the Kaggle input mount into a DataFrame.
INSURANCE_CSV = "/kaggle/input/insurance/insurance.csv"
data = pd.read_csv(INSURANCE_CSV)


### Columns

- age: age of primary beneficiary

- sex: insurance contractor gender, female, male

- bmi: Body mass index, an objective index of body weight relative to height (weight divided by height squared, kg / m ^ 2),
indicating whether weight is relatively high or low for a given height; ideally 18.5 to 24.9

- children: Number of children covered by health insurance / Number of dependents

- smoker: whether the beneficiary smokes (yes / no)

- region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

- charges: Individual medical costs billed by health insurance

In [None]:
# Preview the first rows of the loaded frame to sanity-check the columns listed above.
data.head()

## **EDA - Understanding Data**

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts, to check for missing values.
data.info()

In [None]:
import matplotlib.pyplot as plt

# Box plot of the charges column to look for outliers.
fig, ax = plt.subplots()
ax.boxplot(data.charges)
ax.set_xlabel("charges")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()
# Observation: many outliers are present.

In [None]:
# Histogram of charges to inspect the shape of its distribution.
fig, ax = plt.subplots()
ax.hist(data.charges)
ax.set_xlabel("charges")
ax.set_ylabel("count")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()
# Observation: the distribution is right-skewed.

In [None]:
# Box plot of BMI to look for outliers.
fig, ax = plt.subplots()
ax.boxplot(data.bmi)
ax.set_xlabel("BMI")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()
# Observation: outliers are present (bmi > 50), which can affect the model fit.

In [None]:
# Histogram of BMI to inspect the shape of its distribution.
fig, ax = plt.subplots()
ax.hist(data.bmi)
ax.set_xlabel("BMI")
ax.set_ylabel("count")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()

# Observation: roughly normal distribution overall.

## Without log transform

In [None]:
# Scatter of charges (dependent, y) against BMI (independent, x) on the raw scale.
fig, ax = plt.subplots()
ax.scatter(data.bmi, data.charges)
ax.set_xlabel("BMI")
ax.set_ylabel("charges")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()

In [None]:
# difficult to conclude whether there is a linear relationship
# part of the graph looks linear, but a single linear fit cannot be applied to the data directly

In [None]:
import seaborn as sns

# seaborn.regplot: scatter of charges against BMI with a fitted regression line overlaid.
sns.regplot(data=data, x="bmi", y="charges")

In [None]:
import statsmodels.api as sm

# Simple linear regression of charges on BMI.
# statsmodels' OLS does not add an intercept by itself; without an explicit
# constant column the line is forced through the origin and the reported
# R-squared is the uncentered one, which overstates the fit.
bmi_with_const = sm.add_constant(data.bmi)
mod = sm.OLS(data.charges, bmi_with_const)
res = mod.fit()
print(res.summary())

## With log transform

In [None]:
# Scatter of log(charges) (dependent, y) against BMI (independent, x).
fig, ax = plt.subplots()
ax.scatter(data.bmi, np.log(data.charges))
ax.set_xlabel("BMI")
# The plotted values are logged, so the axis label says so.
ax.set_ylabel("log(charges)")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()

In [None]:
# seaborn.regplot on the log scale (seaborn is already imported as sns in an earlier cell).
ax = sns.regplot(x=data.bmi, y=np.log(data.charges))
# np.log keeps the series name "charges", which seaborn would use as the axis
# label; relabel so the plot states that the values are logged.
ax.set_ylabel("log(charges)");
# Observation: a little better than before, but still needs work.

In [None]:
# Linear regression of log(charges) on BMI (statsmodels is already imported as
# sm in an earlier cell).
# statsmodels' OLS does not add an intercept by itself; without an explicit
# constant column the line is forced through the origin and the reported
# R-squared is the uncentered one, which overstates the fit.
bmi_with_const = sm.add_constant(data.bmi)
mod = sm.OLS(np.log(data.charges), bmi_with_const)
res = mod.fit()
print(res.summary())

## Running the scatter plot against other parameters to find a relation

In [None]:
# BMI vs charges, with points coloured by smoking status.
scatter_plot_is_smoker = sns.scatterplot(data=data, x="bmi", y="charges", hue="smoker")

- for non-smokers, insurance charges are essentially flat with respect to BMI,
- while for smokers there is a strong linear relationship between BMI and insurance charges

In [None]:
# Same BMI vs charges scatter, restricted to rows where smoker == 'yes'.
smokers_only = data[data["smoker"].eq("yes")]
scatter_is_smoker_true = sns.scatterplot(data=smokers_only, x="bmi", y="charges")

In [None]:
# Histogram of charges for rows where smoker == 'yes'.
fig, ax = plt.subplots()
ax.hist(data.loc[data['smoker'] == 'yes', 'charges'])
ax.set_xlabel("charges")
ax.set_ylabel("count")
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list under the cell.
plt.show()

- bimodal distribution found, hence it would not be wise to fit a single regression line

In [None]:
# BMI vs charges, with points coloured by sex.
# NOTE(review): the result is still bound to scatter_plot_is_smoker (name kept
# unchanged) even though the hue in this cell is "sex".
scatter_plot_is_smoker = sns.scatterplot(data=data, x="bmi", y="charges", hue="sex")

- no clear linearity found

In [None]:
# BMI vs charges, with points coloured by age.
# NOTE(review): the result is still bound to scatter_plot_is_smoker (name kept
# unchanged) even though the hue in this cell is "age".
scatter_plot_is_smoker = sns.scatterplot(data=data, x="bmi", y="charges", hue="age")

- no clear linearity found

In [None]:
# BMI vs charges, with points coloured by region.
# NOTE(review): the result is still bound to scatter_plot_is_smoker (name kept
# unchanged) even though the hue in this cell is "region".
scatter_plot_is_smoker = sns.scatterplot(data=data, x="bmi", y="charges", hue="region")

- no clear linearity found

In [None]:
# BMI vs charges, with points coloured by number of children.
# NOTE(review): the result is still bound to scatter_plot_is_smoker (name kept
# unchanged) even though the hue in this cell is "children".
scatter_plot_is_smoker = sns.scatterplot(data=data, x="bmi", y="charges", hue="children")

- no clear linearity found

## Moving ahead and fitting the regression for smoker = yes

In [None]:
# Regress charges on BMI using only the rows where smoker == 'yes'.
# NOTE(review): sm.OLS receives bmi alone, with no constant column, so this fit
# has no intercept term and its R-squared is the uncentered one. The figures
# quoted in the notebook's narrative come from this through-origin fit; if an
# intercept is added (sm.add_constant), that text needs re-deriving.
is_smoker = data['smoker'] == 'yes'
data_smoking_true = data.loc[is_smoker]
mod_smoking = sm.OLS(endog=data_smoking_true.charges, exog=data_smoking_true.bmi)
res_smoking = mod_smoking.fit()
print(res_smoking.summary())

- if you are a smoker and your BMI goes up by one unit, you would expect an increase of $1061.07 in insurance charges (note: this model is fit without an intercept, i.e. the line is forced through the origin)

In [None]:
# Refit the smoker-only regression after dropping rows at or above the BMI cutoff.
# NOTE(review): as in the smoker-only fit, no constant column is passed to
# sm.OLS, so the model has no intercept term.
BMI_CUTOFF = 52.58  # rows with bmi >= this value are excluded as outliers
keep_rows = (data['smoker'] == 'yes') & (data['bmi'] < BMI_CUTOFF)
data_bmi_range = data.loc[keep_rows]
mod_bmi = sm.OLS(endog=data_bmi_range.charges, exog=data_bmi_range.bmi)
res_bmi = mod_bmi.fit()
print(res_bmi.summary())

- removing outliers from the bmi column and including only the data points where smoker = yes
- the bmi coef = 1063.30 (slope) changes only slightly from the previous fit, hence we can safely conclude that
  the outlier was not influential; R-squared = 0.954 remains the same