In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the data

In [None]:
# Load the insurance dataset from the input directory and preview the first rows
df_insurance = pd.read_csv('../input/insurance/insurance.csv')
df_insurance.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df_insurance.shape

In [None]:
# Summary statistics of the frame's columns
df_insurance.describe()

In [None]:
# Column names, dtypes and non-null counts
df_insurance.info()

## Looking for missing values in the dataset

In [None]:
# Count the missing (null) values in each column
df_insurance.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df_insurance.duplicated().sum()

### Observation:
 1. There are no missing values in the data.
 2. The dataset contains 1 duplicate row, which we need to handle before proceeding.

In [None]:
# Drop exact duplicate rows in place, then re-count duplicates to confirm none remain
df_insurance.drop_duplicates(inplace = True)
df_insurance.duplicated().sum()

### Treating low-cardinality numeric columns as categorical

In [None]:
# Some columns hold only a handful of distinct values. Columns with fewer than
# 10 distinct values are cast to str so that they are handled as categorical.
col = list(df_insurance.columns)
low_cardinality = [name for name in col if df_insurance[name].nunique() < 10]
for name in low_cardinality:
    df_insurance[name] = df_insurance[name].astype(str)
# Confirm the resulting dtypes
df_insurance.info()

## Univariate analysis of the dataset

In [None]:
# Split the columns into numeric and categorical groups.
# select_dtypes is the public pandas API for picking columns by dtype
# (the original relied on the private _get_numeric_data helper).
num_col = list(df_insurance.select_dtypes(include='number').columns)
# Keep the frame's own column order: a set difference has no guaranteed order,
# which made the order of the plotted panels vary between runs.
cat_col = [name for name in df_insurance.columns if name not in num_col]
cat_col

In [None]:
# Helper that draws one box plot of the target per categorical column
def plot_cat(df, catColumns, target='charges', n_cols=2):
    """Draw a grid of box plots, one panel per categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns and the ``target`` column.
    catColumns : sequence of str
        Column names to put on the x-axis, one panel each.
    target : str, default 'charges'
        Column plotted on the shared y-axis of every panel.
    n_cols : int, default 2
        Panels per row; as many rows as needed are created.

    Raises
    ------
    ValueError
        If ``catColumns`` is empty or ``n_cols`` is not positive.

    Notes
    -----
    With four columns and the defaults this produces the same 2x2, 24x12
    figure as before. Other column counts used to fail (fewer than four)
    or be silently truncated (more than four).
    """
    columns = list(catColumns)
    if not columns:
        raise ValueError('catColumns must contain at least one column name')
    if n_cols < 1:
        raise ValueError('n_cols must be a positive integer')

    # Ceiling division: enough rows to give every column its own panel.
    n_rows = -(-len(columns) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(12 * n_cols, 6 * n_rows),
                           sharey=True, squeeze=False)
    plt.suptitle('Univariate Categorical Analysis', color='brown', fontsize=20, fontweight='bold')

    panels = axes.ravel()
    for ax, column in zip(panels, columns):
        sns.boxplot(data=df, x=column, y=target, ax=ax)
        ax.set_title(f'Graph for {column}')
    # Hide the panels left over when the grid has more cells than columns.
    for spare in panels[len(columns):]:
        spare.set_visible(False)
# Draw the box plots of charges for the categorical columns collected in cat_col
plot_cat(df_insurance, cat_col)

### Observations:
1. We can clearly see that charges are much higher for smokers.

In [None]:
# Pair plot of the frame to inspect the pairwise relationships between its variables
sns.pairplot(df_insurance)

### Observation
1. We can see bmi has some sort of linear relationship with charges

#### Data preparation for regression


In [None]:
# Encode the two-valued categorical columns as integer labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
binary_columns = ['sex', 'smoker']
for colName in binary_columns:
    # fit_transform learns the distinct labels and maps the column in one step
    df_insurance[colName] = le.fit_transform(df_insurance[colName])
# One-hot encode the region field and append the indicator columns to the frame
regions = pd.get_dummies(df_insurance['region'])
df_insurance = pd.concat([df_insurance, regions], axis=1)
df_insurance.head()

In [None]:
# Remove the original 'region' column (already one-hot encoded) together with
# the 'northeast' indicator -- presumably kept out as the baseline category.
redundant_columns = ['region', 'northeast']
df_insurance.drop(columns=redundant_columns, inplace=True)
df_insurance.head()

#### Check for correlation

In [None]:
# Compute the correlation matrix once and reuse it for the heatmap and the table.
# Casting to float lets every column take part regardless of how it is stored
# (numeric values held as strings, boolean indicator columns).
corr_matrix = df_insurance.astype(float).corr()
sns.heatmap(corr_matrix, annot = True, cmap = 'Greens')
# Long format: one row per (level_0, level_1) pair, coefficient in column 0
corr_df = corr_matrix.unstack().reset_index()
# Keep charges against every *other* feature. The previous upper-triangle mask
# (built with np.bool, which NumPy 1.24 removed) only retained features located
# before 'charges' in the frame, so the appended region indicators were lost.
is_charges_pair = (corr_df['level_0'] == 'charges') & (corr_df['level_1'] != 'charges')
corr_df = corr_df[is_charges_pair].sort_values(by = 0, ascending = False).dropna()
corr_df.head()

Clearly, the feature most strongly correlated with charges is smoking status, followed by age and bmi.

#### SPLIT Data into TEST and TRAIN data

In [None]:
from sklearn.model_selection import train_test_split
# Split the prepared frame into train and test frames; random_state fixes the shuffle.
# NOTE(review): test_size = .70 sends 70% of the rows to df_test and leaves only 30%
# for df_train. If a 70/30 train/test split was intended this should be
# train_size = .70 -- confirm before relying on the reported scores.
df_train ,df_test = train_test_split(df_insurance,test_size = .70,random_state = 100)
# Shapes of the two splits
print(df_train.shape)
print(df_test.shape)

### Scaling of the data
1. Standardisation: rescales data to have mean 0 and standard deviation 1
2. MinMax: rescales values to the range 0 to 1
#### MinMax scaling is preferred here; note that it is sensitive to outliers, because the minimum and maximum values define the scaled range

In [None]:
# Rescale the continuous predictors with a min-max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_columns = ['age', 'bmi']
# Fit on the training split only; the fitted scaler is reused for the test split
train_scaled = scaler.fit_transform(df_train[scaled_columns])
df_train[scaled_columns] = train_scaled
df_train.head()

### let's start model building via train data

In [None]:
# Separate the target: pop removes 'charges' from df_train and returns it
y_train = df_train.pop('charges')
# X_train is the same object as df_train (no copy), now without the target column
X_train = df_train
X_train.head()

In [None]:
# Add a constant column to the predictors (used as the intercept term by the OLS fit that follows)
import statsmodels.api as sm
X_train_sm = sm.add_constant(X_train)
X_train_sm.head()

In [None]:
# Fit an ordinary least squares model on the training design matrix
lr = sm.OLS(y_train,X_train_sm.astype(float))
lr_model = lr.fit()
print(lr_model.params)
print(lr_model.summary())
# Collect the p-value of every coefficient, aligned with the design-matrix columns.
# Values are kept at full precision: rounding to two decimals before the later
# `> 0.05` comparison would treat p-values in (0.05, 0.055) as significant.
p_values = pd.DataFrame({
    'Features': X_train_sm.columns,
    'Pvalue': lr_model.pvalues.loc[X_train_sm.columns].to_numpy(),
})

### Find the Variance Inflation Factor (VIF)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def vif_func(X_train_sm):
    """Return the variance inflation factor of every column, highest first.

    Parameters
    ----------
    X_train_sm : pandas.DataFrame
        Design matrix; every column must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` (column name) and ``VIF`` (rounded to two
        decimals), sorted by ``VIF`` in descending order.
    """
    # Convert the frame once; the cast does not depend on the column index,
    # so repeating it for every column was wasted work.
    design = X_train_sm.astype(float).values
    vif = pd.DataFrame({
        'Features': X_train_sm.columns,
        'VIF': [variance_inflation_factor(design, position) for position in range(design.shape[1])],
    })
    vif['VIF'] = vif['VIF'].round(2)
    return vif.sort_values(by = 'VIF', ascending = False)
# Compute the VIF table for the current training design matrix
vif = vif_func(X_train_sm)

In [None]:
# Combine VIF and p-value per feature, joined on the shared 'Features' column
p_vif_df = vif.merge(p_values, on = 'Features', how = 'inner')
# Remove the intercept row by name and keep the result. The previous
# `drop(index = 0)` was positional and its result was discarded, so the constant
# stayed in the table. The index is renumbered so that rows remain addressable
# as 0..n-1.
p_vif_df = p_vif_df[p_vif_df['Features'] != 'const'].reset_index(drop = True)
p_vif_df

#### Now we follow a backward-elimination approach to find the best-fit model
1. If both the VIF and the p-value of a column are high (p-value above 0.05), drop that column
2. If two columns look similar on VIF and p-value, first drop the column with the higher p-value, then check how that affects the VIF

In [None]:
# Names of the columns removed by model_improvise; kept at module level so the
# same columns can be removed from the test design matrix afterwards.
col_name = []
def model_improvise(p_vif_df, y_train, X_train_sm, threshold=0.05):
    """Drop statistically insignificant columns from the design matrix in place.

    Parameters
    ----------
    p_vif_df : pandas.DataFrame
        Table with a ``Features`` column (column names) and a ``Pvalue`` column.
    y_train : any
        Unused; retained so existing calls keep working.
    X_train_sm : pandas.DataFrame
        Design matrix, modified in place.
    threshold : float, default 0.05
        Features whose p-value is strictly greater than this are dropped.

    Side effects
    ------------
    Appends every dropped feature name to the module-level ``col_name`` list
    (without duplicates) and removes those columns from ``X_train_sm``.
    """
    # Select by condition rather than by row position, so the table may be in
    # any order and carry any index.
    flagged = p_vif_df.loc[p_vif_df['Pvalue'] > threshold, 'Features']
    for feature in flagged:
        if feature not in col_name:
            col_name.append(feature)
    # Only drop what is still there: makes a repeated call a no-op instead of a KeyError.
    still_present = [feature for feature in col_name if feature in X_train_sm.columns]
    X_train_sm.drop(columns = still_present, inplace = True)
# Drop the columns whose p-value exceeds 0.05 from X_train_sm; their names are recorded in col_name
model_improvise(p_vif_df,y_train,X_train_sm)

#### Check the VIF again

In [None]:
# Recompute the VIF table on the reduced design matrix
vif_func(X_train_sm)

#### Observations:
1. The model looks good, with low VIF and p-values below 0.05

In [None]:
# Refit the OLS model on the columns that remain in the design matrix
lr = sm.OLS(y_train,X_train_sm.astype(float))
lr_model = lr.fit()
# Display the regression summary of the refitted model
lr_model.summary()

#### Observation:
The model looks significant, with p-values below 0.05 and an R-squared of 0.74

In [None]:
from sklearn.metrics import r2_score
# Predictions of the fitted model on the training design matrix
y_train_pred = lr_model.predict(X_train_sm.astype(float))
# Residuals: actual minus predicted
res_train = y_train - y_train_pred
# Distribution of the residuals. histplot with a KDE overlay on a density scale
# replaces the deprecated sns.distplot.
sns.histplot(res_train, kde=True, stat='density')
# R-squared of the model on the training sample
r2_score(y_true = y_train,y_pred = y_train_pred )

In [None]:
# Scale the test split with the scaler fitted on the training split (transform only, no refit)
df_test[['age','bmi']]=scaler.transform(df_test[['age','bmi']])
df_test.head()

In [None]:
# Prepare the test split the same way as the training split
# Separate the target: pop removes 'charges' from df_test and returns it
y_test = df_test.pop('charges')
# X_test is the same object as df_test (no copy)
X_test = df_test
# Add the constant column, as was done for the training design matrix
X_test_sm = sm.add_constant(X_test)
# Remove the columns that were dropped from the training design matrix (names recorded in col_name)
X_test_sm.drop(columns = col_name , axis =1 ,inplace = True)

In [None]:
# Predict on the test design matrix with the model fitted on the training data
y_test_pred = lr_model.predict(X_test_sm.astype(float))
# Residuals: actual minus predicted
res_test = y_test - y_test_pred
# Distribution of the residuals. histplot with a KDE overlay on a density scale
# replaces the deprecated sns.distplot.
sns.histplot(res_test, kde=True, stat='density')
# R-squared of the model on the test sample
r2_score(y_true = y_test,y_pred = y_test_pred )

In [None]:
# Compare the actual target values with the predicted values on the test split
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
# Show the last five rows of the comparison
df.tail(5)

In [None]:
# Scatter of actual against predicted charges on the test split;
# axis labels and a title let the figure stand on its own.
plt.scatter(y_test,y_test_pred)
plt.xlabel('Actual charges')
plt.ylabel('Predicted charges')
plt.title('Actual vs predicted charges (test data)');

## For both train and test data the R2 score is 0.74 and 0.75 respectively, which is a good sign: what the model learned on the train data holds up on the test data

#### Another approach: Recursive Feature Elimination (RFE)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# Cast to float once so columns stored as strings or booleans are handled the
# same way as in the statsmodels cells.
X_train_numeric = X_train.astype(float)
lr = LinearRegression()
lr.fit(X_train_numeric,y_train)
# n_features_to_select must be passed by keyword: recent scikit-learn releases
# reject it as a positional argument.
# NOTE(review): if X_train has 9 or fewer columns, asking for 9 features means
# nothing is eliminated and every feature is ranked 1 -- confirm the intended value.
rfe = RFE(estimator = lr, n_features_to_select = 9)
rfe = rfe.fit(X_train_numeric,y_train)
# (column name, selected?, rank) for every predictor
list(zip(X_train.columns,rfe.support_,rfe.ranking_))