In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Collection

In [None]:
# Load the insurance dataset; the relative path points into Kaggle's read-only input directory.
ins = pd.read_csv("../input/insurance/insurance.csv")
# Last expression of the cell, so the notebook renders the loaded DataFrame.
ins

In [None]:
# Print column dtypes and non-null counts (author's reading of the output: no missing values).
ins.info()

## Missing Values?

In [None]:
# Count null entries per column (author's reading of the output: no missing values).
ins.isnull().sum()

## Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ins.describe()

## Exploratory Data Analysis

In [None]:
# EDA
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline

#fig = plt.figure(figsize = (10,10))
scatter_matrix(ins[["age", "bmi","children", "charges"]], figsize = (10,10))
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Checking correlation coefficients using a heatmap
import seaborn as sns
# Correlate the numeric columns only: sex, smoker and region hold strings, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# silently dropped those columns, which is the result reproduced here).
ins_corr = ins.select_dtypes(include = "number").corr()
plt.figure(figsize = (8,6))
sns.heatmap(ins_corr, annot = True, cmap = "YlGnBu")
plt.title("Correlation Matrix", fontsize = 14)
plt.show()

In [None]:
# Bar chart of charges for smokers vs. non-smokers (seaborn's default
# estimator aggregates the rows of each group into one bar).
ax = sns.barplot(x="smoker", y="charges", data=ins)
ax.set_title("Charges by Smoker", fontsize=14)
plt.show()

In [None]:
# Mean charges per smoker group, then how much more (in percent) smokers are
# charged relative to non-smokers. Charges differ depending on smoking status.
grp_by_smoker = ins.groupby("smoker")
avg_charge_by_smoker = grp_by_smoker["charges"].mean()

smoker_avg = avg_charge_by_smoker["yes"]
non_smoker_avg = avg_charge_by_smoker["no"]
(smoker_avg - non_smoker_avg) / non_smoker_avg * 100


In [None]:
# Show the mean charges per smoker group computed in the previous cell.
avg_charge_by_smoker

In [None]:
# Mean charges for every (age bin, smoker) combination; age is cut into 5 equal-width bins.
# observed=False is spelled out because the age bins are categorical: it keeps the
# long-standing behaviour and avoids the FutureWarning newer pandas emits for the default.
grp_by_age = ins.groupby([pd.cut(ins["age"], bins = 5), "smoker"], observed = False)
mean_charges_by_age = grp_by_age["charges"].mean().reset_index()
# Convert the Interval bins to plain strings so they can serve as axis / pivot labels.
mean_charges_by_age["age"] = mean_charges_by_age["age"].astype("str")

# Per age bin, the sum of the smoker and non-smoker mean charges, broadcast back to
# each row. Select `charges` *before* transforming so the string `smoker` column is
# never aggregated, and pass "sum" rather than np.sum (the callable form is deprecated).
mean_charges_by_age["total_charges_by_age"] = mean_charges_by_age.groupby("age")["charges"].transform("sum")


In [None]:
# Reshape to one row per age bin with a "no" and a "yes" column of mean charges.
age_by_smoker = mean_charges_by_age.pivot(
    index='age',
    columns="smoker",
    values="charges",
)
# Work on an independent copy so columns can be added to it later.
mean_charges_by_age_smoke = age_by_smoker.copy()
mean_charges_by_age_smoke

In [None]:
# Absolute and relative (percent) gap between smoker and non-smoker mean charges per age bin.
smoker_charges = mean_charges_by_age_smoke["yes"]
non_smoker_charges = mean_charges_by_age_smoke["no"]
charge_gap = smoker_charges - non_smoker_charges

mean_charges_by_age_smoke["diff_smoker_non_smoker"] = charge_gap
mean_charges_by_age_smoke["pct_over_non_smoker"] = charge_gap / non_smoker_charges * 100
mean_charges_by_age_smoke

In [None]:
# Grouped bars: mean charges for each age bin, split by smoker status.
sns.barplot(x = 'age', y = 'charges', data = mean_charges_by_age, hue = "smoker")
plt.title("Distribution Charges by Age group and Smoker", fontsize = 14)
# Render the figure explicitly so the cell does not end on the Text(...) repr of the title.
plt.show()


In [None]:
# Line chart of mean charges across age bins, one line per smoker group.
charges_by_age_and_smoker = mean_charges_by_age.pivot(index="age", columns="smoker", values="charges")
ax = charges_by_age_and_smoker.plot()
ax.set_title("Variation of charges by age and smoking habit", fontsize=14)
plt.show()

There are 2 interesting observations:
1. The charges keep increasing with age.
2. Within an age group, the smoker pays more than the non-smoker.

In [None]:
# Does region have an influence on the charges?
# Plot the *mean* charge per region. Handing the raw rows to plt.bar draws one bar per
# record on just four x positions, so the bars overlap and the chart effectively shows
# only the largest single charge in each region.
region_mean_charges = ins.groupby("region")["charges"].mean()
plt.bar(x = region_mean_charges.index, height = region_mean_charges.values)
plt.xlabel("region")
plt.ylabel("mean charges")
plt.title("Mean Charges by Region", fontsize = 14)
plt.show()

In [None]:
# Mean of every numeric column per region. numeric_only=True skips the string columns
# (sex, smoker), which cannot be averaged and make GroupBy.mean() raise in pandas >= 2.0.
mean_charge_by_region = ins.groupby("region").mean(numeric_only = True).reset_index()
mean_charge_by_region

In [None]:
# Mean charges for every (region, smoker) combination, flattened to a regular
# DataFrame with `region`, `smoker` and `charges` columns.
grp_by_region_smoker = ins.groupby(["region", "smoker"])
mean_charge_by_region_smoker = grp_by_region_smoker["charges"].mean().reset_index()
mean_charge_by_region_smoker

In [None]:
# Grouped bars: mean charges per region, split by smoker status.
sns.barplot(data = mean_charge_by_region_smoker, x = "region", y = "charges", hue = "smoker")
plt.show()

In [None]:
# Mean charges per region, one line per smoker group.
mean_charge_by_region_smoker.pivot(index = "region", columns = "smoker", values = "charges").plot()
plt.title("Mean charges by region and smoking habit", fontsize = 14)
plt.ylabel("mean charges")
# Render explicitly so the cell does not end on an Axes/Text repr.
plt.show()

In [None]:
# Per-region summary statistics over the rows of the (region, smoker) mean-charge table
# (presumably two rows per region, one per smoker group - verify against the table above).
mean_charge_by_region_smoker.groupby("region").describe()

A non-smoker pays lower charges in the southeast region.

A smoker, by contrast, pays higher charges in the southeast region. For a smoker, the northeast region incurs low charges; for a non-smoker, the southeast region incurs low charges.

In [None]:
# PREDICTION
# Mean charges grouped by sex alone were almost identical (author's earlier check),
# so sex is examined here together with age bin and smoker status.
age_bins = pd.cut(ins["age"], bins=5)
grp_by_age_sex = ins.groupby([age_bins, "sex", "smoker"])
grp_by_age_sex["charges"].mean().reset_index()

In [None]:
# Review the column dtypes and non-null counts again before preparing the model features.
ins.info()

In [None]:
# Split the predictors into numeric columns (used as-is) and categorical columns (to be one-hot encoded).
num_attribs = ["age", "bmi", "children"]
cat_attribs = ["sex","smoker", "region"]

from sklearn.preprocessing import OneHotEncoder
# Exploratory fit of a one-hot encoder on the categorical columns.
ohe = OneHotEncoder()
encoded_cat = ohe.fit_transform(ins[cat_attribs])
# NOTE: this dense conversion is neither assigned nor the cell's last expression, so it is discarded.
encoded_cat.toarray()
# Displayed cell output: the category values the encoder learned for each column.
ohe.categories_

In [None]:
def OHE_attribs(df, cat_attribs):
    """One-hot encode the given categorical columns of a DataFrame.

    Parameters
    ----------
    df : DataFrame containing the columns named in ``cat_attribs``.
    cat_attribs : list of column names to encode.

    Returns
    -------
    (cat_columns, encoded) where ``cat_columns`` is a list of
    ``"<column>_<category>"`` labels in the encoder's output order and
    ``encoded`` is the dense array of one-hot indicators.
    """
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df[cat_attribs])
    # encoder.categories_ holds one array of category values per input column,
    # in the same order as cat_attribs.
    cat_columns = [
        attrib + '_' + str(category)
        for attrib, categories in zip(cat_attribs, encoder.categories_)
        for category in categories
    ]
    return cat_columns, encoded.toarray()
    
# Encode the categorical columns of the dataset; keep the generated column labels
# and the dense indicator array for building the feature table.
cat_cols,encoded_cols  = OHE_attribs(ins,cat_attribs )
cat_cols, encoded_cols

In [None]:
# Feature table: the numeric columns followed by the one-hot indicator columns,
# aligned row by row on the index.
encoded_cat_df = pd.DataFrame(encoded_cols, columns=cat_cols)
ins_df = ins[num_attribs].merge(
    encoded_cat_df,
    how="left",
    left_index=True,
    right_index=True,
)
ins_df

In [None]:
# Convert to NumPy arrays: X is the feature matrix, y the target (charges).
X = ins_df.values
y = ins["charges"].values
# Sanity check: both should have the same number of rows.
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size = 0.2)

# Confirm the sizes of the four resulting arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.metrics import mean_squared_error
def accuracy_check(y_pred, y_test):
    """Return the root-mean-squared error between predictions and targets.

    Despite the name, this is an error measure (lower is better), expressed in
    the units of the target and rounded to 2 decimal places.
    """
    rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    return round(rmse, 2)

In [None]:
# Linear regression baseline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# Score on the data the model was fitted on. Despite the name, training_acc holds
# the training RMSE returned by accuracy_check (lower is better).
y_pred_train = lin_reg.predict(X_train)
training_acc = accuracy_check(y_pred_train, y_train)
training_acc

In [None]:
# RMSE of the linear model on the held-out test set (test_acc is an error, not an accuracy).
y_pred_test = lin_reg.predict(X_test)
test_acc = accuracy_check(y_pred_test, y_test)
test_acc

In [None]:
# Cross-validation using linear regression
from sklearn.model_selection import cross_val_score

# 10-fold CV on the training set. The scorer returns *negative* MSE (higher is better),
# so negate before taking the square root to obtain one RMSE per fold.
scores = cross_val_score(lin_reg,X_train, y_train, scoring = "neg_mean_squared_error", cv = 10)
lin_reg_score = np.sqrt(-scores)

def display_scores(scores):
    """Print the scores, their mean and their standard deviation, each rounded to 2 decimals.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    summary = (
        ("Scores:", scores),
        ("Mean :", scores.mean()),
        ("Std. :", scores.std()),
    )
    for label, value in summary:
        print(label, np.round(value, 2))

# Report the per-fold RMSE values of the linear model with their mean and spread.
display_scores(lin_reg_score)

In [None]:
# Testing with Decision Tree
from sklearn.tree import DecisionTreeRegressor
# Fix the seed: tree construction breaks ties between equally good splits at random,
# so without it the scores change from run to run. The value matches the seed used
# for the train/test split.
dtree = DecisionTreeRegressor(random_state = 1)

# 10-fold CV on the training set; the negative-MSE scores are converted to per-fold RMSE.
scores = cross_val_score(dtree, X_train, y_train, scoring = "neg_mean_squared_error", cv =10)
dtree_scores = np.sqrt(-scores)
display_scores(dtree_scores)


In [None]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
# Fix the seed: bootstrap sampling and feature sub-sampling are random, so without it
# the CV scores, the test score and the later grid-search ranking (which reuses this
# estimator) are not reproducible. The value matches the train/test split seed.
forest_reg = RandomForestRegressor(random_state = 1)
# 10-fold CV on the training set; the negative-MSE scores are converted to per-fold RMSE.
scores = cross_val_score(forest_reg, X_train, y_train, scoring = "neg_mean_squared_error", cv = 10)
forest_reg_scores = np.sqrt(-scores)

display_scores(forest_reg_scores) # Best performer (author's reading of the output)

In [None]:
# Test-set RMSE for each model, starting with linear regression
# (same computation as the earlier test_acc cell).
y_pred = lin_reg.predict(X_test)
accuracy_check(y_pred, y_test)

In [None]:
# cross_val_score fits clones of the estimator, so dtree itself is fitted here for
# the first time - on the full training set - before scoring the test set (RMSE).
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
accuracy_check(y_pred, y_test)

In [None]:
# Fit the random forest on the full training set, then report its test-set RMSE.
forest_reg.fit(X_train, y_train)
y_pred = forest_reg.predict(X_test)
accuracy_check(y_pred, y_test)

In [None]:
# Tuning the random forest with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Two grids are searched:
#   1. default bootstrap setting: 3 x 5 = 15 combinations
#   2. bootstrap disabled:        2 x 2 = 4 combinations
# NOTE(review): max_features must not exceed the number of columns in X - confirm
# against X.shape (the largest value tried here is 10).
params = [
    {'n_estimators':[3,10,30], 'max_features':[2,4,6,8,10]},
    {'bootstrap': [False], 'n_estimators':[5,15], 'max_features': [3,5]}
]
# 5-fold CV for each of the 19 candidates, scored by negative MSE (higher is better).
grid_search = GridSearchCV(forest_reg, params, cv = 5, scoring = "neg_mean_squared_error", return_train_score=True)

grid_search.fit(X_train, y_train)


In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# The estimator configured with the best parameters found by the search.
grid_search.best_estimator_

In [None]:
# Print the cross-validated RMSE of every candidate (scores are negative MSE, hence the sign flip).
# The loop variable is deliberately NOT called `params`: that would overwrite the
# notebook-level `params` grid passed to GridSearchCV with the last candidate dict.
cv_results = grid_search.cv_results_
for mean_test_score, candidate_params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(candidate_params, np.sqrt(-mean_test_score))

In [None]:
# Use the best estimator from the grid search as the final model.
final_model = grid_search.best_estimator_

# RMSE of the tuned model on the held-out test set.
y_pred_final = final_model.predict(X_test)
accuracy_check(y_pred_final, y_test)

In [None]:
from scipy import stats

# t-based confidence interval for the mean squared test error; the square root
# of its two bounds expresses the interval in RMSE units.
confidence = 0.95
sq_error = (y_pred_final - y_test) ** 2
dof = len(sq_error) - 1

np.sqrt(
    stats.t.interval(
        confidence,
        dof,
        loc=sq_error.mean(),
        scale=stats.sem(sq_error),
    )
)

This is the range estimate of the error in the population with 95% confidence. So