In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the available datasets are visible in the output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt # Visualization
import seaborn as sns #Visualization

In [None]:
# Read the insurance CSV from the Kaggle input directory into a DataFrame.
insurance = pd.read_csv(filepath_or_buffer='/kaggle/input/insurance/insurance.csv')
# Preview the first five rows as the cell output.
insurance.head(5)

We can check for missing values in the dataset. None are missing here.

In [None]:
# Plot the boolean null mask as a heatmap: any missing cell would stand out
# as a contrasting mark against the uniform background.
plt.figure(figsize=(12, 4))
null_mask = insurance.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
plt.title('Missing value in the dataset');

We can check whether certain attributes correlate with others.

In [None]:
# Correlation plot.
# Pearson correlation is only defined for numeric columns. The raw frame still
# contains the un-encoded categorical attributes (e.g. 'sex', 'smoker',
# 'region' -- presumably stored as strings), and pandas >= 2.0 raises on
# DataFrame.corr() when such columns are present instead of silently dropping
# them. Selecting the numeric columns first works on old and new pandas alike.
corr = insurance.select_dtypes(include='number').corr()
# annot=True writes each coefficient inside its cell.
sns.heatmap(corr, cmap = 'Wistia', annot= True);

We encode categorical attributes by adding a separate column for each of their possible values.

In [None]:
# One-hot encode the categorical attributes: every distinct value of each
# listed column becomes its own 0/1 indicator column (stored as int8).
categorical_columns = ['sex', 'children', 'smoker', 'region']
insurance_encoded = pd.get_dummies(insurance, columns=categorical_columns, dtype='int8')

# Verify the dummy-variable step by comparing column names and shapes
# before and after encoding.
for header, frame in (
    ('Columns in original data frame:\n', insurance),
    ('\nColumns in data frame after encoding dummy variable:\n', insurance_encoded),
):
    print(header, frame.columns.values)
    print('\nNumber of rows and columns in the dataset:', frame.shape)

insurance_encoded.head()

We build the set of examples by separating input attributes from the output one (charges).
Next, we use hold-out (test-set) validation: 30% of the examples are set aside as a test set for assessing performance.

In [None]:
from sklearn.model_selection import train_test_split
# Target (dependent variable): the insurance charges.
y = insurance_encoded['charges']
# Inputs (independent variables): every remaining column.
X = insurance_encoded.drop(columns='charges')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=23
)

We train the model using linear regression.

In [None]:
# Scikit Learn module
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split. scikit-learn fits the
# intercept itself, so there is no need to add a constant x_0 = 1 column.
lin_reg = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out test split and report the score.
R_square_sk = lin_reg.score(X_test, y_test)
print('R square error is :', R_square_sk)

We also check how good our technique is by 10-fold cross-validation.

In [None]:

from sklearn.model_selection import cross_val_score

def cross_val(classifier, cv=10):
    """Print the mean and standard deviation of k-fold cross-validation scores.

    The estimator is evaluated on the full feature matrix ``X`` and target
    ``y`` defined earlier in the notebook (read as globals). No ``scoring``
    argument is passed to ``cross_val_score``, so the estimator's own
    ``score`` method is used; for the regressors evaluated in this notebook
    that is the R^2 coefficient of determination, not a classification
    accuracy, and the printed label says so.

    Parameters
    ----------
    classifier : estimator
        scikit-learn estimator to evaluate. The parameter name is kept for
        backward compatibility even though regressors are passed in.
    cv : default 10
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    None
        The summary is printed rather than returned.
    """
    scores = cross_val_score(classifier, X, y, cv=cv)
    print("Cross validation scores: %0.2f R^2 with a standard deviation of %0.2f" % (scores.mean(), scores.std()))


# 10-fold cross-validation of the linear regression model.
cross_val(lin_reg)


We can retrieve the linear coefficients and the intercept and discuss how each variable affects the prediction.

In [None]:
# Tabulate the fitted linear-regression parameters: the intercept comes
# first, followed by one coefficient per input column (same order as X).
columns = ['intersect:x_0=1', *X.columns.values]
theta = [lin_reg.intercept_, *lin_reg.coef_]
parameters = pd.DataFrame({'Columns': columns, 'theta': theta})
parameters

Check for linearity. The closer the plot is to a diagonal line, the better the prediction.

In [None]:
# Predict charges for the held-out test rows.
test_pred = lin_reg.predict(X_test)

# Check for linearity: scatter the actual values (x) against the predicted
# values (y) in the left slot of a 1x2 subplot grid.
f = plt.figure(figsize=(14, 5))
ax = f.add_subplot(1, 2, 1)
sns.scatterplot(x=y_test, y=test_pred, color='r', ax=ax)
ax.set_title('Check for Linearity:\n Actual Vs Predicted value')


# Using other linear regression models

## Lasso

In [None]:
from sklearn.linear_model import Lasso
# Fit an L1-regularised linear model on the training split.
lasso = Lasso(alpha=0.1, random_state=42)
r_model = lasso.fit(X_train, y_train)

# Report the score on both splits to compare fit against generalisation.
train_score = lasso.score(X_train, y_train)
test_score = lasso.score(X_test, y_test)
print("Score on the train data:", train_score)
print("Score on the test data: ", test_score)

# Tabulate the fitted parameters: intercept first, then one weight per column.
columns = ['intersect:x_0=1', *X.columns.values]
theta = [lasso.intercept_, *lasso.coef_]
parameters = pd.DataFrame({'Columns': columns, 'theta': theta})
parameters

## Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet
# Fit a linear model with a mixed L1/L2 penalty (l1_ratio=0.9) on the
# training split.
en = ElasticNet(alpha=0.1, l1_ratio=0.9, random_state=42)
r_model = en.fit(X_train, y_train)

# Report the score on both splits to compare fit against generalisation.
train_score = en.score(X_train, y_train)
test_score = en.score(X_test, y_test)
print("Score on the train data:", train_score)
print("Score on the test data: ", test_score)

# Tabulate the fitted parameters: intercept first, then one weight per column.
columns = ['intersect:x_0=1', *X.columns.values]
theta = [en.intercept_, *en.coef_]
parameters = pd.DataFrame({'Columns': columns, 'theta': theta})
parameters

# Regression with Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a shallow (depth-2) regression tree on the training split.
# random_state is fixed for reproducibility: the one-hot encoding produces
# columns that yield identical splits (e.g. the two dummies of a two-valued
# attribute), and scikit-learn breaks such ties using its random feature
# permutation. Without a seed, the column chosen at a split -- and therefore
# the plotted tree and feature importances -- can differ between runs.
dtr = DecisionTreeRegressor(max_depth=2, random_state=42)
model = dtr.fit(X_train, y_train)  # fit() returns the estimator itself

# Report the score on both splits to compare fit against generalisation.
print("Score on the train data:", dtr.score(X_train, y_train))
print("Score on the test data: ", dtr.score(X_test, y_test))


In [None]:
from sklearn import tree
import matplotlib.pyplot as plt # data visualization

# Draw the fitted tree on a large canvas so the node text stays legible.
plt.figure(figsize=(40,20))

# feature_names is passed as a plain list of strings: some scikit-learn
# releases validate this argument strictly and reject a pandas Index.
# The trailing semicolon keeps the returned list of annotation objects out
# of the cell output, leaving only the figure.
tree.plot_tree(model, feature_names=list(X.columns), label='root');

In [None]:
import matplotlib.pyplot as plt # data visualization

# Horizontal bar chart with one bar per input column, whose length is the
# importance the fitted tree assigns to that column.
plt.figure()
plt.title("Feature importances")
plt.barh(y=X.columns, width=model.feature_importances_, height=1)

# K-fold cross-validation for choosing parameters

Estimating the `alpha` parameter for the `Lasso` algorithm

In [None]:
# Sweep the Lasso regularisation strength from 0.1 up to (but excluding) 1.0
# in steps of 0.1 and print the cross-validation summary for each value.
for alpha in np.arange(0.1, 1.0, 0.1):
    print(f"Lasso with alpha={alpha:.2f}:", end=" ")
    lasso = Lasso(alpha=alpha)
    cross_val(lasso)

We see that the choice of alpha does not really matter.

Estimating `alpha` and `l1_ratio` params for `ElasticNet`

In [None]:
# Print a grid of mean cross-validation scores for ElasticNet:
# one row per alpha, one column per l1_ratio, both swept over 0.1 .. 0.9.
# NOTE(review): cross_val_score is called here without a cv argument, so it
# uses its default fold count rather than the cv=10 used by cross_val().
grid_values = np.arange(0.1, 1.0, 0.1)

# Header row: blank corner cell followed by the l1_ratio values.
print("    ", " ".join(f"{ratio:.2f}" for ratio in grid_values), end=" ")

for a in grid_values:
    # Start a new row labelled with the current alpha.
    print()
    print(f"{a:.2f}", end=" ")
    for r in grid_values:
        en = ElasticNet(alpha=a, l1_ratio=r, random_state=42)
        mean_score = cross_val_score(en, X, y).mean()
        print(f"{mean_score:.2f}", end=" ")
    

It seems that `alpha=0.1` and `l1_ratio=0.9` provide the best results.

Estimating depth for the DecisionTree

In [None]:
# Sweep the maximum tree depth from 1 to 9 and print the cross-validation
# summary for each depth.
# random_state is fixed so that tied splits (which one-hot encoded columns
# can produce) are resolved the same way on every run, keeping the printed
# scores reproducible.
for i in range(1, 10):
    print("DecisionTree with depth={0:d}:".format(i), end=" ")
    dt = DecisionTreeRegressor(max_depth=i, random_state=42)
    cross_val(dt)

It seems that a `max_depth` of 3, 4, or 5 works best.