In [1]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Show floats in DataFrame/Series output with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: display the value of every top-level expression
# in a cell, not only the last one (later cells rely on this to show several results)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Helpers for explicit rich output (not used in the cells visible here)
from IPython.display import display, HTML

#### Read and pre-process data

In [4]:
# Load the insurance dataset from the CSV file in the working directory
csv_path = 'insurance.csv'
insurance_data = pd.read_csv(csv_path)

# Preview the first five rows
insurance_data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86


In [5]:
# Keep only the rows for smokers, then drop every column except bmi and charges
is_smoker = insurance_data['smoker'] == 'yes'
unused_columns = ['age', 'sex', 'children', 'smoker', 'region']
subset_data = insurance_data.loc[is_smoker].drop(columns=unused_columns)

# Preview the subset
subset_data.head()

# Number of smoker rows
len(subset_data)

Unnamed: 0,bmi,charges
0,27.9,16884.92
11,26.29,27808.73
14,42.13,39611.76
19,35.3,36837.47
23,31.92,37701.88


274

In [6]:
# Hold out the last 74 rows as the test set, without shuffling, to mirror the
# train/test split we did by hand in Excel in class
bmi_values = subset_data['bmi']
charge_values = subset_data['charges']
X_train, X_test, y_train, y_test = train_test_split(
    bmi_values, charge_values, test_size=74, shuffle=False
)

# Mostly we will do one of the following in the course:
# # Random 75/25 split (a different split on every run)
# # X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25)
# # Pick a random_state as below and keep using the same number (example 42) to repeat the same test and train data
# X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25, random_state=42)

# Show all four pieces of the split
X_train
X_test
y_train
y_test

0      27.90
11     26.29
14     42.13
19     35.30
23     31.92
        ... 
994    20.05
1000   22.99
1001   32.70
1007   28.21
1011   20.13
Name: bmi, Length: 200, dtype: float64

1021   31.02
1022   36.08
1026   26.03
1030   23.66
1031   35.20
        ... 
1313   34.70
1314   23.66
1321   26.70
1323   40.37
1337   29.07
Name: bmi, Length: 74, dtype: float64

0      16,884.92
11     27,808.73
14     39,611.76
19     36,837.47
23     37,701.88
          ...   
994    16,420.49
1000   17,361.77
1001   34,472.84
1007   24,915.22
1011   18,767.74
Name: charges, Length: 200, dtype: float64

1021   35,595.59
1022   42,211.14
1026   16,450.89
1030   21,677.28
1031   44,423.80
          ...   
1313   36,397.58
1314   18,765.88
1321   28,101.33
1323   43,896.38
1337   29,141.36
Name: charges, Length: 74, dtype: float64

In [7]:
# Fit a simple linear regression of charges on bmi using the training rows.
# The single bmi column is converted to a NumPy array with Series.to_numpy()
# and reshaped into one column (n_samples, 1) before being passed to the model.
# (to_numpy() is the documented way to get a NumPy array from a Series; reshaping
# the object returned by .array relies on behaviour that varies by pandas version.)
# When extending to multiple features remove .to_numpy().reshape(-1, 1)
X_train_2d = X_train.to_numpy().reshape(-1, 1)

model = LinearRegression(fit_intercept = True)
model.fit(X_train_2d, y_train)

# The following gives the R-square score (computed on the training data)
model.score(X_train_2d, y_train)

# This is the coefficient Beta_1 (or slope of the Simple Linear Regression line)
model.coef_

# This is the coefficient Beta_0 (the intercept)
model.intercept_

0.6893147423420581

array([1504.39646168])

-14103.755405395561

In [8]:
# Predict charges for the held-out rows. The original row index is kept so the
# predictions can later be aligned with y_test by index.
# Series.to_numpy() is used instead of reshaping the object returned by .array,
# because it is the documented way to get a NumPy array from a Series.
# When extending to multiple features remove .to_numpy().reshape(-1, 1)
test_output = pd.DataFrame(
    model.predict(X_test.to_numpy().reshape(-1, 1)),
    index = X_test.index,
    columns = ['pred_charges'],
)
test_output.head()

Unnamed: 0,pred_charges
1021,32562.62
1022,40174.87
1026,25055.68
1030,21482.74
1031,38851.0


In [9]:
# Attach the actual charges to the predictions by matching on the row index.
# Selecting only 'pred_charges' first keeps this cell safe to re-run: without it,
# a second run would merge 'charges' again, producing 'charges_x'/'charges_y'
# columns and breaking the 'charges' lookup below.
test_output = test_output[['pred_charges']].merge(y_test, left_index = True, right_index = True)
test_output.head()

# Mean absolute error on the test rows: the average of |predicted - actual|
mean_absolute_error = (test_output['pred_charges'] - test_output['charges']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_charges,charges
1021,32562.62,35595.59
1022,40174.87,42211.14
1026,25055.68,16450.89
1030,21482.74,21677.28
1031,38851.0,44423.8


Mean absolute error is 
5556.488164099252


#### Visualize data

In [10]:
# Import the visualization libraries and initialise plotly's notebook mode
# (no function is defined here; this cell only imports and configures)
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the wildcard import below puts every plotly.graph_objs name into the
# notebook namespace; the plotting cell visible here only uses the `go` alias.
from plotly.graph_objs import *
# NOTE(review): `tools` and `sns` are not used in the cells visible here - confirm
# whether they are still needed.
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [11]:
# Compare actual and predicted charges against bmi for the training and test rows.
# Series.to_numpy() is used instead of reshaping the object returned by .array,
# because it is the documented way to get a NumPy array from a Series.
# When extending to multiple features remove .to_numpy().reshape(-1, 1) below
# (but remember that with more than one feature we cannot draw this simple 2-D plot)
train_pred = model.predict(X_train.to_numpy().reshape(-1, 1))
test_pred = model.predict(X_test.to_numpy().reshape(-1, 1))

# (x values, y values, legend label) for each trace, in drawing order
trace_specs = [
    (X_train, y_train, 'Train data actual'),
    (X_train, train_pred, 'Train data predicted'),
    (X_test, y_test, 'Test data actual'),
    (X_test, test_pred, 'Test data predicted'),
]
plot_data = [
    go.Scatter(x=x_vals, y=y_vals, name=label, mode='markers')
    for x_vals, y_vals, label in trace_specs
]

layout = go.Layout(xaxis = dict(title='bmi'), yaxis = dict(title= 'charges'),
                   title = 'Plot of predicted and actual')
fig = go.Figure(data= plot_data, layout=layout)
plotly.offline.iplot(fig)
