In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'medicalcosts'

In [2]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
insurance_df = pd.read_sql_query('select * from medicalcosts',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()


insurance_df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.9
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.705,0,no,northwest,21984.5
4,32,male,28.88,0,no,northwest,3866.86
5,31,female,25.74,0,no,southeast,3756.62
6,46,female,33.44,1,no,southeast,8240.59
7,37,female,27.74,3,no,northwest,7281.51
8,37,male,29.83,2,no,northeast,6406.41
9,60,female,25.84,0,no,northwest,28923.1


In [3]:
insurance_df["is_male"] = pd.get_dummies(insurance_df.sex, drop_first=True)
insurance_df["is_smoker"] = pd.get_dummies(insurance_df.smoker, drop_first=True)

# OLS Model

In [7]:
Y = insurance_df.charges
X = insurance_df[['age', 'bmi', 'children', 'is_male', 'is_smoker']]

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

print('The number of observations in training set is {}'.format(X_train.shape[0]))
print('The number of observations in test set is {}'.format(X_test.shape[0]))

The number of observations in training set is 1070
The number of observations in test set is 268


In [11]:
import statsmodels.api as sm

X_train = sm.add_constant(X_train)

lrm = sm.OLS(y_train, X_train).fit()
lrm.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.747
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,629.9
Date:,"Mon, 09 Sep 2019",Prob (F-statistic):,6.39e-315
Time:,23:17:00,Log-Likelihood:,-10848.0
No. Observations:,1070,AIC:,21710.0
Df Residuals:,1064,BIC:,21740.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.206e+04,1080.784,-11.161,0.000,-1.42e+04,-9941.509
age,253.4693,13.432,18.871,0.000,227.114,279.825
bmi,333.2182,31.136,10.702,0.000,272.123,394.313
children,498.5781,157.344,3.169,0.002,189.839,807.318
is_male,-441.2898,376.955,-1.171,0.242,-1180.949,298.370
is_smoker,2.381e+04,465.272,51.172,0.000,2.29e+04,2.47e+04

0,1,2,3
Omnibus:,255.026,Durbin-Watson:,1.933
Prob(Omnibus):,0.0,Jarque-Bera (JB):,629.6
Skew:,1.259,Prob(JB):,1.92e-137
Kurtosis:,5.789,Cond. No.,296.0


In [94]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

score = cross_val_score(LinearRegression(), X, Y, scoring='r2', cv = 5)
print(score)
print("OLS Model Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

[0.76097878 0.70873346 0.77558408 0.73136669 0.7565446 ]
OLS Model Accuracy: 0.75 (+/- 0.05)


# KNN Model

In [93]:
from sklearn import neighbors

knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X, Y)

score_knn = cross_val_score(knn, X, Y, cv=5)
print(score_knn)
print("KNN Unweighted Model Accuracy: %0.2f (+/- %0.2f)" % (score_knn.mean(), score_knn.std() * 2))

[0.17751186 0.15596507 0.11245722 0.18089199 0.15687323]
KNN Unweighted Model Accuracy: 0.16 (+/- 0.05)


# KNN Model with Weighting

In [95]:
knn_w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knn_w.fit(X, Y)
score_w = cross_val_score(knn_w, X, Y, cv=5)
print(score_w)
print("KNN Weighted Model Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

[0.20443193 0.17744611 0.14505675 0.21982911 0.20184872]
KNN Weighted Model Accuracy: 0.19 (+/- 0.05)


The difference is obvoius in favor to the OLS model with R-Square = 0.75 with small variance of +/- 0.05. The KNN Models peformed poorly with a very low accuracy. It looks like models with many features are perofmring better with OLS models compare to KNN Models.

Below, OLS and KNNmodels will be built using only 2 features:

# OLS Model 2

In [88]:
Y2 = insurance_df.charges
X2 = insurance_df[['is_male', 'is_smoker']]

In [89]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, Y2, test_size = 0.2, random_state = 465)

print('The number of observations in training set is {}'.format(X2_train.shape[0]))
print('The number of observations in test set is {}'.format(X2_test.shape[0]))

The number of observations in training set is 1070
The number of observations in test set is 268


In [90]:
X2_train = sm.add_constant(X2_train)

lrm2 = sm.OLS(y2_train, X2_train).fit()
lrm2.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.621
Model:,OLS,Adj. R-squared:,0.62
Method:,Least Squares,F-statistic:,873.4
Date:,"Mon, 09 Sep 2019",Prob (F-statistic):,2.11e-225
Time:,23:42:53,Log-Likelihood:,-11065.0
No. Observations:,1070,AIC:,22140.0
Df Residuals:,1067,BIC:,22150.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8623.1715,343.542,25.101,0.000,7949.077,9297.266
is_male,-458.5556,460.197,-0.996,0.319,-1361.550,444.439
is_smoker,2.377e+04,569.204,41.756,0.000,2.27e+04,2.49e+04

0,1,2,3
Omnibus:,119.295,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,192.289
Skew:,0.763,Prob(JB):,1.76e-42
Kurtosis:,4.408,Cond. No.,2.97


In [91]:
score2 = cross_val_score(LinearRegression(), X2, Y2, scoring='r2', cv = 5)
print(score3)
print("OLS Model Accuracy: %0.2f (+/- %0.2f)" % (score2.mean(), score2.std() * 2))

[0.61988603 0.58400697 0.61053917 0.59336812 0.65763599]
OLS Model Accuracy: 0.62 (+/- 0.04)


# KNN Model

In [97]:
knn2 = neighbors.KNeighborsRegressor(n_neighbors=100)
knn2.fit(X2, Y2)

score_knn2 = cross_val_score(knn2, X2, Y2, cv=5)
print(score_knn2)
print("KNN Unweighted Model Accuracy: %0.2f (+/- %0.2f)" % (score_knn2.mean(), score_knn2.std() * 2))

[0.61988603 0.58400697 0.61053917 0.59336812 0.65763599]
KNN Unweighted Model Accuracy: 0.61 (+/- 0.05)


# KNN Model with Weighting

In [98]:
knn_w2 = neighbors.KNeighborsRegressor(n_neighbors=100, weights='distance')
knn_w2.fit(X2, Y2)
score_w2 = cross_val_score(knn_w2, X2, Y2, cv=5)
print(score_w2)
print("KNN Weighted Model Accuracy: %0.2f (+/- %0.2f)" % (score_w2.mean(), score_w2.std() * 2))

[0.61912922 0.58264722 0.62366812 0.59786415 0.65255064]
KNN Weighted Model Accuracy: 0.62 (+/- 0.05)


When models are built using only 2 features, the KNN models performed well and almost equal to OLS model performance