In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

In [7]:
# Load the insurance dataset from the parent directory.
# Forward slashes are used because the previous backslash path relied on
# invalid string escapes ("\." and "\i") and only resolved on Windows;
# "../insurance.csv" names the same file and works on every platform.
df = pd.read_csv("../insurance.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() emitted a stray "None" line after the summary.
df.info()
print(df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [67]:
# Separate the regression target (charges) from the predictor columns.
Y = df['charges']
X = df.drop('charges', axis=1)
print(X.head())
Y.head()

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest


0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [68]:
# One-hot encode the categorical columns: get_dummies emits one indicator
# column per category level (sex_female, sex_male, smoker_no, ...).
# The columns are read from df rather than X because X loses them once the
# encoded columns are merged in, which made re-running this cell raise KeyError.
X_dummy = pd.get_dummies(df[['sex', 'smoker', 'region']])
X_dummy.head()

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0


In [69]:
# Replace the raw categorical columns with their one-hot encoded counterparts.
# X is rebuilt from df instead of being mutated in place, so re-running this
# cell yields the same frame rather than raising KeyError on columns that an
# earlier run already dropped.
X = pd.concat(
    [df.drop(columns=['charges', 'sex', 'smoker', 'region']), X_dummy],
    axis=1,
)
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,1,0,0,0,1
1,18,33.77,1,0,1,1,0,0,0,1,0
2,28,33.0,3,0,1,1,0,0,0,1,0
3,33,22.705,0,0,1,1,0,0,1,0,0
4,32,28.88,0,0,1,1,0,0,1,0,0


In [70]:
# Drop one indicator column per encoded feature to avoid multicollinearity
# (the dummy-variable trap) while still preserving all information: e.g.
# smoker_no alone fully determines the dropped smoker_yes.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent, so running it a second time is a no-op rather than a KeyError.
X = X.drop(columns=['smoker_yes', 'sex_female', 'region_northwest'], errors='ignore')

In [71]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1234,
)

In [73]:
# Fit an ordinary least squares model on the training split.
# The `normalize` argument is no longer passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises TypeError. For plain
# (unregularised) least squares, feature scaling does not change the fitted
# predictions, so the reported metrics are unaffected.
lm = LinearRegression().fit(X_train, Y_train)

# Predictions on both splits, used for the RMSE figures below.
pred = lm.predict(X_test)
pred_train = lm.predict(X_train)

# score() returns the coefficient of determination (R^2) for the given split.
print("R square on training: ", lm.score(X_train, Y_train))
print("R square on test: ", lm.score(X_test, Y_test))
# RMSE = square root of the mean squared error, in the units of `charges`.
print("RMSE on Training: ", sqrt(mean_squared_error(Y_train, pred_train)))
print("RMSE on Testing: ", sqrt(mean_squared_error(Y_test, pred)))


R square on training:  0.7487857827731725
R square on test:  0.7569021086470606
RMSE on Training:  6080.390932250754
RMSE on Testing:  5916.984338587023
