In [38]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
def _five_decimal_places(x):
    """Format a number with exactly five digits after the decimal point."""
    return '%.5f' % x


# Every float pandas displays in this notebook goes through the formatter above.
pd.set_option('display.float_format', _five_decimal_places)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from math import sqrt 
from sklearn.model_selection import cross_val_predict  
from sklearn.metrics import r2_score, mean_squared_error  

In [889]:
# Read the insurance records from the CSV file next to the notebook.
data = pd.read_csv("insurance.csv")

# Integer codes used to replace the text values of the two binary columns.
sex_map = {"female" : 0, "male" : 1}
smoker_map = {"yes" : 0, "no" : 1}
data = data.assign(
    sex=data["sex"].map(sex_map),
    smoker=data["smoker"].map(smoker_map),
)

# Two-column overview: every column name next to its dtype.
data_type = pd.DataFrame(
    {"column_name": data.columns, "column_type": data.dtypes.values}
)
data_type

Unnamed: 0,column_name,column_type
0,age,int64
1,sex,int64
2,bmi,float64
3,children,int64
4,smoker,int64
5,region,object
6,expenses,float64


In [890]:
def preprocess_my_data(data, scale=False, polyfeature=False,
                       target="expenses", degree=3, random_state=0):
    """Turn a raw frame into train/test features and targets.

    Steps, in order: copy the input, one-hot encode its object-dtype columns,
    separate the target column, split into train and test sets, then
    optionally standardise and/or polynomially expand the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing the ``target`` column. It is not modified.
    scale : bool, default False
        Standardise the features with a ``StandardScaler`` fitted on the
        training split only.
    polyfeature : bool, default False
        Expand the features with ``PolynomialFeatures(degree=degree)`` fitted
        on the training split only.
    target : str, default "expenses"
        Name of the column to predict.
    degree : int, default 3
        Polynomial degree used when ``polyfeature`` is enabled.
    random_state : int, default 0
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y, data_x, data_y)`` where
        ``data_x`` / ``data_y`` are the train rows followed by the test rows.

    Notes
    -----
    When ``scale`` or ``polyfeature`` is enabled the feature frames are
    rebuilt from arrays and therefore carry a fresh 0..n-1 row index, while
    the targets keep their original row labels. Pair features and targets by
    position, not by index label.
    """
    data_copy = data.copy()

    print(data_copy.dtypes)

    # Take the categorical columns from the frame being processed rather than
    # from a notebook-level variable, so the function does not depend on (or
    # get misled by) state left behind by other cells.
    categorical_columns = data_copy.select_dtypes(include=["object"]).columns.tolist()
    if categorical_columns:
        data_copy = pd.get_dummies(data_copy, columns=categorical_columns)

    data_x = data_copy.drop([target], axis=1)
    data_y = data_copy[target]

    train_x, test_x, train_y, test_y = train_test_split(
        data_x, data_y, shuffle=True, random_state=random_state
    )

    if scale:
        scaler = StandardScaler()
        # Fit and transform on the same kind of object (a DataFrame) so the
        # scaler sees consistent feature names; the test split is only
        # transformed with the training statistics.
        train_x = pd.DataFrame(scaler.fit_transform(train_x), columns=train_x.columns)
        test_x = pd.DataFrame(scaler.transform(test_x), columns=test_x.columns)

    if polyfeature:
        polynomial_features = PolynomialFeatures(degree=degree)
        train_x = pd.DataFrame(polynomial_features.fit_transform(train_x))
        # Reuse the transformer fitted on the training split; never refit on
        # held-out data.
        test_x = pd.DataFrame(polynomial_features.transform(test_x))

    # Train rows first, test rows second, for both features and targets.
    data_x = pd.concat([train_x, test_x], axis=0)
    data_y = pd.concat([train_y, test_y], axis=0)

    print(data_x.columns)

    return train_x, test_x, train_y, test_y, data_x, data_y

In [891]:
# Build the scaled, polynomial-expanded splits and show the shape of each piece.
(train_x, test_x, train_y, test_y, data_x, data_y) = preprocess_my_data(
    data, scale=True, polyfeature=True
)
tuple(part.shape for part in (train_x, test_x, train_y, test_y, data_x, data_y))

age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region       object
expenses    float64
dtype: object
RangeIndex(start=0, stop=220, step=1)


((1003, 220), (335, 220), (1003,), (335,), (1338, 220), (1338,))

In [892]:
# Inspect the processed training features produced by preprocess_my_data.
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
0,1.00000,-0.51485,-0.98516,-0.18005,-0.06361,0.50374,-0.54240,-0.55777,1.62298,-0.59309,...,-0.17353,0.50493,-0.18452,-1.46921,0.53689,-0.19620,4.27502,-1.56223,0.57089,-0.20862
1,1.00000,1.54875,-0.98516,-1.39983,-0.89214,0.50374,1.84367,-0.55777,-0.61615,-0.59309,...,-0.17353,-0.19169,-0.18452,-0.21175,-0.20383,-0.19620,-0.23392,-0.22516,-0.21673,-0.20862
2,1.00000,-1.43992,1.01507,-0.98254,-0.06361,0.50374,-0.54240,-0.55777,-0.61615,1.68609,...,-0.17353,-0.19169,0.52456,-0.21175,0.57946,-1.58570,-0.23392,0.64011,-1.75166,4.79342
3,1.00000,-1.36876,-0.98516,-1.01464,-0.89214,-1.98517,-0.54240,-0.55777,1.62298,-0.59309,...,-0.17353,0.50493,-0.18452,-1.46921,0.53689,-0.19620,4.27502,-1.56223,0.57089,-0.20862
4,1.00000,-0.94181,-0.98516,-1.36773,-0.89214,0.50374,-0.54240,1.79284,-0.61615,-0.59309,...,5.76271,-1.98049,-1.90635,0.68064,0.65516,0.63064,-0.23392,-0.22516,-0.21673,-0.20862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,1.00000,-0.87065,1.01507,-0.75784,-0.89214,0.50374,1.84367,-0.55777,-0.61615,-0.59309,...,-0.17353,-0.19169,-0.18452,-0.21175,-0.20383,-0.19620,-0.23392,-0.22516,-0.21673,-0.20862
999,1.00000,0.19673,1.01507,0.84713,0.76493,0.50374,-0.54240,-0.55777,1.62298,-0.59309,...,-0.17353,0.50493,-0.18452,-1.46921,0.53689,-0.19620,4.27502,-1.56223,0.57089,-0.20862
1000,1.00000,0.05442,1.01507,-0.90229,-0.89214,0.50374,-0.54240,-0.55777,1.62298,-0.59309,...,-0.17353,0.50493,-0.18452,-1.46921,0.53689,-0.19620,4.27502,-1.56223,0.57089,-0.20862
1001,1.00000,-1.43992,1.01507,0.76688,-0.89214,0.50374,-0.54240,1.79284,-0.61615,-0.59309,...,5.76271,-1.98049,-1.90635,0.68064,0.65516,0.63064,-0.23392,-0.22516,-0.21673,-0.20862


In [893]:
# Seed given to every estimator that accepts ``random_state`` so that repeated
# runs of the notebook report the same metrics.
MODEL_SEED = 0

# Candidate regressors keyed by display name. The names are left-padded with
# spaces to a common width so they line up when printed one per line.
models = {
    "                     Linear Regression": LinearRegression(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=MODEL_SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=MODEL_SEED),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "                         Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED)
}

In [894]:
def model_summary(model, model_name, cvn=20):
    """Print an evaluation report for an already fitted regressor.

    The report uses the notebook-level ``train_x``, ``test_x``, ``train_y``,
    ``test_y``, ``data_x`` and ``data_y`` variables, so those must describe
    the data the model was fitted on.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    model_name : str
        Printed as the first line of the report.
    cvn : int, default 20
        Passed as ``cv`` to ``cross_val_predict``.

    The figures labelled "Accuracy" are R^2 scores (``r2_score``). Nothing is
    returned; the report is written to standard output.
    """
    print(model_name)

    # Predict on both splits up front, before any metric is reported.
    fitted_train = model.predict(train_x)
    fitted_test = model.predict(test_x)

    print("Training Accuracy: ", r2_score(train_y, fitted_train))
    print("Testing Accuracy: ", r2_score(test_y, fitted_test))

    print("RMSE for Training Data: ", sqrt(mean_squared_error(train_y, fitted_train)))
    print("RMSE for Testing Data: ", sqrt(mean_squared_error(test_y, fitted_test)))

    # Cross-validated predictions over the combined train + test rows.
    cross_predicted = cross_val_predict(model, data_x, data_y, cv=cvn)
    print("Accuracy for", cvn,"- Fold Cross Predicted: ", r2_score(data_y, cross_predicted))

In [895]:
# Fit every regressor in `models` on the current training split.
for name in models:
    model = models[name]
    model.fit(train_x, train_y)
    print(name + " trained.")

                     Linear Regression trained.
                   K-Nearest Neighbors trained.




                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [896]:
# Print the evaluation report of every regressor in `models`.
for name in models:
    model = models[name]
    model_summary(model, name)

                     Linear Regression
Training Accuracy:  0.8432726770198031
Testing Accuracy:  0.8623318488726837
RMSE for Training Data:  4732.361579904418
RMSE for Testing Data:  4655.504094371313
Accuracy for 20 - Fold Cross Predicted:  0.8272064069440559
                   K-Nearest Neighbors
Training Accuracy:  0.8443574653320628
Testing Accuracy:  0.8335135385089941
RMSE for Training Data:  4715.955619597897
RMSE for Testing Data:  5119.640127937328
Accuracy for 20 - Fold Cross Predicted:  0.7890304329898132
                        Neural Network
Training Accuracy:  -0.2824947697815319
Testing Accuracy:  -0.25108028513013614
RMSE for Training Data:  13537.3408328235
RMSE for Testing Data:  14034.358404165494




Accuracy for 20 - Fold Cross Predicted:  0.19877669515625684
Support Vector Machine (Linear Kernel)
Training Accuracy:  0.5835310602328199
Testing Accuracy:  0.6308119404808208
RMSE for Training Data:  7714.306157320669
RMSE for Testing Data:  7623.840401975987
Accuracy for 20 - Fold Cross Predicted:  0.6475386144539015
   Support Vector Machine (RBF Kernel)
Training Accuracy:  -0.09308208648872829
Testing Accuracy:  -0.09471123023235006
RMSE for Training Data:  12497.753385427415
RMSE for Testing Data:  13128.036020672389
Accuracy for 20 - Fold Cross Predicted:  -0.10080778579105854
                         Decision Tree
Training Accuracy:  0.9994547913426406
Testing Accuracy:  0.7535835921689917
RMSE for Training Data:  279.11741463181363
RMSE for Testing Data:  6228.518864184397
Accuracy for 20 - Fold Cross Predicted:  0.6871260921990181
                         Random Forest
Training Accuracy:  0.9745244281579322
Testing Accuracy:  0.8737126816821208
RMSE for Training Data:  1907.9

In [897]:
# Remove the `region` column for the second experiment. Rebinding instead of
# dropping in place, together with errors="ignore", keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=["region"], errors="ignore")

In [884]:
# Rebuild the scaled, polynomial-expanded splits from `data` as it stands now
# and show the shape of each piece.
(train_x, test_x, train_y, test_y, data_x, data_y) = preprocess_my_data(
    data, scale=True, polyfeature=True
)
tuple(part.shape for part in (train_x, test_x, train_y, test_y, data_x, data_y))

age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
expenses    float64
dtype: object
RangeIndex(start=0, stop=56, step=1)


((1003, 56), (335, 56), (1003,), (335,), (1338, 56), (1338,))

In [885]:
# Inspect the rebuilt training features.
train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,1.00000,-0.51485,-0.98516,-0.18005,-0.06361,0.50374,0.26507,0.50721,0.09270,0.03275,...,-0.00584,-0.00206,0.01633,-0.00073,0.00577,-0.04569,-0.00026,0.00204,-0.01614,0.12782
1,1.00000,1.54875,-0.98516,-1.39983,-0.89214,0.50374,2.39861,-1.52576,-2.16798,-1.38170,...,-2.74301,-1.74818,0.98709,-1.11416,0.62909,-0.35521,-0.71008,0.40093,-0.22638,0.12782
2,1.00000,-1.43992,1.01507,-0.98254,-0.06361,0.50374,2.07336,-1.46161,1.41477,0.09159,...,-0.94852,-0.06140,0.48630,-0.00398,0.03148,-0.24932,-0.00026,0.00204,-0.01614,0.12782
3,1.00000,-1.36876,-0.98516,-1.01464,-0.89214,-1.98517,1.87350,1.34844,1.38879,1.22113,...,-1.04456,-0.91845,-2.04371,-0.80757,-1.79698,-3.99857,-0.71008,-1.58004,-3.51584,-7.82332
4,1.00000,-0.94181,-0.98516,-1.36773,-0.89214,0.50374,0.88700,0.92782,1.28814,0.84023,...,-2.55860,-1.66893,0.94233,-1.08861,0.61467,-0.34706,-0.71008,0.40093,-0.22638,0.12782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,1.00000,-0.87065,1.01507,-0.75784,-0.89214,0.50374,0.75803,-0.88377,0.65981,0.77674,...,-0.43525,-0.51238,0.28931,-0.60318,0.34058,-0.19230,-0.71008,0.40093,-0.22638,0.12782
999,1.00000,0.19673,1.01507,0.84713,0.76493,0.50374,0.03870,0.19970,0.16666,0.15049,...,0.60793,0.54894,0.36150,0.49567,0.32642,0.21496,0.44758,0.29475,0.19410,0.12782
1000,1.00000,0.05442,1.01507,-0.90229,-0.89214,0.50374,0.00296,0.05524,-0.04910,-0.04855,...,-0.73458,-0.72632,0.41010,-0.71815,0.40549,-0.22896,-0.71008,0.40093,-0.22638,0.12782
1001,1.00000,-1.43992,1.01507,0.76688,-0.89214,0.50374,2.07336,-1.46161,-1.10425,1.28461,...,0.45101,-0.52468,0.29625,0.61038,-0.34464,0.19460,-0.71008,0.40093,-0.22638,0.12782


In [886]:
# Refit every regressor in `models` on the current training split.
for name in models:
    model = models[name]
    model.fit(train_x, train_y)
    print(name + " trained.")

                     Linear Regression trained.
                   K-Nearest Neighbors trained.




                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [887]:
# Print the evaluation report of every regressor in `models`.
for name in models:
    model = models[name]
    model_summary(model, name)

                     Linear Regression
Training Accuracy:  0.8355466992767058
Testing Accuracy:  0.8814078014426706
RMSE for Training Data:  4847.600921326647
RMSE for Testing Data:  4320.937984245219
Accuracy for 20 - Fold Cross Predicted:  0.8385819063721809
                   K-Nearest Neighbors
Training Accuracy:  0.85440972069296
Testing Accuracy:  0.8589526304010063
RMSE for Training Data:  4561.122659922318
RMSE for Testing Data:  4712.2949810263635
Accuracy for 20 - Fold Cross Predicted:  0.8128251841533437
                        Neural Network
Training Accuracy:  -0.6193301227826709
Testing Accuracy:  -0.5464413191083968
RMSE for Training Data:  15211.542648528124
RMSE for Testing Data:  15603.308509906015




Accuracy for 20 - Fold Cross Predicted:  -0.3299937935973145
Support Vector Machine (Linear Kernel)
Training Accuracy:  0.33372023384522387
Testing Accuracy:  0.3970913167078851
RMSE for Training Data:  9757.394727444922
RMSE for Testing Data:  9742.61952104642
Accuracy for 20 - Fold Cross Predicted:  0.48237299675825374
   Support Vector Machine (RBF Kernel)
Training Accuracy:  -0.09113611350490647
Testing Accuracy:  -0.09261516825444982
RMSE for Training Data:  12486.623789344103
RMSE for Testing Data:  13115.461763227693
Accuracy for 20 - Fold Cross Predicted:  -0.09593033699255282
                         Decision Tree
Training Accuracy:  0.9994530805400387
Testing Accuracy:  0.6822933978939467
RMSE for Training Data:  279.5549909455425
RMSE for Testing Data:  7072.339378299368
Accuracy for 20 - Fold Cross Predicted:  0.6907212445463851
                         Random Forest
Training Accuracy:  0.9727396372930913
Testing Accuracy:  0.8721678211968332
RMSE for Training Data:  1973.6