In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the insurance dataset from its CSV file and preview the first five rows.
df = pd.read_csv('data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
# Distinct values found in the 'sex' column.
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [4]:
# Explicit level lists for the three text columns; list order is the order
# the levels are handed to the encoder.
all_region = [
    'southwest',
    'southeast',
    'northwest',
    'northeast',
]
sex = [
    'male',
    'female',
]
Smoker = [
    'yes',
    'no',
]

In [5]:
# Split the column labels by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
is_object_dtype = df.dtypes == 'object'
categorical_columns = df.columns[is_object_dtype]
numerical_columns = df.columns[~is_object_dtype]
print("numerical_columns = ", numerical_columns)
print('categorical_columns = ',categorical_columns)

numerical_columns =  Index(['age', 'bmi', 'children', 'expenses'], dtype='object')
categorical_columns =  Index(['sex', 'smoker', 'region'], dtype='object')


In [6]:
from sklearn.impute import SimpleImputer #handle missing value
from sklearn.preprocessing import StandardScaler #features scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal Encoding

## Pipeline
from sklearn.pipeline import Pipeline ##combining multiple steps
from sklearn.compose import ColumnTransformer ## used to merge categorical and numerical data

In [11]:
# Numerical pipeline: fill missing values with the column median, then standardize.
Pipe_num=Pipeline(
    steps=[
        # 'median' is the valid strategy name; 'medium' is not accepted by SimpleImputer.
        ('inputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# OrdinalEncoder pairs each entry of `categories` with an input column by
# position, so the level lists are looked up by column name and emitted in the
# same order as categorical_columns instead of being hard-coded in a fixed order.
category_levels={'sex':sex,'smoker':Smoker,'region':all_region}

# Categorical pipeline: fill missing values with the most frequent level,
# encode the levels as ordinals, then standardize the codes.
Pipe_cat=Pipeline(
    steps=[
        ('inputer',SimpleImputer(strategy='most_frequent')),
        ('odinalencoder',OrdinalEncoder(categories=[category_levels[col] for col in categorical_columns])),
        ('scalar',StandardScaler())
    ]
)

# 'expenses' is the prediction target. It is numeric, so it appears in
# numerical_columns, but it is not a feature and is dropped from X; keep it out
# of the feature preprocessor.
feature_numerical_columns=[col for col in numerical_columns if col != 'expenses']

# Route numerical and categorical feature columns through their own pipelines.
preprecessor=ColumnTransformer(
    [
    ('num_pipeline',Pipe_num,feature_numerical_columns),
    ('cat_pipeline',Pipe_cat,categorical_columns)
])

In [12]:
# Display the assembled ColumnTransformer as the cell output.
preprecessor

In [13]:
# Display the numerical pipeline as the cell output.
Pipe_num

In [9]:
# Boolean row masks selecting smokers and non-smokers.
isSmoker = df['smoker'].eq('yes')
notsmoker = df['smoker'].eq('no')

In [10]:
# Recode 'smoker' in place: no -> 0, yes -> 1 (the two masks are disjoint,
# so the assignment order does not matter).
df.loc[notsmoker, 'smoker'] = 0
df.loc[isSmoker, 'smoker'] = 1

In [11]:
# One boolean row mask per region label.
region_labels = df['region']
SEregion = region_labels.eq('southeast')
SWregion = region_labels.eq('southwest')
NWregion = region_labels.eq('northwest')
NEregion = region_labels.eq('northeast')

In [12]:
# Recode 'region' in place: southeast -> 1, southwest -> 2, northwest -> 3,
# northeast -> 4. The masks were computed beforehand and are disjoint, so the
# assignments are independent of each other.
df.loc[NEregion, 'region'] = 4
df.loc[NWregion, 'region'] = 3
df.loc[SWregion, 'region'] = 2
df.loc[SEregion, 'region'] = 1

In [13]:
# Boolean row masks for each value of 'sex'.
men = df['sex'].eq('male')
women = df['sex'].eq('female')

In [14]:
# Recode 'sex' in place: female -> 0, male -> 1 (disjoint masks, order-independent).
df.loc[women, 'sex'] = 0
df.loc[men, 'sex'] = 1

In [15]:
# The recoded columns still hold Python objects; cast all three to float.
convert_dict = dict.fromkeys(('sex', 'smoker', 'region'), float)
df = df.astype(convert_dict)

In [16]:
# Preview the frame after the categorical columns were recoded as floats.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0.0,27.9,0,1.0,2.0,16884.92
1,18,1.0,33.8,1,0.0,1.0,1725.55
2,28,1.0,33.0,3,0.0,1.0,4449.46
3,33,1.0,22.7,0,0.0,3.0,21984.47
4,32,1.0,28.9,0,0.0,3.0,3866.86


In [17]:
# Target is 'expenses'; every remaining column is an independent feature.
Y = df['expenses']
X = df.drop(columns=['expenses'])

In [18]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable across runs.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [21]:
from sklearn.metrics import r2_score, mean_squared_error
import math
from sklearn.model_selection import cross_val_predict  

In [20]:
def model_summary(model, model_name, cvn=20): # Default value for cvn = 20
    """Print train/test R2 and RMSE for a model, plus a cross-validated R2.

    Reads the notebook-level ``x_train``, ``x_test``, ``y_train``, ``y_test``,
    ``X`` and ``Y``.

    Parameters
    ----------
    model : estimator whose ``predict`` is called on the train and test
        features; it is also handed to ``cross_val_predict``.
    model_name : label printed as the first line of the report.
    cvn : value forwarded as ``cv`` to ``cross_val_predict`` (default 20).

    Returns
    -------
    None. All results are printed. The last line is labelled "Accuracy" but
    the value is the R2 score of the cross-validated predictions.
    """
    print(model_name)

    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # R2 on each split.
    print("Training R2 Score: ", r2_score(y_train, train_pred))
    print("Testing R2 Score: ", r2_score(y_test, test_pred))

    # RMSE is the square root of the mean squared error.
    train_rmse = math.sqrt(mean_squared_error(y_train, train_pred))
    print("RMSE for Training Data: ", train_rmse)
    test_rmse = math.sqrt(mean_squared_error(y_test, test_pred))
    print("RMSE for Testing Data: ", test_rmse)

    # Out-of-fold predictions over the full X/Y, scored with R2.
    cv_pred = cross_val_predict(model, X, Y, cv=cvn)
    print("Accuracy for", cvn,"- Fold Cross Predicted: ", r2_score(Y, cv_pred))

### Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear model with default settings.
regressor=LinearRegression()

In [23]:
# Fit the linear model on the training split.
regressor.fit(x_train,y_train)

In [24]:
# Print train/test metrics and the cross-validated score for the linear model.
model_summary(regressor, "Multiple_linear_Regression")

Multiple_linear_Regression
Training R2 Score:  0.7415963559998441
Testing R2 Score:  0.7839067775185786
RMSE for Training Data:  6107.072436225262
RMSE for Testing Data:  5792.080727789472
Accuracy for 20 - Fold Cross Predicted:  0.7472738794666414


### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
# Depth-limited regression tree with a fixed seed. fit() returns the estimator
# itself, so construction and fitting are chained into one assignment.
decision_tree_reg = DecisionTreeRegressor(random_state=13, max_depth=5).fit(x_train, y_train)
model_summary(decision_tree_reg, "Decision_Tree_Regression")

Decision_Tree_Regression
Training R2 Score:  0.8793891095338366
Testing R2 Score:  0.8311614173587286
RMSE for Training Data:  4172.314299103488
RMSE for Testing Data:  5119.763291336907
Accuracy for 20 - Fold Cross Predicted:  0.8498632643795985


### Random Forest Regressor

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
# A random forest is stochastic. Without a fixed random_state every run of this
# cell reports different scores, so pin the seed (same value as the train/test
# split) to make the results reproducible.
random_forest_reg=RandomForestRegressor(random_state=42)
random_forest_reg.fit(x_train,y_train)
# Print train/test metrics and the cross-validated score for the forest.
model_summary(random_forest_reg,"Random_Forest_Regressor")

Random_Forest_Regressor
Training R2 Score:  0.9756601306058993
Testing R2 Score:  0.8630344061161902
RMSE for Training Data:  1874.3162742501434
RMSE for Testing Data:  4611.2617595002885
Accuracy for 20 - Fold Cross Predicted:  0.8357709319139357


### XGBoost

In [32]:
import xgboost as xgb

In [33]:
# 'reg:linear' is the deprecated name of the squared-error regression objective;
# 'reg:squarederror' is its current spelling and optimizes the same loss.
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror',n_estimators = 10, seed = 123,verbosity=0)

# Fitting the model
xgb_r.fit(x_train, y_train)

# Predict on the held-out test features
pred = xgb_r.predict(x_test)

In [34]:
# Print train/test metrics and the cross-validated score for the XGBoost model.
model_summary(xgb_r,"Xg_boost")

Xg_boost
Training R2 Score:  0.9144815870355608
Testing R2 Score:  0.8705620299518245
RMSE for Training Data:  3513.2865305995406
RMSE for Testing Data:  4482.753723569274
Accuracy for 20 - Fold Cross Predicted:  0.8477346692178661


### Conclusion: XGBoost gave the best test-set result, with an R2 score of 87.06% and a root mean square error of about 4483