In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [2]:
# Read the raw insurance records from the Excel workbook and preview the first rows.
df = pd.read_excel(io='insurance.xlsx')
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# List the distinct values that occur in the `region` column.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [5]:
# Bucket every column of `df` by its dtype name:
#   'object'  -> categorical
#   'float64' -> continuous
#   otherwise -> check (columns that need a manual decision, e.g. integers)
categorical = []
continuous = []
check = []

d_types = dict(df.dtypes)
# The loop variable is named `dtype`, not `type`: at notebook top level a loop
# variable stays bound after the loop, so `type` would shadow the builtin
# `type()` for every cell executed afterwards.
for name, dtype in d_types.items():
    dtype_name = str(dtype)
    if dtype_name == 'object':
        categorical.append(name)
    elif dtype_name == 'float64':
        continuous.append(name)
    else:
        check.append(name)

print("Catrgorical Features ",categorical)
print("Continuous Features ",continuous) 
print("Features to check ",check) 

Catrgorical Features  ['sex', 'smoker', 'region']
Continuous Features  ['bmi', 'expenses']
Features to check  ['age', 'children']


In [6]:
# For every object-typed column, print a banner with its name followed by the
# frequency of each distinct value.
d_types = dict(df.dtypes)
# `dtype` (not `type`) so the builtin `type()` is not shadowed in later cells.
for name, dtype in d_types.items():
    if str(dtype) == 'object':
        print(f'<-------{name}------>')
        print(df[name].value_counts())

<-------sex------>
male      676
female    662
Name: sex, dtype: int64
<-------smoker------>
no     1064
yes     274
Name: smoker, dtype: int64
<-------region------>
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [7]:
# Pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because `df` still contains
# string columns (sex, smoker, region) at this point: older pandas silently
# dropped them, but pandas >= 2.0 raises when asked to correlate them.
df.select_dtypes(include='number').corr()

Unnamed: 0,age,bmi,children,expenses
age,1.0,0.109341,0.042469,0.299008
bmi,0.109341,1.0,0.012645,0.198576
children,0.042469,0.012645,1.0,0.067998
expenses,0.299008,0.198576,0.067998,1.0


# Correlation is only computed on the numeric columns; the categorical columns (sex, smoker, region) are left out

In [8]:
# Encode the two binary categoricals as 0/1 integers.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: in-place mutation through a column
# selection is chained assignment, which pandas deprecates and which no longer
# updates `df` once Copy-on-Write is enabled.
# `.astype('int64')` keeps the columns numeric regardless of how `replace`
# infers the result dtype; re-running the cell is a no-op.
df['sex'] = df['sex'].replace({'female': 0, 'male': 1}).astype('int64')
df['smoker'] = df['smoker'].replace({'no': 0, 'yes': 1}).astype('int64')

# `region` is excluded from the model.
# NOTE(review): the original comment states region has no significance on
# expenses; nothing in this notebook demonstrates that - confirm before relying on it.
# errors='ignore' lets the cell be re-run after the column is already gone.
df = df.drop(columns='region', errors='ignore')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86


In [9]:
# Feature matrix (five predictors) and target vector.
X = df.loc[:, ['age', 'sex', 'bmi', 'children', 'smoker']]
y = df.loc[:, 'expenses']

# 80/20 hold-out split; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9
)

# Baseline ElasticNet with default hyper-parameters. `fit` returns the
# estimator itself, so it can be chained and then displayed.
elastic_base = ElasticNet().fit(X_train, y_train)
elastic_base

ElasticNet()

In [10]:
# Show the fitted intercept, then the coefficient vector, one per line.
print(elastic_base.intercept_, elastic_base.coef_, sep='\n')

-7253.04198889761
[ 247.37103137  385.36890172  293.67139989  456.14967235 5862.5715928 ]


In [11]:
# Baseline model: goodness of fit on the training split.

pred_train = elastic_base.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=pred_train)
mse = mean_squared_error(y_true=y_train, y_pred=pred_train)
rmse = np.sqrt(mse)  # back to the units of `expenses`

# RMSE is reported rounded to the nearest whole number.
print("Root Mean Squared Error:", rmse.round().astype(int))
print("R-squared:", r2)

Root Mean Squared Error: 9484
R-squared: 0.39155822533558726


In [12]:
# Baseline model: goodness of fit on the held-out test split.
pred_test = elastic_base.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=pred_test)
mse = mean_squared_error(y_true=y_test, y_pred=pred_test)
rmse = np.sqrt(mse)  # back to the units of `expenses`

# RMSE is reported rounded to the nearest whole number.
print("Root Mean Squared Error:", rmse.round().astype(int))
print("R-squared:", r2)

Root Mean Squared Error: 9233
R-squared: 0.3970277641347352


In [13]:
# 5-fold cross-validation of the baseline model on the full data set.
# No `scoring` is passed, so the estimator's own `score` method is used.

scores = cross_val_score(estimator=elastic_base, X=X, y=y, cv=5)
print(scores)
print('Cross Validation :', np.mean(scores))


[0.39737873 0.39716117 0.38048702 0.40410627 0.36549203]
Cross Validation : 0.3889250431216654


<h1> ---------------------------------------------------------------------------------------</h1>

# Finding the best alpha and l1_ratio (ElasticNet's penalty strength and L1/L2 mixing ratio)


In [14]:
# Exhaustive 5-fold grid search over the two ElasticNet hyper-parameters,
# ranked by negative mean squared error on the training split.
# NOTE(review): the recorded best parameters (alpha=10, l1_ratio=1) are the
# largest values in each list, so the optimum may lie outside this grid.
estimator = ElasticNet()
param_grid = {
    'alpha': [0.1, 0.2, 1, 2, 3, 4, 5, 10],
    "l1_ratio": [0.1, 0.5, 0.75, 0.9, 0.95, 1],
}
model_hp = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, y_train)  # fit returns the search object itself
model_hp.best_params_

{'alpha': 10, 'l1_ratio': 1}

<h1>Rebuild the model with the best alpha and l1_ratio</h1>

In [15]:
# Refit ElasticNet on the training split with the hyper-parameters chosen by
# the grid search. They are read from `model_hp.best_params_` rather than
# typed in by hand, so this cell cannot drift out of sync with the search.
elastic_best = ElasticNet(**model_hp.best_params_)
elastic_best.fit(X_train, y_train)

print(elastic_best.intercept_)
print(elastic_best.coef_)

pred_train = elastic_best.predict(X_train)
pred_test = elastic_best.predict(X_test)

# The second line reports the held-out split, so it is labelled 'Test R2'
# (it was previously mislabelled 'Train R2').
print('Train R2:', r2_score(y_train,pred_train))
print('Test R2:', r2_score(y_test,pred_test))
print('Cross Validation :', cross_val_score(elastic_best,X,y,cv=5).mean())
X_train.head()

-11449.28756082979
[ 2.56838444e+02 -6.43158858e-01  3.04860929e+02  4.34656692e+02
  2.35631810e+04]
Train R2: 0.7433083585849637
Train R2: 0.7755411716841649
Cross Validation : 0.7467299170217538


Unnamed: 0,age,sex,bmi,children,smoker
333,56,0,28.8,0,0
644,43,1,35.3,2,0
1134,28,1,33.8,0,0
852,46,0,35.5,0,1
514,39,1,28.3,1,1


In [16]:
# A single example record in the raw (un-encoded) column layout, used below
# to demonstrate a prediction.
input_data = dict(
    age=31,
    sex="female",
    bmi=25.74,
    children=0,
    smoker="no",
    region="northeast",
)

In [17]:
# Turn the raw input record into a one-row frame and apply the same
# preprocessing as the training data: drop `region`, encode sex/smoker as 0/1.
df_test = pd.DataFrame(input_data, index=[0]).drop(columns='region')

# Assign the encoded columns back instead of `df_test[col].replace(..., inplace=True)`:
# in-place mutation through a column selection is chained assignment, which
# pandas deprecates and which does not update the frame under Copy-on-Write.
# `.astype('int64')` fails loudly if a value is not covered by the mapping.
df_test['sex'] = df_test['sex'].replace({'female': 0, 'male': 1}).astype('int64')
df_test['smoker'] = df_test['smoker'].replace({'no': 0, 'yes': 1}).astype('int64')

transformed_data = df_test  # alias kept for any later cell that refers to it
elastic_best.predict(df_test)

array([4359.82451623])