In [4]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [5]:
# Original Raw Data
df = pd.read_excel('insurance.xlsx')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [8]:
categorical = []
continuous = []
check = []

d_types = dict(df.dtypes)
for name, type in d_types.items():
    if str(type) == 'object':
        categorical.append(name)
    elif str(type) == 'float64': 
        continuous.append(name)
    else:
        check.append(name)
        
print("Categorical Features ",categorical)
print("Continuous Features ",continuous) 
print("Features to check ",check) 

Categorical Features  ['sex', 'smoker', 'region']
Continuous Features  ['bmi', 'expenses']
Features to check  ['age', 'children']


In [9]:
d_types = dict(df.dtypes)
for name, type in d_types.items():
    if str(type) == 'object':
        print(f'<-------{name}------>')
        print(df[name].value_counts())

<-------sex------>
male      676
female    662
Name: sex, dtype: int64
<-------smoker------>
no     1064
yes     274
Name: smoker, dtype: int64
<-------region------>
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [10]:
df.corr()

Unnamed: 0,age,bmi,children,expenses
age,1.0,0.109341,0.042469,0.299008
bmi,0.109341,1.0,0.012645,0.198576
children,0.042469,0.012645,1.0,0.067998
expenses,0.299008,0.198576,0.067998,1.0


# Coorelation is only applied on the independednt continuous featues only

In [11]:
df['sex'].replace({'female':0 ,'male':1},inplace = True)
df['smoker'].replace({'no':0 ,'yes':1},inplace = True)
df.drop('region', axis=1,inplace= True) # For this data rfegion have no significance  
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86


In [12]:
X = df[['age','sex','bmi','children','smoker']]
y = df['expenses']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
lasso_base = Lasso()
lasso_base.fit(X_train, y_train)

Lasso()

In [13]:
print(lasso_base.intercept_)
print(lasso_base.coef_)

-11458.636514554386
[  256.85733518   -41.04872399   305.20721964   440.74488212
 23621.22826088]


In [14]:
#Training data 

pred_train = lasso_base.predict(X_train)
mse = mean_squared_error(y_train, pred_train)
rmse = np.sqrt(mse)
r2 = r2_score(y_train,pred_train)

print("Root Mean Squared Error:", rmse.round().astype(int))
print("R-squared:", r2)

Root Mean Squared Error: 6160
R-squared: 0.7433161675495346


In [15]:
#Testing Data
pred_test = lasso_base.predict(X_test)
mse = mean_squared_error(y_test,pred_test)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,pred_test)

print("Root Mean Squared Error:", rmse.round().astype(int))
print("R-squared:", r2)

Root Mean Squared Error: 5631
R-squared: 0.7757553862206219


In [16]:
#Calculating cross validation matrics

scores = cross_val_score(lasso_base,X,y,cv=5)
print(scores)
print('Cross Validation :', scores.mean())


[0.76099933 0.70877445 0.77558184 0.73142165 0.75655517]
Cross Validation : 0.7466664879428851


<h1> ---------------------------------------------------------------------------------------</h1>


In [17]:
estimator = Lasso()
param_grid = {'alpha' : [0.1,0.2,0.5,0.7,1,10,40,49,50,51,60,100,1000]}
model_hp = GridSearchCV(estimator,param_grid,cv=5,scoring='neg_mean_squared_error')
model_hp.fit(X_train, y_train)
model_hp.best_params_

{'alpha': 50}

<h1> Rebuild the model with best alpha </h1>

In [19]:
lasso_best = Lasso(alpha=50)
lasso_best.fit(X_train, y_train)

print(lasso_best.intercept_)
print(lasso_best.coef_)

pred_train = lasso_best.predict(X_train)
pred_test = lasso_best.predict(X_test)

print('Train R2:', r2_score(y_train,pred_train))
print('Train R2:', r2_score(y_test,pred_test))
print('Cross Validation :', cross_val_score(lasso_best,X,y,cv=5).mean())
X_train.head()

-11336.591469036648
[  256.68842008    -0.           303.91129506   408.88291866
 23323.3694514 ]
Train R2: 0.743200050042114
Train R2: 0.7747832866417829
Cross Validation : 0.7468453212720682


Unnamed: 0,age,sex,bmi,children,smoker
333,56,0,28.8,0,0
644,43,1,35.3,2,0
1134,28,1,33.8,0,0
852,46,0,35.5,0,1
514,39,1,28.3,1,1


In [20]:
input_data = {'age':31,
              "sex": "female", 
              "bmi": 25.74,
              "children":0,
              "smoker": "no",
              "region":"northeast"}

In [21]:
df_test = pd.DataFrame(input_data,index = [0])
df_test.drop('region',axis=1,inplace =True)
df_test['sex'].replace({'female':0 ,'male':1},inplace = True)
df_test['smoker'].replace({'no':0 ,'yes':1},inplace = True)
#transformed_data = df_test.drop(df_test.columns[[4]],axis=1)
df_test
#transformed_data
lasso_best.predict(df_test)

array([4443.42628845])

<h1> Final Model with best alpha and dropping unimportant columns </h1>

In [22]:
X1 = df[['age','bmi','children','smoker']]
y1 = df['expenses']
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=9)

#Modelling
lasso_best1 = Lasso(alpha=50)
lasso_best1.fit(X1_train, y1_train)

#prediction 
pred_train1 = lasso_best1.predict(X1_train)
pred_test1 = lasso_best1.predict(X1_test)

#Evaluation 
print('Train R2:', r2_score(y1_train,pred_train1))
print('Train R2:', r2_score(y1_test,pred_test1))
print('Cross Validation :', cross_val_score(lasso_best1,X,y,cv=5).mean())

Train R2: 0.743200076877043
Train R2: 0.7747836916908192
Cross Validation : 0.7468453212720682


In [23]:
input_data = {'age':31,
              "sex": "female", 
              "bmi": 25.74,
              "children":0,
              "smoker": "no",
              "region":"northeast"}

In [24]:
df_test = pd.DataFrame(input_data,index = [0])
df_test.drop('sex',axis=1,inplace =True)
df_test.drop('region',axis=1,inplace =True)
df_test['smoker'].replace({'no':0 ,'yes':1},inplace = True)
#transformed_data = df_test.drop(df_test.columns[[4]],axis=1)
df_test
#transformed_data
lasso_best1.predict(df_test)

array([4443.03401097])