In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the insurance dataset from the Excel workbook in the working directory.
df = pd.read_excel('insurance.xlsx')
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1338, 7)

In [4]:
# Per-column dtype and non-null count; used below to decide how each column is treated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Bucket every column of df by its dtype:
#   object   -> categorical
#   float64  -> continous (name kept as-is; the printed label matches it)
#   anything else (e.g. int64) -> check, i.e. needs a manual look to decide
#   whether it is a count/category or a continuous quantity.
categorical = []
continous = []
check = []

d_types = dict(df.dtypes)
# The loop variable is deliberately NOT called `type`: cell-level names are
# notebook globals, so shadowing the builtin would break every later type(...)
# call in the kernel.
for name, col_dtype in d_types.items():
    dtype_name = str(col_dtype)
    if dtype_name == 'object':
        categorical.append(name)
    elif dtype_name == 'float64':
        continous.append(name)
    else:
        check.append(name)

print('categorical features:', categorical)
print('continous features:', continous)
print('features to be checked:', check)

categorical features: ['sex', 'smoker', 'region']
continous features: ['bmi', 'expenses']
features to be checked: ['age', 'children']


In [6]:
# Print the frequency of each level for every object-dtype (categorical) column.
d_types = dict(df.dtypes)
# `col_dtype` rather than `type`: the loop variable lives in the notebook's
# global namespace, and rebinding `type` would shadow the builtin for all
# subsequent cells.
for name, col_dtype in d_types.items():
    if str(col_dtype) == 'object':
        print(f'<={name}=')
        print(df[name].value_counts())

<=sex=
sex
male      676
female    662
Name: count, dtype: int64
<=smoker=
smoker
no     1064
yes     274
Name: count, dtype: int64
<=region=
region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [8]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [9]:
# Remove the 'region' column from the working frame.
# The result is rebound instead of using inplace=True, and errors='ignore'
# makes the cell a no-op when 'region' is already gone, so re-running this
# cell no longer raises KeyError. `columns=` alone is sufficient; the extra
# axis=1 was redundant.
df = df.drop(columns=['region'], errors='ignore')
df

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,female,27.9,0,yes,16884.92
1,18,male,33.8,1,no,1725.55
2,28,male,33.0,3,no,4449.46
3,33,male,22.7,0,no,21984.47
4,32,male,28.9,0,no,3866.86
...,...,...,...,...,...,...
1333,50,male,31.0,3,no,10600.55
1334,18,female,31.9,0,no,2205.98
1335,18,female,36.9,0,no,1629.83
1336,21,female,25.8,0,no,2007.95


In [10]:
# Encode the two binary categoricals as 0/1 integers.
# The encoded Series is assigned back to the column rather than calling
# df[col].replace(..., inplace=True): that form mutates a temporary Series
# obtained through chained indexing, which leaves df unchanged under pandas
# Copy-on-Write (and only emits a FutureWarning before that, which this
# notebook's global warnings filter hides).
sex_codes = {'female': 0, 'male': 1}
smoker_codes = {'no': 0, 'yes': 1}

# .get(v, v) passes already-encoded values through untouched, so re-running the
# cell is safe; astype(int) fails loudly on any label missing from the mapping
# instead of letting a stray string reach the model.
df['sex'] = df['sex'].map(lambda v: sex_codes.get(v, v)).astype(int)

df['smoker'] = df['smoker'].map(lambda v: smoker_codes.get(v, v)).astype(int)

In [11]:
# Display the frame to inspect the current column values.
df

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86
...,...,...,...,...,...,...
1333,50,1,31.0,3,0,10600.55
1334,18,0,31.9,0,0,2205.98
1335,18,0,36.9,0,0,1629.83
1336,21,0,25.8,0,0,2007.95


In [12]:
# Pairwise correlation matrix of the columns (DataFrame.corr uses Pearson by default).
df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,expenses
age,1.0,-0.020856,0.109341,0.042469,-0.025019,0.299008
sex,-0.020856,1.0,0.04638,0.017163,0.076185,0.057292
bmi,0.109341,0.04638,1.0,0.012645,0.003968,0.198576
children,0.042469,0.017163,0.012645,1.0,0.007673,0.067998
smoker,-0.025019,0.076185,0.003968,0.007673,1.0,0.787251
expenses,0.299008,0.057292,0.198576,0.067998,0.787251,1.0


In [13]:
# Separate the target (expenses) from the predictor columns.
y = df['expenses']
x = df.drop(columns=['expenses'])

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=9,
)

In [15]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Baseline model: Ridge with scikit-learn's default settings, fitted on the
# training split only.
Ridge_base = Ridge()
Ridge_base.fit(x_train, y_train)

ypred_train = Ridge_base.predict(x_train)
ypred_test = Ridge_base.predict(x_test)

# R^2 on each split; a large train/test gap would indicate over-fitting.
train_r2 = Ridge_base.score(x_train, y_train)
test_r2 = Ridge_base.score(x_test, y_test)
print('train R2:', train_r2)
print('test R2:', test_r2)

# 5-fold cross-validation. Note this runs on the full x, y, so the held-out
# test rows also take part in the folds.
cv_scores = cross_val_score(Ridge_base, x, y, cv=5)
print('Cross Validation score:', cv_scores.mean())

train R2: 0.7432963847740974
test R2: 0.7754580997793767
Cross Validation score: 0.7466523551462286


**Applying Hyperparameter Tuning for Ridge Regression**

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths to search over.
param_grid = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 10, 50, 100]}

# Model whose hyper-parameter is being tuned.
estimator = Ridge()

# Exhaustive 5-fold search on the training split, ranking candidates by
# negative mean squared error (higher is better).
model_hp = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
model_hp.fit(x_train, y_train)

# Best alpha found within the grid above.
model_hp.best_params_

**Rebuilt Ridge Model Using Best Parameters**

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Refit Ridge with a fixed alpha.
# NOTE(review): 50 is presumably the value reported by the grid search above;
# confirm against model_hp.best_params_.
ridge_best = Ridge(alpha=50)
ridge_best.fit(x_train, y_train)

# Fitted parameters of the linear model.
print('Intercept:', ridge_best.intercept_)
print('Coefficients:', ridge_best.coef_)

# Predictions on both splits.
train_pred = ridge_best.predict(x_train)
test_pred = ridge_best.predict(x_test)

# Evaluation: R^2 on each split, then mean 5-fold CV score over the full x, y.
train_r2 = ridge_best.score(x_train, y_train)
test_r2 = ridge_best.score(x_test, y_test)
print('train R2:', train_r2)
print('test R2:', test_r2)

cv_scores = cross_val_score(ridge_best, x, y, cv=5)
print('Cross Validation score:', cv_scores.mean())

**FINAL MODEL**

In [18]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Final feature set: the predictors without 'sex'.
# The column is dropped by NAME with errors='ignore'. The previous positional
# form, x.drop(x.columns[[1]]), removed whatever happened to sit at index 1, so
# every re-run of this cell silently discarded one more feature (sex, then bmi,
# then children, ...).
# NOTE(review): 'sex' is presumably dropped because of its weak correlation
# with expenses in the matrix above; confirm.
x = x.drop(columns=['sex'], errors='ignore')
y = df['expenses']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9)

# Ridge with the same fixed regularisation strength as the tuned model.
ridge_best = Ridge(alpha=50)

ridge_best.fit(x_train, y_train)

print('Intercept:', ridge_best.intercept_)
print('Coefficients:', ridge_best.coef_)

# Predictions on both splits.
train_pred = ridge_best.predict(x_train)
test_pred = ridge_best.predict(x_test)

# Evaluation: R^2 per split, then mean 5-fold CV score over the full x, y.
print('train R2:', ridge_best.score(x_train, y_train))
print('test R2:', ridge_best.score(x_test, y_test))
print('Cross Validation score:', cross_val_score(ridge_best, x, y, cv=5).mean())


Intercept: -10240.85454646538
Coefficients: [  253.97519661   302.67591886   478.04044004 18418.1501861 ]
train R2: 0.7130147668808979
test R2: 0.7351273801232491
Cross Validation score: 0.7160408609571502


**Predict on New Data**

In [20]:
# Raw attributes of one new record, keyed by the same column names (and using
# the same category labels) as the original dataset.
input_data = dict(
    age=31,
    sex='female',
    bmi=25.74,
    children=0,
    smoker='no',
    region='northeast',
)

In [21]:
# Apply the training-time preprocessing to the single new record:
# drop 'region', encode sex/smoker as 0/1, then drop 'sex'.
sex_codes = {'female': 0, 'male': 1}
smoker_codes = {'no': 0, 'yes': 1}

df_test = pd.DataFrame(input_data, index=[0]).drop(columns=['region'])

# Assign the encoded Series back to the column. Calling
# df_test[col].replace(..., inplace=True) mutates a temporary object and leaves
# df_test unchanged under pandas Copy-on-Write, which would pass raw strings to
# the model. astype(int) raises on a label that is missing from the mapping
# (map yields NaN for it) instead of hiding the problem.
df_test['sex'] = df_test['sex'].map(sex_codes).astype(int)
df_test['smoker'] = df_test['smoker'].map(smoker_codes).astype(int)

# Drop 'sex' by name rather than by position so a change in the key order of
# input_data cannot remove the wrong feature.
transformed_data = df_test.drop(columns=['sex'])

**Predict**

In [23]:
# Predict expenses for the prepared record with the fitted ridge_best model.
ridge_best.predict(transformed_data)

array([5423.25469987])