In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [7]:
#!pip install openpyxl

In [8]:
# Original Raw Data
df = pd.read_excel('insurance.xlsx')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [10]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [12]:
categorical = []
continuous = []
check = []

d_types = dict(df.dtypes)
for name, type in d_types.items():
    if str(type) == 'object':
        categorical.append(name)
    elif str(type) == 'float64': 
        continuous.append(name)
    else:
        check.append(name)
        
print("Catrgorical Features ",categorical)
print("Continuous Features ",continuous) 
print("Features to check ",check) 

Catrgorical Features  ['sex', 'smoker', 'region']
Continuous Features  ['bmi', 'expenses']
Features to check  ['age', 'children']


In [13]:
d_types = dict(df.dtypes)
for name, type in d_types.items():
    if str(type) == 'object':
        print(f'<-------{name}------>')
        print(df[name].value_counts())

<-------sex------>
sex
male      676
female    662
Name: count, dtype: int64
<-------smoker------>
smoker
no     1064
yes     274
Name: count, dtype: int64
<-------region------>
region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64


In [15]:
#df.corr()

# Coorelation is only applied on the independednt continuous featues only

In [16]:
df['sex'].replace({'female':0 ,'male':1},inplace = True)
df['smoker'].replace({'no':0 ,'yes':1},inplace = True)
df.drop('region', axis=1,inplace= True) # For this data region have no significance on expenses 
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sex'].replace({'female':0 ,'male':1},inplace = True)
  df['sex'].replace({'female':0 ,'male':1},inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['smoker'].replace({'no':0 ,'yes':1},inplace = True)
  df['smoker'].replace({'no':0 ,'yes':1},inplace = True)


Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86


In [17]:
X = df[['age','sex','bmi','children','smoker']]
y = df['expenses']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
ridge_base = Ridge()
ridge_base.fit(X_train, y_train)

In [18]:
print(ridge_base.intercept_)
print(ridge_base.coef_)

-11431.817771192887
[  256.78609494   -36.40162004   305.15556733   442.41060886
 23493.95299705]


In [19]:
#Training data 

pred_train = ridge_base.predict(X_train)
mse = mean_squared_error(y_train, pred_train)
rmse = np.sqrt(mse)
r2 = r2_score(y_train,pred_train)

print("Root Mean Squared Error:", rmse.round().astype(int))
print("R-squared:", r2)

Root Mean Squared Error: 6160
R-squared: 0.7432963847740973


In [20]:
#Testing Data
pred_test = ridge_base.predict(X_test)
mse = mean_squared_error(y_test,pred_test)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,pred_test)

print("Root Mean Squared Error:", rmse.round().astype(int))
print("R-squared:", r2)

Root Mean Squared Error: 5634
R-squared: 0.7754580997793767


In [14]:
#Calculating cross validation matrics

scores = cross_val_score(ridge_base,X,y,cv=5)
print(scores)
print('Cross Validation :', scores.mean())


[0.76115169 0.70909184 0.77495574 0.73175189 0.75631062]
Cross Validation : 0.7466523551462287


<h1> ---------------------------------------------------------------------------------------</h1>


In [21]:
estimator = Ridge()
param_grid = {'alpha' : [0.1,0.2,0.5,0.7,1,10,40,49,50,51,60,100,1000]}
model_hp = GridSearchCV(estimator,param_grid,cv=5,scoring='neg_mean_squared_error')
model_hp.fit(X_train, y_train)
model_hp.best_params_

{'alpha': 0.1}

<h1> Rebuild the model with best alpha </h1>

In [22]:
ridge_best = Ridge(alpha=0.1)
ridge_best.fit(X_train, y_train)

print(ridge_best.intercept_)
print(ridge_best.coef_)

pred_train = ridge_best.predict(X_train)
pred_test = ridge_best.predict(X_test)

print('Train R2:', r2_score(y_train,pred_train))
print('Train R2:', r2_score(y_test,pred_test))
print('Cross Validation :', cross_val_score(ridge_best,X,y,cv=5).mean())
X_train.head()

-11456.794876086755
[  256.8498504    -44.63820232   305.23708037   441.52233136
 23614.23737882]
Train R2: 0.7433160460644062
Train R2: 0.7757479380251502
Cross Validation : 0.7466599257222319


Unnamed: 0,age,sex,bmi,children,smoker
333,56,0,28.8,0,0
644,43,1,35.3,2,0
1134,28,1,33.8,0,0
852,46,0,35.5,0,1
514,39,1,28.3,1,1


In [23]:
input_data = {'age':31,
              "sex": "female", 
              "bmi": 25.74,
              "children":0,
              "smoker": "no",
              "region":"northeast"}

In [25]:
df_test = pd.DataFrame(input_data,index = [0])
df_test.drop('region',axis=1,inplace =True)
df_test['sex'].replace({'female':0 ,'male':1},inplace = True)
df_test['smoker'].replace({'no':0 ,'yes':1},inplace = True)
transformed_data = df_test
ridge_best.predict(df_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['sex'].replace({'female':0 ,'male':1},inplace = True)
  df_test['sex'].replace({'female':0 ,'male':1},inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['smoker'].replace({'no':0 ,'yes':1},inplace = True)
  df_test['smoker'].replace({'no':0 ,'yes':1},

array([4362.35293501])