#                                         ~*Linear Regression Model*~
__________________________________________________________________________________________________________
**********************************************************************************************************

## import all the required libraries

In [1]:
import numpy as np
import pandas as pd 

#visualization purpose libraries
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

#To avoide warnings
import warnings
warnings.filterwarnings('ignore')

## 1. Problem statements:-

### Aim--> ***!To predict the premium of Life Insurance!***

## 2. Data Gathering:-

In [2]:
df = pd.read_csv("medical_insurance.csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## 3. Exploratory Data Analysis  (EDA):-

### ****The goal of conducting EDA is to determine the characteristics of the dataset.***

In [5]:
df['age']

0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int64

In [6]:
df['age'].unique()

array([19, 18, 28, 33, 32, 31, 46, 37, 60, 25, 62, 23, 56, 27, 52, 30, 34,
       59, 63, 55, 22, 26, 35, 24, 41, 38, 36, 21, 48, 40, 58, 53, 43, 64,
       20, 61, 44, 57, 29, 45, 54, 49, 47, 51, 42, 50, 39], dtype=int64)

In [7]:
df['sex'].value_counts().to_dict()

{'male': 676, 'female': 662}

In [8]:
df['sex'].replace({'male': 1, 'female': 0},inplace=True)

In [9]:
sex_value = {'male': 1, 'female': 0}
sex_value

{'male': 1, 'female': 0}

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


In [11]:
df['bmi'].isna().sum()

0

In [12]:
df['bmi']

0       27.900
1       33.770
2       33.000
3       22.705
4       28.880
         ...  
1333    30.970
1334    31.920
1335    36.850
1336    25.800
1337    29.070
Name: bmi, Length: 1338, dtype: float64

In [13]:
df['children'].isna().sum()

0

In [14]:
df['smoker'].isna().sum()

0

In [15]:
df['smoker'].value_counts().to_dict()

{'no': 1064, 'yes': 274}

In [16]:
df['smoker'].replace({'no': 0, 'yes': 1},inplace=True)


In [17]:
smoker_value = {'no': 0, 'yes': 1}
smoker_value

{'no': 0, 'yes': 1}

In [18]:
df['region'].isna().sum()

0

In [19]:
df['region'].value_counts().to_dict()

{'southeast': 364, 'southwest': 325, 'northwest': 325, 'northeast': 324}

In [20]:
# we dont know the order of data--this is nominal data, so we go with ***OneHotEncoding***
#                                                                         --------------
#OneHotEncoding

df = pd.get_dummies(df,columns=['region'])
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


In [21]:
df['charges'].isna().sum()

0

In [22]:
df['charges']

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 68.1 KB


## Train_Test_Split

In [24]:
df = df.select_dtypes(exclude=object)

x = df.drop('charges',axis=1)
y = df['charges']
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, random_state=15)
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
1256,51,0,36.385,3,0,0,1,0,0
147,51,0,37.730,1,0,0,0,1,0
1042,20,1,30.685,0,1,1,0,0,0
889,57,1,33.630,1,0,0,1,0,0
650,49,0,42.680,2,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
1223,20,0,24.420,0,1,0,0,1,0
667,40,0,32.775,2,1,0,1,0,0
156,48,1,24.420,0,1,0,0,1,0
384,44,1,22.135,2,0,1,0,0,0


## Model Training

### ~To create instance of model~

In [25]:
knn_reg = KNeighborsRegressor()
knn_reg.fit(x_train,y_train)

##  Model Evaluation

In [28]:
#testing data

y_pred_test = knn_reg.predict(x_test)

mse = mean_squared_error(y_test,y_pred_test)
print('MSE-->',mse)

rmse = np.sqrt(mse)
print('RMSE-->',rmse)

mae = mean_absolute_error(y_test,y_pred_test)
print('MAE-->',mae)

r2 = r2_score(y_test,y_pred_test)
print('R2-->',r2)

MSE--> 115021593.58796217
RMSE--> 10724.812053736054
MAE--> 7426.788543011194
R2--> 0.11776018039085168


In [27]:
#training data

y_pred_train = knn_reg.predict(x_train)

mse = mean_squared_error(y_train,y_pred_train)
print('MSE-->',mse)

rmse = np.sqrt(mse)
print('RMSE-->',rmse)

mae = mean_absolute_error(y_train,y_pred_train)
print('MAE-->',mae)

r2 = r2_score(y_train,y_pred_train)
print('R2-->',r2)

MSE--> 84057084.94747229
RMSE--> 9168.265100196017
MAE--> 6214.676796250467
R2--> 0.4405288607623571


## To get best values for k and p we perform Hyperparametric Tunning

In [29]:
# Estimator
knn_reg = KNeighborsRegressor()

# param_grid
hyperparameters = {"n_neighbors" : np.arange(3,30),
                  "p": [1,2]}

gscv_knn_reg = GridSearchCV(knn_reg, hyperparameters, cv = 5)
gscv_knn_reg.fit(x_train, y_train)
gscv_knn_reg.best_estimator_

In [30]:
best_params = gscv_knn_reg.best_estimator_
best_params

In [31]:
# Testing Data Evaluation

# To get best model from hyperparameter tunning
knn_reg1 = gscv_knn_reg.best_estimator_

y_pred = knn_reg1.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2 = r2_score(y_test, y_pred)
print("R Squared :",r2)

MSE : 85059892.78459449
RMSE : 9222.792027612597
MAE : 6792.542663948622
R Squared : 0.3475727285167092


In [32]:
# Training Data Evaluation
knn_reg1 = gscv_knn_reg.best_estimator_

y_pred_train = knn_reg1.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2 = r2_score(y_train, y_pred_train)
print("R Squared :",r2)

MSE : 88167361.53999752
RMSE : 9389.747682445866
MAE : 6698.639759432063
R Squared : 0.4131714865536401


## Scaling 

### 1Normalization

In [33]:
normal_scalar = MinMaxScaler()
normal_scalar.fit(x_train)

In [35]:
#To get accuracy of Training dataset
array = normal_scalar.transform(x_train) #actual application the formaulae
array

x_train_normal = pd.DataFrame(array,columns=x_train.columns)

# Model Training
knn_reg = KNeighborsRegressor()  # n_neighbour = 5, P= 2
knn_reg.fit(x_train_normal, y_train)

y_pred_train = knn_reg.predict(x_train_normal)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2_value = r2_score(y_train, y_pred_train)
print("R-Score :",r2_value)


MSE : 23561967.424749333
RMSE : 4854.067101385119
MAE : 2914.835715678505
R-Score : 0.8431751378715739


In [36]:
# To get testing accuracy

array = normal_scalar.transform(x_test)

x_test_normal = pd.DataFrame(array, columns=x_test.columns)

y_pred = knn_reg.predict(x_test_normal)

mse = mean_squared_error(y_test, y_pred)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2_value = r2_score(y_test, y_pred)
print("R-Score :",r2_value)

MSE : 25392020.937125824
RMSE : 5039.049606535525
MAE : 3275.848285406716
R-Score : 0.8052378577597266


## standardization

In [37]:
std_scalar = StandardScaler()

std_scalar.fit(x_train)

In [38]:
# To get training accuracy

array = std_scalar.transform(x_train)  # Actual application the formalae
array

x_train_normal = pd.DataFrame(array, columns=x_train.columns)

# Model Training
knn_reg = KNeighborsRegressor()  # n_neighbour = 5, P= 2
knn_reg.fit(x_train_normal, y_train)

y_pred_train = knn_reg.predict(x_train_normal)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2_value = r2_score(y_train, y_pred_train)
print("R-Score :",r2_value)

MSE : 21448359.6342268
RMSE : 4631.237376147632
MAE : 2804.2308450373835
R-Score : 0.8572429890134992


In [39]:
# To get testing accuracy

array = std_scalar.transform(x_test)

x_test_normal = pd.DataFrame(array, columns=x_test.columns)

y_pred = knn_reg.predict(x_test_normal)

mse = mean_squared_error(y_test, y_pred)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2_value = r2_score(y_test, y_pred)
print("R-Score :",r2_value)

MSE : 23512693.44062964
RMSE : 4848.988909105654
MAE : 3152.097299317164
R-Score : 0.8196526950070244


## Testing on single row

In [40]:
column_names = x.columns
len(column_names)

9

In [41]:
# Testing on single row

column_names = x.columns
len(column_names)

x.head(1).T

Unnamed: 0,0
age,19.0
sex,0.0
bmi,27.9
children,0.0
smoker,1.0
region_northeast,0.0
region_northwest,0.0
region_southeast,0.0
region_southwest,1.0


In [42]:
x.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [43]:
age = 45
sex = 'male'
bmi = 22.3
children = 3
smoker = 'no'
region = 'southwest'

#to find premium charges'


In [44]:
print(sex_value)
print(smoker_value)
print(column_names)

{'male': 1, 'female': 0}
{'no': 0, 'yes': 1}
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')


In [45]:
project_data  = {'sex':sex_value, 'smoker':smoker_value, 'columns':list(column_names)}
project_data

{'sex': {'male': 1, 'female': 0},
 'smoker': {'no': 0, 'yes': 1},
 'columns': ['age',
  'sex',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [46]:
project_data['smoker'][smoker]

0

In [47]:
region = "region_" + region
region

'region_southwest'

In [48]:
region_index = np.where(column_names == region)[0][0]
region_index

8

In [49]:
array = np.zeros(len(column_names), dtype = int)
array

array([0, 0, 0, 0, 0, 0, 0, 0, 0])

In [50]:
array = np.zeros(x.shape[1])
array[0] = age
array[1] = project_data['sex'][sex]
array[2] = bmi
array[3] = children
array[4] = project_data['smoker'][smoker]

array[region_index] = 1

print(array)

[45.   1.  22.3  3.   0.   0.   0.   0.   1. ]


# Predicted charges

In [51]:
predicted_insurance_premium = knn_reg.predict([array])[0]
predicted_insurance_premium = np.around(predicted_insurance_premium,3)
print('Pridicted Insurance Premium: ',predicted_insurance_premium, 'Rs. Only')

Pridicted Insurance Premium:  13953.716 Rs. Only


In [52]:
predicted_insurance_premium = knn_reg1.predict([array])[0]
predicted_insurance_premium = np.around(predicted_insurance_premium,3)
print('Pridicted Insurance Premium: ',predicted_insurance_premium, 'Rs. Only')

Pridicted Insurance Premium:  9616.75 Rs. Only


## Create Pickel file

In [55]:
import pickle

with open("knn_model.pkl","wb") as f:
    pickle.dump(knn_reg,f)
    

In [56]:

with open("Scalar.pkl","wb") as f:
    pickle.dump(std_scalar,f)
    

In [57]:
import json

with open("project_data.json","w") as f:
    json.dump(project_data,f)

In [58]:
json_data = {'sex': {'female': 0, 'male': 1},
 'smoker': {'yes': 0, 'no': 1},
 'columns': ['age',
  'sex',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}
len(json_data['columns'])

9

### Thank you Satyajit sir!!!