# Predicting health insurance premium using Random Forest Regressor

### importing libraries and loading the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw string literal: the Windows path contains "\d" and "\I", which are not
# valid escape sequences in an ordinary string literal.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the cell runs on other machines.
df = pd.read_csv(r"D:\ds\Insurance Premium.csv")
df.head()

Unnamed: 0,ID,Age,Gender,BMI,Children,Smoker,Region,Premium
0,1,19,female,27.9,0,yes,south,16885
1,2,18,male,33.77,1,no,east,1726
2,3,28,male,33.0,3,no,east,4449
3,4,33,male,22.705,0,no,west,21984
4,5,32,male,28.88,0,no,west,3867


### gathering information about the data:

In [3]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        1338 non-null   int64  
 1   Age       1338 non-null   int64  
 2   Gender    1338 non-null   object 
 3   BMI       1338 non-null   float64
 4   Children  1338 non-null   int64  
 5   Smoker    1338 non-null   object 
 6   Region    1338 non-null   object 
 7   Premium   1338 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 83.8+ KB


In [4]:
# Number of missing values in each column.
df.isna().sum()

ID          0
Age         0
Gender      0
BMI         0
Children    0
Smoker      0
Region      0
Premium     0
dtype: int64

In [8]:
# (rows, columns) of the loaded dataset.
df.shape

(1338, 8)

In [42]:
# Per-column dtypes.
# NOTE(review): the saved output lists Gender/Smoker/Region as int64, while
# df.info() above reports them as object. This cell (execution count 42) appears
# to have been run after the encoding cell further down; on a fresh
# top-to-bottom run it would presumably show object for those columns here.
df.dtypes

ID            int64
Age           int64
Gender        int64
BMI         float64
Children      int64
Smoker        int64
Region        int64
Premium       int64
dtype: object

In [9]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,ID,Age,BMI,Children,Premium
count,1338.0,1338.0,1338.0,1338.0,1338.0
mean,669.5,39.207025,30.663397,1.094918,13270.414798
std,386.391641,14.04996,6.098187,1.205493,12110.012882
min,1.0,18.0,15.96,0.0,1122.0
25%,335.25,27.0,26.29625,0.0,4740.0
50%,669.5,39.0,30.4,1.0,9382.0
75%,1003.75,51.0,34.69375,2.0,16640.0
max,1338.0,64.0,53.13,5.0,63770.0


In [10]:
# How many rows fall into each Gender category.
df['Gender'].value_counts()

male      676
female    662
Name: Gender, dtype: int64

In [11]:
# How many rows fall into each Smoker category.
df['Smoker'].value_counts()

no     1064
yes     274
Name: Smoker, dtype: int64

In [12]:
# How many rows fall into each Region category.
df['Region'].value_counts()

east     364
south    325
west     325
north    324
Name: Region, dtype: int64

In [13]:
# Distribution of the number of children per row.
df['Children'].value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: Children, dtype: int64

### encoding categorical features like gender, region, smoker 

In [14]:
# Integer code for every label of each categorical column. The same codes are
# what predict_output() asks the user to type in.
category_codes = {
    'Region': {'north': 0, 'east': 1, 'west': 2, 'south': 3},
    'Smoker': {'no': 0, 'yes': 1},
    'Gender': {'male': 0, 'female': 1},
}
# Apply all three per-column mappings to df in place.
df.replace(category_codes, inplace=True)

In [15]:
# Preview the first rows to confirm the categorical columns now hold integer codes.
df.head()

Unnamed: 0,ID,Age,Gender,BMI,Children,Smoker,Region,Premium
0,1,19,1,27.9,0,1,3,16885
1,2,18,0,33.77,1,0,1,1726
2,3,28,0,33.0,3,0,1,4449
3,4,33,0,22.705,0,0,2,21984
4,5,32,0,28.88,0,0,2,3867


### declaring 'X' as independent variables/features and 'y' as the target variable

In [16]:
# Target variable: the Premium column the model is trained to predict.
y = df['Premium']
y.shape

(1338,)

In [17]:
# Feature matrix: every column except the row identifier and the target.
X = df.drop(columns=['ID', 'Premium'])
X.shape

(1338, 6)

In [32]:
# Preview the feature matrix.
# NOTE(review): the saved output already shows standardised Age/BMI values even
# though the scaling cells come later in the notebook. This cell (execution
# count 32) appears to have been run out of order; on a fresh top-to-bottom run
# it would presumably show the raw Age/BMI values here.
X.head()

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region
0,-1.438764,1,-0.45332,0,1,3
1,-1.509965,0,0.509621,1,0,1
2,-0.797954,0,0.383307,3,0,1
3,-0.441948,0,-1.305531,0,0,2
4,-0.513149,0,-0.292556,0,0,2


### Standardization of the dataset

In [20]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the two continuous columns and transform them in one step.
# `sc` is kept at notebook level because predict_output() reuses it on new input.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in the notebook, so test rows contribute to the mean and
# standard deviation.
sc = StandardScaler()
X_standardised = sc.fit_transform(df[['Age', 'BMI']])
X_standardised

array([[-1.43876426, -0.45332   ],
       [-1.50996545,  0.5096211 ],
       [-0.79795355,  0.38330685],
       ...,
       [-1.50996545,  1.0148781 ],
       [-1.29636188, -0.79781341],
       [ 1.55168573, -0.26138796]])

In [22]:
# Write the scaled Age/BMI values back into the feature matrix.
# index=X.index: assigning columns from a DataFrame aligns rows by index label,
# so the new frame must carry X's own index. With the default RangeIndex of the
# new frame, any row of X whose label has no match would silently become NaN.
X[['Age', 'BMI']] = pd.DataFrame(X_standardised, columns=['Age', 'BMI'], index=X.index)
X.head()

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region
0,-1.438764,1,-0.45332,0,1,3
1,-1.509965,0,0.509621,1,0,1
2,-0.797954,0,0.383307,3,0,1
3,-0.441948,0,-1.305531,0,0,2
4,-0.513149,0,-0.292556,0,0,2


### splitting the data into training and testing sets

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

### importing Random Forest Regressor and training the model

In [25]:
from sklearn.ensemble import RandomForestRegressor
# Regressor with all other settings left at their defaults; random_state is
# fixed so repeated runs build the same forest.
random_forest= RandomForestRegressor(random_state=2)

In [26]:
# Train the forest on the training split only.
random_forest.fit(X_train, y_train)

### getting prediction

In [27]:
# Predict premiums for the held-out test features; the shape confirms one
# prediction per test row.
y_pred= random_forest.predict(X_test)
y_pred.shape

(268,)

In [28]:
# Should match y_pred.shape so predictions and true values can be compared element-wise.
y_test.shape

(268,)

In [29]:
# Display the predicted premiums for the test rows.
# NOTE(review): this prints the whole array; a slice such as y_pred[:10] would
# keep the notebook output short.
y_pred

array([ 4375.86, 13059.23, 14976.69,  2831.32, 10249.91,  8120.04,
        3132.92,  2905.56, 21748.64,  7972.09, 12404.82,  8725.27,
       18396.9 ,  1633.77, 11141.67, 14476.1 ,  3428.54,  7306.3 ,
       19956.47,  3509.31, 13390.39,  2501.54, 39574.82, 21081.34,
       38257.2 , 11046.16,  6992.16,  9963.99,  5531.43,  3663.99,
        8083.13,  8286.36,  7357.82,  4488.42, 10148.62,  9272.22,
       38570.93,  4815.61, 18364.8 , 14538.25,  2088.58, 35694.84,
        8063.53,  1964.5 ,  8117.61,  7788.41, 10810.41, 10631.21,
        7772.99, 10648.14,  7107.26,  1331.18, 17568.25, 44061.14,
        6701.66, 12878.78,  6862.83,  9910.69,  2361.14, 40368.63,
       17495.45,  2480.41,  4543.31, 42530.96,  1300.02, 13075.54,
       12248.2 ,  5594.13, 13950.84,  8926.84, 12409.58, 14848.09,
       10411.42,  6595.9 , 14255.88,  9671.07,  1972.12,  1753.91,
        2625.79, 14284.31, 15485.07, 11259.67,  2030.78, 17631.32,
       12647.92,  3703.98, 46819.07, 14010.27, 10495.36,  7415

### evaluating our model

In [31]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.8356213559505736

### it's your turn!

In [69]:
import warnings
warnings.filterwarnings('ignore')

In [76]:
def predict_output(model=None, scaler=None):
    """Ask for one person's details on stdin and return the predicted premium.

    The answers are assembled into a one-row DataFrame whose column names and
    order match the training features (Age, Gender, BMI, Children, Smoker,
    Region). Age and BMI are scaled with the fitted scaler, and the row is
    then passed to the model. Named columns are used because both the scaler
    and the model were fitted on DataFrames; a bare NumPy array makes
    scikit-learn warn about missing feature names.

    Parameters
    ----------
    model : object with ``predict(frame)``, optional
        Defaults to the notebook-level ``random_forest``.
    scaler : object with ``transform(frame)``, optional
        Defaults to the notebook-level ``sc`` (fitted on Age and BMI).

    Returns
    -------
    The first element of ``model.predict(...)`` for the entered row.

    Raises
    ------
    ValueError
        If an answer is not numeric, if age or BMI is not positive, if the
        number of children is negative, or if a coded answer (gender, smoker,
        region) is not one of the codes listed in its prompt.
    """
    # Look up the notebook-level objects at call time so the defaults always
    # refer to whatever is currently fitted.
    if model is None:
        model = random_forest
    if scaler is None:
        scaler = sc

    age = float(input('enter your age '))
    gen = float(input('press 0 for male, 1 for female '))
    bmi = float(input('enter your bmi '))
    children = float(input('how many children do you have? '))
    smoke = float(input('press 0 if you do not smoke, 1 if you do '))
    reg = float(input('press 0 if reside in north, 1 if in east, 2 if in west, and 3 if in south '))

    # `not x > 0` (rather than `x <= 0`) also rejects NaN.
    if not age > 0:
        raise ValueError(f'age must be positive, got {age}')
    if not bmi > 0:
        raise ValueError(f'bmi must be positive, got {bmi}')
    if not children >= 0:
        raise ValueError(f'number of children cannot be negative, got {children}')
    # The model only ever saw these integer codes for the categorical columns.
    for label, value, allowed in (
        ('gender', gen, (0, 1)),
        ('smoker', smoke, (0, 1)),
        ('region', reg, (0, 1, 2, 3)),
    ):
        if value not in allowed:
            raise ValueError(f'{label} must be one of {allowed}, got {value}')

    X_new = pd.DataFrame(
        [[age, gen, bmi, children, smoke, reg]],
        columns=['Age', 'Gender', 'BMI', 'Children', 'Smoker', 'Region'],
    )

    # Scale by column name instead of by position, mirroring how the scaler was fitted.
    scaled = np.asarray(scaler.transform(X_new[['Age', 'BMI']]))
    X_new['Age'] = scaled[:, 0]
    X_new['BMI'] = scaled[:, 1]

    print('your premium: ')
    return model.predict(X_new)[0]
    
    

In [77]:
# Interactive demo: prompts for the six features on stdin and displays the predicted premium.
predict_output()

enter your age 19
press 0 for male, 1 for female 1
enter your bmi 22
how many children do you have? 0
press 0 if you do not smoke, 1 if you do 0
press 0 if reside in north, 1 if in east, 2 if in west, and 3 if in south 0
your premium: 


2614.97