In [2]:
import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the training split from its CSV file and preview the first five rows.
train_csv_path = '/content/drive/MyDrive/Medical_Insurance/Train_Data.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head(5)

Unnamed: 0,age,sex,bmi,smoker,region,children,charges
0,21.0,male,25.745,no,northeast,2,3279.86855
1,36.976978,female,25.744165,yes,southeast,3,21454.494239
2,18.0,male,30.03,no,southeast,1,1720.3537
3,37.0,male,30.676891,no,northeast,3,6801.437542
4,58.0,male,32.01,no,southeast,1,11946.6259


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training frame.
summary_stats = train_df.describe()
summary_stats

Unnamed: 0,age,bmi,children,charges
count,3630.0,3630.0,3630.0,3630.0
mean,38.887036,30.629652,2.503581,12784.808644
std,12.151029,5.441307,1.712568,10746.166743
min,18.0,15.96,0.0,1121.8739
25%,29.0,26.694526,1.0,5654.818262
50%,39.170922,30.2,3.0,9443.807222
75%,48.343281,34.1,4.0,14680.407505
max,64.0,53.13,5.0,63770.42801


In [5]:
# Count missing values per column (isna is the pandas alias of isnull).
train_df.isna().sum()

age         0
sex         0
bmi         0
smoker      0
region      0
children    0
charges     0
dtype: int64

In [6]:
# Pairwise correlation of the numeric columns only.
# At this point train_df still holds the raw string columns (sex, smoker,
# region); calling corr() on them raises on pandas >= 2.0, so they are
# excluded explicitly. The matrix is computed once and the stored result is
# displayed, instead of calling corr() a second time.
correlation = train_df.select_dtypes(include='number').corr()
correlation

Unnamed: 0,age,bmi,children,charges
age,1.0,0.143527,-0.061076,0.299692
bmi,0.143527,1.0,-0.041996,0.211325
children,-0.061076,-0.041996,1.0,-0.075089
charges,0.299692,0.211325,-0.075089,1.0


In [7]:
# Regression target: the 'charges' column of the training frame.
Y = train_df.charges
Y

0        3279.868550
1       21454.494239
2        1720.353700
3        6801.437542
4       11946.625900
            ...     
3625    10987.324964
3626    11735.844352
3627    10602.385000
3628     8976.140452
3629     7027.698968
Name: charges, Length: 3630, dtype: float64

In [8]:
# Random forest with 100 trees. random_state is fixed so that the bootstrap
# samples and feature sub-sampling (and therefore the fitted model and its
# predictions) are identical on every Restart & Run All.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)

In [9]:
# Single LabelEncoder instance shared by the cells below. Each fit_transform
# call re-fits it, so it only retains the classes of the column it was fitted
# on most recently.
label_encoder = LabelEncoder()

In [10]:
# Integer-encode the three categorical training columns. The shared encoder is
# re-fitted per column, in the order sex -> smoker -> region.
labels, labels1, labels2 = (
    label_encoder.fit_transform(train_df[column_name])
    for column_name in ('sex', 'smoker', 'region')
)

In [11]:
# Overwrite the categorical columns of train_df with their integer codes.
for column_name, encoded_values in (
    ('sex', labels),
    ('smoker', labels1),
    ('region', labels2),
):
    train_df[column_name] = encoded_values
train_df

Unnamed: 0,age,sex,bmi,smoker,region,children,charges
0,21.000000,1,25.745000,0,0,2,3279.868550
1,36.976978,0,25.744165,1,2,3,21454.494239
2,18.000000,1,30.030000,0,2,1,1720.353700
3,37.000000,1,30.676891,0,0,3,6801.437542
4,58.000000,1,32.010000,0,2,1,11946.625900
...,...,...,...,...,...,...,...
3625,48.820767,0,41.426984,0,1,4,10987.324964
3626,38.661977,0,26.202557,0,2,2,11735.844352
3627,56.000000,1,40.300000,0,3,0,10602.385000
3628,48.061207,0,34.930624,0,2,1,8976.140452


In [12]:
# Feature matrix: every column of train_df except the target.
X = train_df.drop(columns=['charges'])
X

Unnamed: 0,age,sex,bmi,smoker,region,children
0,21.000000,1,25.745000,0,0,2
1,36.976978,0,25.744165,1,2,3
2,18.000000,1,30.030000,0,2,1
3,37.000000,1,30.676891,0,0,3
4,58.000000,1,32.010000,0,2,1
...,...,...,...,...,...,...
3625,48.820767,0,41.426984,0,1,4
3626,38.661977,0,26.202557,0,2,2
3627,56.000000,1,40.300000,0,3,0
3628,48.061207,0,34.930624,0,2,1


In [13]:
# Fit the forest on the encoded training features and the charges target.
regressor.fit(X=X, y=Y)

RandomForestRegressor()

In [14]:
# Read the test split (same columns as the training file, without 'charges').
test_csv_path = '/content/drive/MyDrive/Medical_Insurance/Test_Data.csv'
test_data = pd.read_csv(test_csv_path)
test_data

Unnamed: 0,age,sex,bmi,smoker,region,children
0,40.000000,male,29.900000,no,southwest,2
1,47.000000,male,32.300000,no,southwest,1
2,54.000000,female,28.880000,no,northeast,2
3,37.000000,male,30.568094,no,northeast,3
4,59.130049,male,33.132854,yes,northeast,4
...,...,...,...,...,...,...
487,51.000000,male,27.740000,no,northeast,1
488,33.000000,male,42.400000,no,southwest,5
489,47.769999,male,29.064615,no,northeast,4
490,41.530738,female,24.260852,no,southeast,5


In [15]:
# Encode the test-set categoricals with the category -> integer mapping learned
# from the TRAINING data. Re-fitting a LabelEncoder on the test set derives the
# codes from whichever categories happen to appear there, so a category that is
# missing from (or unique to) the test file would silently shift the integers
# away from the ones the model was trained on.
categorical_columns = ['sex', 'smoker', 'region']

# train_df has already had these columns overwritten with integer codes, so the
# original category strings are re-read from the training file.
train_categories = pd.read_csv(
    '/content/drive/MyDrive/Medical_Insurance/Train_Data.csv',
    usecols=categorical_columns,
)

encoded_test = {}
for column_name in categorical_columns:
    if pd.api.types.is_numeric_dtype(test_data[column_name]):
        # Column is already encoded (cell re-run): keep the existing codes.
        encoded_test[column_name] = test_data[column_name].to_numpy()
        continue
    column_encoder = LabelEncoder().fit(train_categories[column_name])
    # transform() raises ValueError for a category never seen in training
    # rather than silently inventing a code for it.
    encoded_test[column_name] = column_encoder.transform(test_data[column_name])

# Keep the original variable names available for any later cells.
labels3 = encoded_test['sex']
labels4 = encoded_test['smoker']
labels5 = encoded_test['region']
test_data['sex'] = labels3
test_data['smoker'] = labels4
test_data['region'] = labels5
test_data

Unnamed: 0,age,sex,bmi,smoker,region,children
0,40.000000,1,29.900000,0,3,2
1,47.000000,1,32.300000,0,3,1
2,54.000000,0,28.880000,0,0,2
3,37.000000,1,30.568094,0,0,3
4,59.130049,1,33.132854,1,0,4
...,...,...,...,...,...,...
487,51.000000,1,27.740000,0,0,1
488,33.000000,1,42.400000,0,3,5
489,47.769999,1,29.064615,0,0,4
490,41.530738,0,24.260852,0,2,5


In [16]:
# Predict charges for the test rows. The feature columns are selected
# explicitly from X (the frame the model was fitted on) so the model always
# receives the same columns in the same order; a missing column fails loudly
# with a KeyError instead of producing misaligned predictions.
prediction = regressor.predict(test_data[X.columns])
prediction

array([ 7507.13965419,  8778.66662874, 12500.68999395,  6855.79904948,
       40849.57759727, 12621.93158455, 11284.50754514,  6983.87404951,
        2677.75375702, 14638.35874843,  6435.30265834,  4484.41336375,
        7986.00140326,  5222.59941813,  7812.88528995,  6706.26496332,
       13479.16933149, 15316.36891577, 15127.49053184,  7706.10227011,
       11442.43176501, 12717.21691094, 34915.35438402, 11665.01938328,
       10689.91567485, 11332.93601234, 21286.35670305, 12039.22661924,
       11053.40602518, 10030.93945208,  4671.91604092,  5406.74344945,
        4708.29169419, 10994.90272729,  3181.45873286,  6539.58450356,
       35407.73677552,  5061.01289318,  8813.39961653,  4900.92996923,
       23837.70054343,  6592.11221535, 18318.5728307 ,  9662.11244296,
        6396.27281124,  8646.79283851, 11626.41767428,  4875.03330598,
        9915.55476471, 40660.40080213,  9304.49735535,  4621.86206822,
       13051.14396418, 19972.83370388, 14024.58024181, 17089.58009115,
      

In [20]:
import pickle


In [22]:
# Persist the fitted regressor to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails; the previous
# bare open() call left the handle to be closed by garbage collection.
filename = 'medical_insurance.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)
