In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

In [150]:
df=pd.read_csv('insurance.csv')

In [151]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [152]:
df.isnull().sum()

age         0
gender      0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [153]:
# dependent variable and independent variable
x = df.iloc[:,:-1]
y = df['charges']


apply label encoding on gender and smoker column

In [154]:
le = LabelEncoder()
x['smoker']=le.fit_transform(x['smoker'])
x['gender']=le.fit_transform(x['gender'])
x

Unnamed: 0,age,gender,bmi,children,smoker,region
0,19,0,27.900,0,1,southwest
1,18,1,33.770,1,0,southeast
2,28,1,33.000,3,0,southeast
3,33,1,22.705,0,0,northwest
4,32,1,28.880,0,0,northwest
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest
1334,18,0,31.920,0,0,northeast
1335,18,0,36.850,0,0,southeast
1336,21,0,25.800,0,0,southwest


apply one hot encoding on region

In [155]:
x=pd.get_dummies(x,columns=['region'])
x

Unnamed: 0,age,gender,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,0,1
1,18,1,33.770,1,0,0,0,1,0
2,28,1,33.000,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.880,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0,1,0,0
1334,18,0,31.920,0,0,1,0,0,0
1335,18,0,36.850,0,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,0,1


# scaling the data (by Min max scaler)

In [156]:

min_max = MinMaxScaler()      
scaled_data = min_max.fit_transform(x)
scaled_data                                 

array([[0.02173913, 0.        , 0.3212268 , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 1.        , 0.47914985, ..., 0.        , 1.        ,
        0.        ],
       [0.2173913 , 1.        , 0.45843422, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.56201238, ..., 0.        , 1.        ,
        0.        ],
       [0.06521739, 0.        , 0.26472962, ..., 0.        , 0.        ,
        1.        ],
       [0.93478261, 0.        , 0.35270379, ..., 1.        , 0.        ,
        0.        ]])

In [157]:
min_max = pd.DataFrame(scaled_data,columns=x.columns)
min_max

Unnamed: 0,age,gender,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.0,0.321227,0.0,1.0,0.0,0.0,0.0,1.0
1,0.000000,1.0,0.479150,0.2,0.0,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.458434,0.6,0.0,0.0,0.0,1.0,0.0
3,0.326087,1.0,0.181464,0.0,0.0,0.0,1.0,0.0,0.0
4,0.304348,1.0,0.347592,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1333,0.695652,1.0,0.403820,0.6,0.0,0.0,1.0,0.0,0.0
1334,0.000000,0.0,0.429379,0.0,0.0,1.0,0.0,0.0,0.0
1335,0.000000,0.0,0.562012,0.0,0.0,0.0,0.0,1.0,0.0
1336,0.065217,0.0,0.264730,0.0,0.0,0.0,0.0,0.0,1.0


# Standard scaler



In [158]:
sc = StandardScaler()
stand_scale = sc.fit_transform(x)
stand_scale

array([[-1.43876426, -1.0105187 , -0.45332   , ..., -0.56641788,
        -0.61132367,  1.76548098],
       [-1.50996545,  0.98959079,  0.5096211 , ..., -0.56641788,
         1.63579466, -0.56641788],
       [-0.79795355,  0.98959079,  0.38330685, ..., -0.56641788,
         1.63579466, -0.56641788],
       ...,
       [-1.50996545, -1.0105187 ,  1.0148781 , ..., -0.56641788,
         1.63579466, -0.56641788],
       [-1.29636188, -1.0105187 , -0.79781341, ..., -0.56641788,
        -0.61132367,  1.76548098],
       [ 1.55168573, -1.0105187 , -0.26138796, ...,  1.76548098,
        -0.61132367, -0.56641788]])

In [159]:
standard_scale = pd.DataFrame(stand_scale,columns=x.columns)
standard_scale

Unnamed: 0,age,gender,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.438764,-1.010519,-0.453320,-0.908614,1.970587,-0.565267,-0.566418,-0.611324,1.765481
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,-0.565267,-0.566418,1.635795,-0.566418
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,-0.565267,-0.566418,1.635795,-0.566418
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,-0.565267,1.765481,-0.611324,-0.566418
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,-0.565267,1.765481,-0.611324,-0.566418
...,...,...,...,...,...,...,...,...,...
1333,0.768473,0.989591,0.050297,1.580926,-0.507463,-0.565267,1.765481,-0.611324,-0.566418
1334,-1.509965,-1.010519,0.206139,-0.908614,-0.507463,1.769076,-0.566418,-0.611324,-0.566418
1335,-1.509965,-1.010519,1.014878,-0.908614,-0.507463,-0.565267,-0.566418,1.635795,-0.566418
1336,-1.296362,-1.010519,-0.797813,-0.908614,-0.507463,-0.565267,-0.566418,-0.611324,1.765481


# Training  Testing data by standard scaler


In [160]:

x_train,x_test,y_train,y_test = train_test_split(standard_scale,y,test_size=0.2,random_state=0)

1. regression problem (linear regression model)

In [161]:
lr = LinearRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)

0.7999332385873761

In [162]:
#or
y_pred=lr.predict(x_test)
# metrices to evaluate model
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

3937.912823368771
0.7999332385873761


2. Support_Vector_Machine Training 

In [163]:
from sklearn.svm import SVR
regressor = SVR(C=5000,kernel="poly")
regressor.fit(x_train,y_train)
y_pred = regressor.predict(x_test)
print(r2_score(y_test,y_pred))

0.8589159873902013


3. Decision Tree

In [167]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=10,min_samples_split=20)
regressor.fit(x_train,y_train)
y_pred = regressor.predict(x_test)
print(r2_score(y_test,y_pred))

0.8290653750696646


# **Training testing data by minmax scaler**

In [168]:
x_train,x_test,y_train,y_test = train_test_split(min_max,y,test_size=0.2,random_state=0)

1. regression problem (linear regression model)


In [169]:
lr.fit(x_train,y_train)
pred_y = lr.predict(x_test)

lr.score(x_test,y_test)

0.7999876970680433

2.Support_Vector_Machine Training 

In [170]:
from sklearn.svm import SVR
regressor = SVR(C=5000,kernel="poly")
regressor.fit(x_train,y_train)
y_pred = regressor.predict(x_test)
print(r2_score(y_test,y_pred))

0.8718086143332975


3. Decision Tree

In [171]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=10,min_samples_split=20)
regressor.fit(x_train,y_train)
y_pred = regressor.predict(x_test)
print(r2_score(y_test,y_pred))

0.8290653750696646
