In [1]:
import pandas as pd #import pandas library to handle file, database

In [2]:
dataset = pd.read_csv("insurance_pre.csv") #read the file data and store it in dataset variable

In [5]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
#data preprocessing - because categorial data can't be handle so we modify the data into numerical data using one hot encoding(Nominal)
dataset = pd.get_dummies(dataset, drop_first=True)

In [9]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [11]:
boolean_columns = dataset.select_dtypes(include=['bool']).columns #get boolean type column data from the dataset and store it in boolean_columns variable
print(boolean_columns)

dataset[boolean_columns] = dataset[boolean_columns].astype(int) #convert boolean column dataset to 0's and 1's
print(dataset[boolean_columns])

Index(['sex_male', 'smoker_yes'], dtype='object')
      sex_male  smoker_yes
0            0           1
1            1           0
2            1           0
3            1           0
4            1           0
...        ...         ...
1333         1           0
1334         0           0
1335         0           0
1336         0           0
1337         0           1

[1338 rows x 2 columns]


In [13]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [15]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [17]:
#split input data
independent = dataset [['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [19]:
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [21]:
dependent = dataset [['charges']] #split output data

In [23]:
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [25]:
from sklearn.model_selection import train_test_split #split the training and testing dataset

X_train,X_test,y_train,y_test = train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [27]:
from sklearn.preprocessing import StandardScaler

sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [29]:
# Get mean and standard deviation
mean_values = sc.mean_  # Mean of the features
std_values = sc.scale_  # Standard deviation of the features

print("Mean:", mean_values)
print("Standard Deviation:", std_values)


Mean: [39.47115385 30.75236645  1.08119658  0.48824786  0.20299145]
Standard Deviation: [14.0050823   6.23225555  1.20355113  0.49986187  0.40222621]


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor #Create a model from training set

param_grid = {'criterion' : ['squared_error','friedman_mse','absolute_error','poisson'], 'n_estimators' : [10,50,100]
             , 'max_features' : [None,'sqrt']}

grid = GridSearchCV(RandomForestRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1, scoring='r2')

grid.fit(X_train, y_train)

#grid.fit(independent, dependent)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  return fit_method(estimator, *args, **kwargs)


In [35]:
re = grid.cv_results_

#print("CV_RESULTS : ", re)

"""y_pred = grid.predict(X_test) #test the model with test data

from sklearn.metrics import r2_score #Evaluation metrics

r_score = r2_score(y_test, y_pred)
print("R score value is : ", r_score) """

print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best Score: 0.8115882573743256
Best Parameters: {'criterion': 'poisson', 'max_features': 'sqrt', 'n_estimators': 100}


In [37]:
table = pd.DataFrame.from_dict(re) #convert dictonary format to DataFrame(table)

In [39]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.092995,0.022321,0.006001,0.003463,squared_error,,10,"{'criterion': 'squared_error', 'max_features':...",0.850792,0.780558,0.799501,0.798791,0.761744,0.798277,0.029693,16
1,0.345861,0.023076,0.016219,0.008806,squared_error,,50,"{'criterion': 'squared_error', 'max_features':...",0.860555,0.758459,0.807892,0.801398,0.771321,0.799925,0.035438,14
2,0.736943,0.15746,0.01996,0.005208,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.86544,0.769019,0.809756,0.806837,0.766566,0.803524,0.035888,11
3,0.105442,0.04496,0.002597,0.002369,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.83095,0.769002,0.788117,0.806672,0.758556,0.790659,0.026023,23
4,0.289148,0.054279,0.012146,0.002112,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.861567,0.788638,0.802567,0.827425,0.763533,0.808746,0.033543,7
5,0.47052,0.052469,0.020755,0.001173,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.865804,0.786256,0.80588,0.819831,0.766307,0.808816,0.033745,6
6,0.082887,0.024452,0.006789,0.003847,friedman_mse,,10,"{'criterion': 'friedman_mse', 'max_features': ...",0.835574,0.739412,0.807126,0.818752,0.77352,0.794877,0.034377,21
7,0.299448,0.009041,0.012481,0.002857,friedman_mse,,50,"{'criterion': 'friedman_mse', 'max_features': ...",0.865362,0.771796,0.809963,0.805763,0.767764,0.804129,0.035083,10
8,0.640706,0.080417,0.022206,0.009725,friedman_mse,,100,"{'criterion': 'friedman_mse', 'max_features': ...",0.8692,0.763278,0.811671,0.798232,0.769754,0.802427,0.037847,12
9,0.139242,0.124629,0.008259,0.00413,friedman_mse,sqrt,10,"{'criterion': 'friedman_mse', 'max_features': ...",0.845233,0.791649,0.792685,0.804303,0.750239,0.796822,0.030376,17


In [43]:
y_pred = grid.predict(X_test)

In [45]:
from sklearn.metrics import r2_score #Evaluation metrics

r_score = r2_score(y_test, y_pred)
r_score

0.8669379573581681

In [47]:
import pickle

fileName = "RF_Grid_WithSplitData_Insurance.sav"

pickle.dump(grid, open(fileName,'wb'))

In [49]:
load_model = pickle.load(open(fileName,'rb'))

result = load_model.predict([[30,25.899,2,1,1]])
result

array([47793.5675514])