In [1]:
import pandas as pd #import pandas library to handle file, database

In [3]:
# Load the insurance records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="insurance_pre.csv")

In [5]:
# Display the freshly loaded frame to inspect its columns and values.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
# Data preprocessing: the text columns (nominal categories) cannot be fed to
# the model directly, so they are one-hot encoded. drop_first=True keeps one
# indicator column per two-level category instead of two redundant ones.
dataset = pd.get_dummies(
    data=dataset,
    drop_first=True,
)

In [9]:
# Re-display the frame to check the result of the one-hot encoding.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [11]:
# Names of every boolean-typed column (the indicator columns produced by the
# one-hot encoding step).
boolean_columns = dataset.select_dtypes(include='bool').columns
print(boolean_columns)

# Cast exactly those columns from True/False to 1/0; all other columns keep
# their dtype.
dataset = dataset.astype({column: int for column in boolean_columns})
print(dataset[boolean_columns])

Index(['sex_male', 'smoker_yes'], dtype='object')
      sex_male  smoker_yes
0            0           1
1            1           0
2            1           0
3            1           0
4            1           0
...        ...         ...
1333         1           0
1334         0           0
1335         0           0
1336         0           0
1337         0           1

[1338 rows x 2 columns]


In [13]:
# Show the frame again now that the indicator columns hold 0/1 integers.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [15]:
# List the column names available for choosing the features and the target.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [17]:
# Input features: every column except the target 'charges'.
independent = dataset.loc[
    :,
    ['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
]

In [19]:
# Inspect the feature frame.
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [21]:
# Output (target) data: 'charges', kept as a one-column DataFrame.
dependent = dataset.loc[:, ['charges']]

In [23]:
# Inspect the target frame.
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Hyper-parameter grid: 4 kernels x 3 values of C = 12 candidates.
param_grid = {
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
    'C': [10, 100, 1000],
}

# Cross-validated search scored by R^2; refit=True retrains the best
# candidate on all supplied rows so `grid` can be used directly for predict().
grid = GridSearchCV(SVR(), param_grid, refit=True, verbose=3, n_jobs=-1, scoring='r2')

# `dependent` is a one-column DataFrame (2-D). SVR expects a 1-D target, and
# passing the column form makes scikit-learn emit a DataConversionWarning on
# every fit, so flatten it explicitly.
# NOTE(review): the search is fitted on the whole dataset (no hold-out split)
# and the features are not scaled; SVR kernels are usually sensitive to
# feature scale - consider a train/test split and a StandardScaler pipeline.
grid.fit(independent, dependent.to_numpy().ravel())

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  y = column_or_1d(y, warn=True)


In [27]:
from sklearn.metrics import r2_score  # evaluation metric

# Keep the complete cross-validation log so it can be tabulated later.
re = grid.cv_results_

# Predict on the same rows the search was fitted on, so the R^2 below is an
# in-sample score rather than a hold-out estimate.
grid_pred = grid.predict(independent)
r_score = r2_score(dependent, grid_pred)

print("R score value is : ", r_score)
print("Best Score:", grid.best_score_)  # mean cross-validated score of the best candidate
print("Best Parameters:", grid.best_params_)

R score value is :  0.7179049746029832
Best Score: 0.6937782096990812
Best Parameters: {'C': 1000, 'kernel': 'linear'}


In [29]:
# Turn the cv_results_ dictionary into a DataFrame (one column per key).
table = pd.DataFrame(re)

In [31]:
# Show the cross-validation results table.
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.203583,0.033723,0.080742,0.021009,10,rbf,"{'C': 10, 'kernel': 'rbf'}",-0.110409,-0.097459,-0.074488,-0.099597,-0.124428,-0.101276,0.016459,5
1,0.153725,0.021848,0.031717,0.005358,10,poly,"{'C': 10, 'kernel': 'poly'}",-0.125153,-0.081505,-0.111907,-0.131602,-0.179328,-0.125899,0.031794,7
2,0.199867,0.024112,0.043394,0.010409,10,sigmoid,"{'C': 10, 'kernel': 'sigmoid'}",-0.116743,-0.119312,-0.085214,-0.10821,-0.108716,-0.107639,0.012029,6
3,0.150624,0.012957,0.025545,0.003558,10,linear,"{'C': 10, 'kernel': 'linear'}",-0.018943,0.029886,0.022167,0.003112,-0.066653,-0.006086,0.034672,3
4,0.150972,0.01105,0.070265,0.003646,100,rbf,"{'C': 100, 'kernel': 'rbf'}",-0.160182,-0.107253,-0.12343,-0.137753,-0.209551,-0.147634,0.035511,11
5,0.137178,0.010083,0.019428,0.0029,100,poly,"{'C': 100, 'kernel': 'poly'}",-0.132885,-0.082461,-0.12922,-0.139962,-0.189219,-0.134749,0.033948,8
6,0.187915,0.013813,0.049265,0.008499,100,sigmoid,"{'C': 100, 'kernel': 'sigmoid'}",-0.142455,-0.160831,-0.117544,-0.137984,-0.130402,-0.137843,0.014263,9
7,0.332562,0.017541,0.022879,0.003065,100,linear,"{'C': 100, 'kernel': 'linear'}",0.559313,0.521798,0.547691,0.538805,0.508518,0.535225,0.018122,2
8,0.187169,0.015307,0.076629,0.006968,1000,rbf,"{'C': 1000, 'kernel': 'rbf'}",-0.15543,-0.098267,-0.123724,-0.137137,-0.206405,-0.144193,0.036262,10
9,0.189333,0.010133,0.020283,0.001813,1000,poly,"{'C': 1000, 'kernel': 'poly'}",-0.080527,-0.028469,-0.073376,-0.086611,-0.134493,-0.080695,0.033795,4


In [33]:
def _ask_number(prompt, cast, is_valid, hint):
    """Prompt until the reply parses with `cast` and passes `is_valid`.

    prompt   -- text shown to the user (passed to input() unchanged)
    cast     -- int or float, used to parse the reply
    is_valid -- predicate applied to the parsed value
    hint     -- explanation printed when a reply is rejected

    Returns the parsed, validated value.
    """
    while True:
        reply = input(prompt)
        try:
            value = cast(reply)
        except ValueError:
            # Non-numeric reply: explain and ask again instead of crashing.
            print("Invalid input: " + hint)
            continue
        if is_valid(value):
            return value
        # Parsed but out of range (e.g. sex=7 would otherwise be passed
        # silently to the model).
        print("Invalid input: " + hint)


age = _ask_number("Enter the age:", int,
                  lambda v: v >= 0, "age must be a non-negative whole number")
bmi = _ask_number("Enter the BMI:", float,
                  lambda v: 0 < v < float("inf"), "BMI must be a positive number")
children = _ask_number("Enter the children number:", int,
                       lambda v: v >= 0, "children must be a non-negative whole number")
# The two flags below must match the 0/1 encoding of the sex_male and
# smoker_yes training columns.
sex = _ask_number("Enter sex (0 for female and 1 for male):", int,
                  lambda v: v in (0, 1), "enter 0 or 1")
smoker = _ask_number("Enter 0 for non smoker and 1 for smoker:", int,
                     lambda v: v in (0, 1), "enter 0 or 1")

Enter the age: 26
Enter the BMI: 20
Enter the children number: 0
Enter sex (0 for female and 1 for male): 0
Enter 0 for non smoker and 1 for smoker: 1


In [35]:
# Build a one-row frame carrying the same column names (and order) as the
# training features. The model was fitted on a named DataFrame; passing a bare
# nested list relies purely on positional order and makes scikit-learn warn
# about missing feature names.
new_customer = pd.DataFrame(
    [[age, bmi, children, sex, smoker]],
    columns=independent.columns,
)
Final_prediction = grid.predict(new_customer)
print("Final_prediction of SVR: {}".format(Final_prediction))

Final_prediction of SVR: [27591.58598477]




In [37]:
import pickle

fileName = "SVR_Grid_Insurance.sav"

# Persist the fitted search object. The context manager guarantees the file
# is flushed and closed even if pickling fails; the bare open() call left the
# handle open.
with open(fileName, 'wb') as model_file:
    pickle.dump(grid, model_file)

In [39]:
# Reload the saved model. NOTE: pickle.load can execute arbitrary code, so
# only load files that you created yourself or otherwise trust.
with open(fileName, 'rb') as model_file:
    load_model = pickle.load(model_file)

# Label the sample with the training column names so the values cannot be
# matched to the wrong features and scikit-learn does not warn about missing
# feature names.
sample = pd.DataFrame(
    [[30, 25.899, 2, 1, 1]],
    columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
)
result = load_model.predict(sample)
result



array([29226.75121968])