In [1]:
import pandas as pd
import lightgbm as lgb


In [2]:
Dataset=pd.read_csv("insurance_pre.csv")
Dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
Dataset.columns


Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [4]:
# Here categorical data(state) is available in the dataset which is converted to numerical data 
# As the categorical data available is nominal, one hot coding method is used to convert it into numerical data
Dataset=pd.get_dummies(Dataset,dtype=int,drop_first=True)
Dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [5]:
Dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [6]:
#Input and output split
independent=Dataset[['age', 'bmi', 'children','sex_male','smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [7]:
Dependent=Dataset[["charges"]]
Dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [8]:
from sklearn.model_selection import GridSearchCV
param_grid={'boosting':['gbdt','dart','rf'],
            'n_estimators':[100,200,300,400,500],
            'num_leaves':[10,40],
            'max_depth':[3,5],
            'early_stopping_round':[50,100],
            'bagging_freq':[1,2],
            'bagging_fraction':[0.9,0.8,0.7,0.6,0.5],
            'metric':['rmse','L1','L2']}
grid=GridSearchCV(lgb.LGBMRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
grid.fit(independent,Dependent,eval_set=(independent,Dependent))

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 1338, number of used features: 5
[LightGBM] [Info] Start training from score 13270.422260






In [9]:
re=grid.cv_results_

In [10]:
table=pd.DataFrame.from_dict(re)

In [11]:
table.to_excel("grid_search_results_LGBM.xlsx",index=False)

In [37]:
grid_predict=grid.predict(independent)



In [39]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bagging_fraction,param_bagging_freq,param_boosting,param_early_stopping_round,param_max_depth,param_metric,...,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.464970,0.063391,0.005762,0.000451,0.9,1,gbdt,50,3,rmse,...,10,"{'bagging_fraction': 0.9, 'bagging_freq': 1, '...",0.882525,0.809391,0.892301,0.850707,0.861263,0.859237,0.028972,117
1,0.453169,0.074004,0.007909,0.003842,0.9,1,gbdt,50,3,rmse,...,40,"{'bagging_fraction': 0.9, 'bagging_freq': 1, '...",0.882525,0.809391,0.892301,0.850707,0.861263,0.859237,0.028972,117
2,0.524547,0.107727,0.004795,0.000545,0.9,1,gbdt,50,3,rmse,...,10,"{'bagging_fraction': 0.9, 'bagging_freq': 1, '...",0.875458,0.802813,0.885177,0.847763,0.856289,0.853500,0.028620,1640
3,0.401197,0.033002,0.005703,0.001564,0.9,1,gbdt,50,3,rmse,...,40,"{'bagging_fraction': 0.9, 'bagging_freq': 1, '...",0.875458,0.802813,0.885177,0.847763,0.856289,0.853500,0.028620,1640
4,0.514192,0.012701,0.004968,0.000732,0.9,1,gbdt,50,3,rmse,...,10,"{'bagging_fraction': 0.9, 'bagging_freq': 1, '...",0.871003,0.795548,0.879936,0.845660,0.852410,0.848911,0.029397,2615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,0.698382,0.158859,0.004447,0.000387,0.5,2,rf,100,5,L2,...,40,"{'bagging_fraction': 0.5, 'bagging_freq': 2, '...",0.882067,0.806193,0.887782,0.843222,0.866822,0.857217,0.029810,643
3596,0.440750,0.172222,0.004366,0.000378,0.5,2,rf,100,5,L2,...,10,"{'bagging_fraction': 0.5, 'bagging_freq': 2, '...",0.882291,0.807318,0.886010,0.842104,0.866364,0.856817,0.029185,797
3597,0.665257,0.145586,0.004201,0.000400,0.5,2,rf,100,5,L2,...,40,"{'bagging_fraction': 0.5, 'bagging_freq': 2, '...",0.882067,0.806193,0.887782,0.843222,0.866822,0.857217,0.029810,643
3598,0.435477,0.164469,0.004219,0.000508,0.5,2,rf,100,5,L2,...,10,"{'bagging_fraction': 0.5, 'bagging_freq': 2, '...",0.882291,0.807318,0.886010,0.842104,0.866364,0.856817,0.029185,797


In [13]:
# evaluating the model performance
from sklearn.metrics import r2_score
r_score=r2_score(Dependent,grid_predict)

In [14]:
print("The R_score value for Best parameter{}".format(grid.best_params_),r_score)

The R_score value for Best parameter{'bagging_fraction': 0.9, 'bagging_freq': 2, 'boosting': 'dart', 'early_stopping_round': 50, 'max_depth': 3, 'metric': 'rmse', 'n_estimators': 400, 'num_leaves': 10} 0.8839298937742036


In [15]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker yes 0 or 1:"))

Age: 46
BMI: 26
children: 3
Sex Male 0 or 1: 1
Smoker yes 0 or 1: 1


In [35]:
Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Prediction={}".format(Prediction))

Prediction=[24160.90526023]
