In [1]:
import pandas as pd #import pandas library to handle file, database

In [2]:
# Load the insurance CSV (path is relative to the notebook's working directory).
csv_path = "insurance_pre.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Bare last expression: the notebook renders the loaded frame as the cell output.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
# Pre-processing: categorical (nominal) columns cannot be handled as-is, so they
# are one-hot encoded into indicator columns. drop_first=True drops the first
# level of each category, leaving one indicator fewer than there are levels.
dataset = pd.get_dummies(data=dataset, drop_first=True)

dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [9]:
# Names of every bool-typed column in the encoded frame.
boolean_columns = dataset.select_dtypes(include='bool').columns
print(boolean_columns)

# Cast those columns to integers (True -> 1, False -> 0) and write them back.
as_integers = dataset[boolean_columns].astype(int)
dataset[boolean_columns] = as_integers
print(dataset[boolean_columns])

Index(['sex_male', 'smoker_yes'], dtype='object')
      sex_male  smoker_yes
0            0           1
1            1           0
2            1           0
3            1           0
4            1           0
...        ...         ...
1333         1           0
1334         0           0
1335         0           0
1336         0           0
1337         0           1

[1338 rows x 2 columns]


In [11]:
# Re-display the frame after the bool -> int conversion of the indicator columns.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [13]:
# List the column labels; the next cells pick features and target by these names.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [15]:
# Input (feature) columns: everything in the frame except `charges`.
independent = dataset.loc[:, ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [17]:
# Output (target) column, kept as a one-column DataFrame rather than a Series.
dependent = dataset['charges'].to_frame()

dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [19]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor  # gradient-boosted tree regressor

# Hyper-parameter grid: 2 * 2 * 3 * 3 * 2 = 72 candidate combinations.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'reg_alpha': [0, 0.1, 1],    # L1 regularisation strength
    'reg_lambda': [0, 1, 2],     # L2 regularisation strength
    'boosting_type': ['gbdt', 'goss']
}

# Exhaustive search scored by R^2 with 5-fold cross-validation (5 is also the
# default; spelled out here so the "360 fits" in the log is self-explanatory).
# refit=True retrains the best candidate on all rows, which is what lets the
# later cells call grid.predict(...) directly.
grid = GridSearchCV(
    LGBMRegressor(),
    param_grid,
    cv=5,
    refit=True,
    verbose=3,
    n_jobs=-1,
    scoring='r2',
)

# Regressors expect a 1-D target of shape (n_samples,). `dependent` is a
# one-column DataFrame, so squeeze it to a Series instead of handing over an
# (n_samples, 1) frame and relying on implicit flattening.
grid.fit(independent, dependent.squeeze(axis=1))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 1338, number of used features: 5
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 13270.422260


In [23]:
from sklearn.metrics import r2_score  # evaluation metric

# Keep the full cross-validation log; a later cell turns it into a table.
# NOTE(review): the name `re` would shadow the stdlib regex module if that is
# ever imported in this notebook.
re = grid.cv_results_

# These predictions are for the very rows the refit model was trained on, so
# the score below is an in-sample (training) R^2, not a held-out test score.
grid_pred = grid.predict(independent)

r_score = r2_score(dependent, grid_pred)
print("Training R2 (in-sample, optimistic):", r_score)

# Mean cross-validated R^2 of the winning candidate: the number to quote as an
# estimate of performance on unseen data.
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

R score value is :  0.9007425687250515
Best Score: 0.8513788449177522
Best Parameters: {'boosting_type': 'goss', 'learning_rate': 0.1, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 2}


In [25]:
import pandas as pd

# Take the feature names from the training frame itself so the labels cannot
# drift out of order relative to the columns the model was fitted on.
feature_names = list(independent.columns)

# Importances of the refit best model held inside the GridSearchCV wrapper.
# LightGBM's default importance_type is 'split' (how many times a feature is
# used in a split), which is why the values are integer counts; they are not
# gain-based, so a rarely-split feature can still be highly predictive.
importance_scores = grid.best_estimator_.feature_importances_

# One row per feature, most frequently used feature first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importance_scores})
    .sort_values(by='Importance', ascending=False)
)

# Display the result
print(importance_df)


      Feature  Importance
1         bmi         808
0         age         521
2    children         165
4  smoker_yes          59
3    sex_male          48


In [27]:
# Tabulate the cross-validation results dictionary: one row per candidate.
table = pd.DataFrame(re)

In [29]:
# Render the cross-validation results table (timings, parameters, per-split scores, rank).
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_learning_rate,param_n_estimators,param_reg_alpha,param_reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.513174,0.216826,0.009507,0.002974,gbdt,0.01,100,0.0,0,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",0.767177,0.715397,0.751766,0.744336,0.739562,0.743648,0.016940,55
1,2.343209,0.214204,0.006728,0.004738,gbdt,0.01,100,0.0,1,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",0.761075,0.709426,0.742530,0.739368,0.737166,0.737913,0.016565,61
2,1.979124,0.284294,0.006093,0.004429,gbdt,0.01,100,0.0,2,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",0.754982,0.706426,0.738895,0.736112,0.734053,0.734094,0.015678,67
3,1.904961,0.041130,0.005884,0.003926,gbdt,0.01,100,0.1,0,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",0.767177,0.715343,0.751739,0.744336,0.739562,0.743631,0.016955,57
4,1.750147,0.103119,0.002623,0.003916,gbdt,0.01,100,0.1,1,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",0.761075,0.709426,0.742530,0.739368,0.737166,0.737913,0.016565,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,2.067012,0.015657,0.009611,0.002892,goss,0.10,200,0.1,1,"{'boosting_type': 'goss', 'learning_rate': 0.1...",0.864931,0.789536,0.877093,0.836166,0.850094,0.843564,0.030322,11
68,2.007532,0.012432,0.005441,0.005008,goss,0.10,200,0.1,2,"{'boosting_type': 'goss', 'learning_rate': 0.1...",0.857505,0.788123,0.879444,0.832481,0.853904,0.842291,0.030910,18
69,2.031731,0.016869,0.004064,0.002427,goss,0.10,200,1.0,0,"{'boosting_type': 'goss', 'learning_rate': 0.1...",0.862091,0.786786,0.875726,0.834912,0.852008,0.842304,0.030788,17
70,1.877169,0.099358,0.007641,0.004127,goss,0.10,200,1.0,1,"{'boosting_type': 'goss', 'learning_rate': 0.1...",0.864931,0.789537,0.877093,0.838532,0.850512,0.844121,0.030238,10


In [31]:
def _read_value(prompt, cast, allowed=None):
    """Prompt once, convert the reply with ``cast`` and optionally validate it.

    Parameters:
        prompt: text shown to the user.
        cast: callable turning the typed string into a value (e.g. ``int``).
        allowed: optional collection of acceptable values.

    Returns:
        The converted value.

    Raises:
        ValueError: if the reply cannot be converted, or is not in ``allowed``.
    """
    value = cast(input(prompt))
    if allowed is not None and value not in allowed:
        raise ValueError("{!r} is not one of {}".format(value, sorted(allowed)))
    return value


age = _read_value("Enter the age:", int)
bmi = _read_value("Enter the BMI:", float)
children = _read_value("Enter the children number:", int)
# The model was trained on 0/1 indicator columns, so any other integer would be
# silently fed to it as a value it has never seen; reject it up front instead.
sex = _read_value("Enter sex (0 for female and 1 for male):", int, allowed={0, 1})
smoker = _read_value("Enter 0 for non smoker and 1 for smoker:", int, allowed={0, 1})

Enter the age: 26
Enter the BMI: 20
Enter the children number: 0
Enter sex (0 for female and 1 for male): 0
Enter 0 for non smoker and 1 for smoker: 1


In [33]:
# The model was fitted on a DataFrame with named columns. Passing a bare list
# of lists drops those names, so the value order is unchecked and versions that
# validate feature names emit a warning. Build a one-row frame with the
# training column names, in training order, instead.
new_record = pd.DataFrame(
    [[age, bmi, children, sex, smoker]],
    columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
)
Final_prediction = grid.predict(new_record)
print("Final_prediction of LGboost: {}".format(Final_prediction))

Final_prediction of LGboost: [15901.63413008]


In [35]:
# Save the fitted model (the GridSearchCV object in `grid`) with pickle.

import pickle

fileName = "LGboost_grid.sav"
# Use a context manager so the file is flushed and closed even if dump() raises;
# the original left the handle returned by open() dangling.
with open(fileName, "wb") as model_file:
    pickle.dump(grid, model_file)

In [37]:
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files that you created yourself or otherwise trust.
# The context manager closes the file handle once the model has been read.
with open("LGboost_grid.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# One-row frame with the training column names, in training order, so the
# sample lines up with the features the model was fitted on.
sample = pd.DataFrame(
    [[30, 30, 2, 1, 1]],  # alternative sample: 30, 25.899, 2, 1, 1
    columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
)
result = loaded_model.predict(sample)



In [39]:
# Show the prediction returned by the reloaded model.
result

array([21791.26319833])