In [126]:
#!pip install lightgbm==3.3.5
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import lightgbm as lgb
 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping

In [127]:
# Load the insurance dataset from insurance.csv (path is relative to the working directory)
dataset = pd.read_csv("insurance.csv")
# Display the loaded DataFrame as a table to inspect its columns and rows
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [128]:
# One-hot encode the categorical columns (sex, smoker) into indicator columns.
# drop_first=True drops the first level of each category, leaving n-1 indicators
# for n levels (here: sex_male and smoker_yes).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to bool
# (True/False), while older versions produced integers, so this keeps the encoded
# table the same across pandas versions.
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [129]:
# Show the column labels of the encoded dataset (used below to pick features and target)
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [130]:
# Feature matrix: select the input columns (everything except the `charges` target)
independent = dataset.loc[:, ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [131]:
# Target: the `charges` column, kept as a one-column DataFrame rather than a Series
dependent = dataset.loc[:, ["charges"]]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [132]:
# Split into training and validation sets: 30% of the rows are held out for
# validation, and random_state=0 fixes the shuffle so the split is reproducible.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
X_train, X_val, Y_train, Y_val = train_test_split(
    independent, dependent, test_size=0.30, random_state=0
)
# Show the shapes of the four pieces as a sanity check
X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

((936, 5), (402, 5), (936, 1), (402, 1))

In [133]:
 #Standardize Features
 
# Fit the StandardScaler on the training features only, so the validation split
# does not influence the scaling statistics
scaler = StandardScaler().fit(X_train)
# Apply the same fitted transform to both the training and validation features
X_train, X_val = [scaler.transform(split) for split in (X_train, X_val)]

In [134]:
# Wrap the training features and labels in a LightGBM Dataset
train_data = lgb.Dataset(data=X_train, label=Y_train)

# Wrap the validation features and labels as well; reference=train_data ties this
# Dataset to the training one so both are evaluated consistently
test_data = lgb.Dataset(data=X_val, label=Y_val, reference=train_data)

In [135]:
# LightGBM training parameters: gradient-boosted decision trees for regression,
# evaluated with RMSE
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

In [136]:
# Upper bound on the number of boosting rounds; early stopping may end training sooner
num_round = 100
# Train the booster while monitoring the validation set. valid_sets is passed as a
# list, which is the documented form for lgb.train. Training stops once the
# validation metric has not improved for 10 consecutive rounds.
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 936, number of used features: 5
[LightGBM] [Info] Start training from score 13232.916456
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[62]	valid_0's rmse: 4266.99


In [137]:
# Predict charges for the (scaled) validation features with the trained booster,
# so the predictions can be compared against Y_val.
# num_iteration is spelled out as the booster's best iteration, which is also what
# Booster.predict falls back to when the argument is left unset.
y_pred = bst.predict(data=X_val, num_iteration=bst.best_iteration)


In [138]:
# Compute the R^2 score (coefficient of determination) of the predictions against Y_val
from sklearn.metrics import r2_score
r_score=r2_score(Y_val,y_pred)
# The closer R^2 is to 1, the better the fit (1.0 is a perfect fit; it can be negative for a poor model)
# On this run the validation R^2 is about 0.886
# NOTE(review): the same validation split drives early stopping, so this score may be optimistic — consider a separate test split
r_score

0.885826518053616