In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API removals will surface only as hard errors after an upgrade.
warnings.filterwarnings('ignore')


In [None]:
# Read the insurance CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell works on.
csv_path = r'../input/insurance-premium-prediction/insurance.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

In [None]:
# Count the distinct values in each column.
data.nunique()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the columns listed in `var` with integer codes.
# A separate encoder is fitted and stored per column: refitting one shared
# encoder would leave only the last column's mapping available, making it
# impossible to decode the other columns or to encode new rows the same way.
var = ['sex','smoker','region']
encoders = {}
for col in var:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le

In [None]:
# Separate the prediction target ('expenses') from the feature columns.
y = data['expenses']
X = data.drop(columns='expenses')

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

def fit_lgb(trial, x_train, y_train, x_test, y_test):
    """Fit one LightGBM regressor using hyperparameters sampled from ``trial``.

    Parameters
    ----------
    trial
        Object providing ``suggest_float`` and ``suggest_int``
        (presumably an Optuna trial -- Optuna is not imported in the
        visible cells, TODO confirm).
    x_train, y_train
        Training features and target.
    x_test, y_test
        Hold-out features and target. They serve both as the early-stopping
        evaluation set and as the data for the reported validation score.

    Returns
    -------
    tuple
        ``(model, log)`` where ``model`` is the fitted ``LGBMRegressor`` and
        ``log`` is a dict with the keys ``"train rmse"`` and ``"valid rmse"``.
    """
    # Imported here because the notebook's import cell only pulls in
    # LGBMRegressor from lightgbm.
    from lightgbm import early_stopping

    params = {
        "metric": "RMSE",
        "boosting_type": "gbdt",
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        # and matches the sibling parameters below.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    model = LGBMRegressor(**params, random_state=2021)
    # Early stopping is configured through a callback: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() no longer
    # exist in LightGBM 4.x.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        callbacks=[early_stopping(stopping_rounds=150, verbose=False)],
    )

    # Predictions are floored at 0.1 before scoring.
    y_train_pred = np.clip(model.predict(x_train), 0.1, None)
    y_test_pred = np.clip(model.predict(x_test), 0.1, None)

    # RMSE is computed as sqrt(MSE) because the `squared=False` shortcut was
    # removed from newer scikit-learn releases.
    log = {
        "train rmse": np.sqrt(mean_squared_error(y_train, y_train_pred)),
        "valid rmse": np.sqrt(mean_squared_error(y_test, y_test_pred)),
    }

    return model, log

In [None]:
def objective(trial):
    """Return the validation RMSE of one ``fit_lgb`` run for ``trial``.

    The notebook-level ``X`` / ``y`` are split 70/30 with a fixed seed so that
    every trial is scored on the same hold-out rows; with an unseeded split
    the scores of different trials would not be comparable or reproducible.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=2021
    )
    _, log = fit_lgb(trial, x_train, y_train, x_test, y_test)
    return log['valid rmse']

In [None]:
# Fixed LightGBM hyperparameters handed to `cross_val` further down.
# NOTE(review): the keys match the search space in `fit_lgb`, so these are
# presumably the best values from a tuning run -- confirm the provenance.
lgb_params = dict(
    learning_rate=0.4616962246009375,
    lambda_l1=1.0056418409225514e-08,
    lambda_l2=0.0034886051949242197,
    num_leaves=56,
    feature_fraction=0.9947122992121118,
    bagging_fraction=0.8304993202512568,
    bagging_freq=7,
    min_child_samples=38,
)

In [None]:
from sklearn.model_selection import KFold
def cross_val(data, target, model, params, fit_params=None):
    """Run seeded 10-fold cross-validation and print the RMSE of each fold.

    Parameters
    ----------
    data, target
        Features and target; both are indexed positionally with ``.iloc``.
    model
        Estimator class. It is instantiated per fold as
        ``model(**params, random_state=2021)`` and its ``fit`` must accept
        ``eval_set`` plus whatever is in ``fit_params``.
    params
        Keyword arguments for the estimator constructor.
    fit_params
        Extra keyword arguments for ``fit``. Defaults to
        ``{"early_stopping_rounds": 400, "verbose": False}``. Estimators whose
        ``fit`` does not take those keywords (e.g. LightGBM 4.x, which expects
        ``callbacks=[...]`` instead) need an explicit ``fit_params``.

    Returns
    -------
    The estimator fitted on the LAST fold only.
    NOTE(review): that model has seen 9/10 of the data and is not refit on the
    full set; the fold used for scoring is also the early-stopping eval set,
    so the printed errors are optimistic.
    """
    if fit_params is None:
        fit_params = {"early_stopping_rounds": 400, "verbose": False}

    kf = KFold(n_splits=10, shuffle=True, random_state=2021)
    for fold, (train_idx, test_idx) in enumerate(kf.split(data, target)):
        print(f"Fold: {fold}")
        x_train, y_train = data.iloc[train_idx], target.iloc[train_idx]
        x_test, y_test = data.iloc[test_idx], target.iloc[test_idx]

        alg = model(**params, random_state=2021)
        alg.fit(x_train, y_train, eval_set=[(x_test, y_test)], **fit_params)

        pred = alg.predict(x_test)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        error = np.sqrt(mean_squared_error(y_test, pred))
        # The value is the ROOT mean squared error, so label it as such.
        print(f" root_mean_squared_error: {error}")
        print("-"*50)

    return alg

In [None]:
# Cross-validate LightGBM with the fixed hyperparameters; keep the model that
# `cross_val` returns.
lgb_model = cross_val(
    data=X,
    target=y,
    model=LGBMRegressor,
    params=lgb_params,
)

In [None]:
import pickle
# Save the model to disk. The `with` block guarantees the file handle is
# flushed and closed, even if pickling raises.
filename = 'finalized_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lgb_model, model_file)