In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [4]:
df.shape

(1338, 7)

In [5]:
# df.drop_duplicates(inplace=True)

In [6]:
df.shape

(1338, 7)

In [7]:
df.rename(columns={'sex': 'gender'}, inplace=True)

In [8]:
df.dropna()

Unnamed: 0,age,gender,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [9]:
from sklearn.model_selection import train_test_split
X = df.drop(columns='expenses',axis=1)
y = df[['expenses']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
y_test

Unnamed: 0,expenses
764,9095.07
887,5272.18
890,29330.98
1293,9301.89
259,33750.29
...,...
109,47055.53
575,12222.90
535,6067.13
543,63770.43


In [11]:
Y_df = y_train.reset_index().drop('index', axis=1)
y_test = y_test.reset_index().drop('index', axis=1)

In [12]:
Y_df

Unnamed: 0,expenses
0,9193.84
1,8534.67
2,27117.99
3,8596.83
4,12475.35
...,...
1065,4561.19
1066,8582.30
1067,11931.13
1068,46113.51


In [13]:
X_df = X_train.reset_index().drop('index', axis=1)
X_test = X_test.reset_index().drop('index', axis=1)
X_df

Unnamed: 0,age,gender,bmi,children,smoker,region
0,46,female,20.0,2,no,northwest
1,47,female,24.3,0,no,northeast
2,52,female,24.9,0,no,southeast
3,39,female,34.3,5,no,southeast
4,54,female,21.5,3,no,northwest
...,...,...,...,...,...,...
1065,18,female,31.4,4,no,northeast
1066,39,female,23.9,5,no,southeast
1067,58,male,25.2,0,no,northeast
1068,37,female,47.6,2,yes,southwest


In [14]:
data_point = ['northwest', 'male', 'no', 46, 22.0]

In [15]:
X_df.select_dtypes(include='object')

Unnamed: 0,gender,smoker,region
0,female,no,northwest
1,female,no,northeast
2,female,no,southeast
3,female,no,southeast
4,female,no,northwest
...,...,...,...
1065,female,no,northeast
1066,female,no,southeast
1067,male,no,northeast
1068,female,yes,southwest


In [16]:
from sklearn.preprocessing import LabelEncoder, FunctionTransformer

region_column = X_df['region']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the data
encoded_gender = label_encoder.fit(region_column)

region_df = pd.DataFrame(encoded_gender.transform(region_column), columns=['region'])

In [17]:
region_df

Unnamed: 0,region
0,1
1,0
2,2
3,2
4,1
...,...
1065,0
1066,2
1067,0
1068,3


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Splitting the data into features and target
One_hot_df = X_df[['gender', 'smoker']]  
data = X_df[['age', 'bmi']]

# Define preprocessing steps for each type of column
categorical_features = ['gender', 'smoker']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])

num_features = ['age', 'bmi']
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create a ColumnTransformer to apply different preprocessing steps to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', num_transformer, num_features)
    ])

# Fit and transform the data
transformed_data = preprocessor.fit_transform(X_df[['gender', 'smoker', 'age', 'bmi']])

# Get the feature names after transformation
feature_names = preprocessor.get_feature_names_out()

# Convert the transformed data to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

transformed_df

Unnamed: 0,cat__gender_male,cat__smoker_yes,num__age,num__bmi
0,0.0,0.0,0.472227,-1.748572
1,0.0,0.0,0.543313,-1.036704
2,0.0,0.0,0.898745,-0.937373
3,0.0,0.0,-0.025379,0.618804
4,0.0,0.0,1.040918,-1.500246
...,...,...,...,...
1065,0.0,0.0,-1.518194,0.138707
1066,0.0,0.0,-0.025379,-1.102924
1067,1.0,0.0,1.325264,-0.887708
1068,0.0,1.0,-0.167551,2.820630


In [19]:
def label_encode_column(column):
    le = LabelEncoder()
    return le.fit_transform(column)


In [20]:
region_features = ['region']
region_transformer = Pipeline(steps=[
    ('label', FunctionTransformer(lambda x: label_encode_column(x['region']).reshape(-1, 1), validate=False))
])

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('region', region_transformer, region_features),
        ('cat', categorical_transformer, categorical_features),
        ('num', num_transformer, num_features)
    ])

In [27]:
transfor_dt = preprocessor.fit_transform(X_df[['region', 'gender', 'smoker', 'age', 'bmi']])
transfor_dt

array([[ 1.        ,  0.        ,  0.        ,  0.47222651, -1.74857248],
       [ 0.        ,  0.        ,  0.        ,  0.54331294, -1.03670397],
       [ 2.        ,  0.        ,  0.        ,  0.8987451 , -0.93737348],
       ...,
       [ 0.        ,  1.        ,  0.        ,  1.3252637 , -0.88770823],
       [ 3.        ,  0.        ,  1.        , -0.16755139,  2.82063006],
       [ 3.        ,  1.        ,  0.        ,  1.1120044 , -0.1096194 ]])

In [23]:
def get_feature_names(preprocessor, input_features):
    output_feature_names = []
    for name, transformer, features in preprocessor.transformers_:
        if name != 'remainder':
            if isinstance(transformer, Pipeline):
                # Get the actual transformer from the pipeline
                transformer = transformer.named_steps[transformer.steps[-1][0]]
            if isinstance(transformer, OneHotEncoder):
                # Get feature names for OneHotEncoder
                transformed_feature_names = transformer.get_feature_names_out(features)
                output_feature_names.extend(transformed_feature_names)
            elif isinstance(transformer, StandardScaler):
                # StandardScaler does not change feature names
                output_feature_names.extend(features)
            elif isinstance(transformer, FunctionTransformer):
                # FunctionTransformer should ideally not change feature names but we provide a generic name
                output_feature_names.extend(features)
    return output_feature_names

# Get the feature names
# feature_names = get_feature_names(preprocessor, X_df.columns)
# print("Feature Names: ", feature_names)

# Get the feature names
feature_names = get_feature_names(preprocessor, X_df[['region', 'gender', 'smoker', 'age', 'bmi']].columns)
print(feature_names)

['region', 'gender_male', 'smoker_yes', 'age', 'bmi']


## Single point transformation

In [25]:
single_data_point = {
    'region': ['north'],
    'gender': ['male'],
    'smoker': ['yes'],
    'age': [25],
    'bmi': [24.0]
}
single_df = pd.DataFrame(single_data_point)
tr=preprocessor.transform(single_df)
tr

array([[ 0.        ,  1.        ,  1.        , -1.02058858, -1.08636921]])

In [38]:
data_point = np.append(tr, 3)

In [28]:
transformed_df = pd.DataFrame(transfor_dt, columns=feature_names)
transformed_df

Unnamed: 0,region,gender_male,smoker_yes,age,bmi
0,1.0,0.0,0.0,0.472227,-1.748572
1,0.0,0.0,0.0,0.543313,-1.036704
2,2.0,0.0,0.0,0.898745,-0.937373
3,2.0,0.0,0.0,-0.025379,0.618804
4,1.0,0.0,0.0,1.040918,-1.500246
...,...,...,...,...,...
1065,0.0,0.0,0.0,-1.518194,0.138707
1066,2.0,0.0,0.0,-0.025379,-1.102924
1067,0.0,1.0,0.0,1.325264,-0.887708
1068,3.0,0.0,1.0,-0.167551,2.820630


In [26]:
preprocessor.transform(X_test[['region', 'gender', 'smoker', 'age', 'bmi']])

array([[ 0.        ,  0.        ,  0.        ,  0.40114007, -0.88770823],
       [ 1.        ,  0.        ,  0.        , -0.23863782, -0.09306431],
       [ 1.        ,  0.        ,  1.        ,  1.75178229, -0.60627185],
       ...,
       [ 0.        ,  1.        ,  0.        , -0.09646495, -0.42416595],
       [ 2.        ,  0.        ,  1.        ,  1.04091797,  2.7875199 ],
       [ 3.        ,  0.        ,  0.        ,  0.82765867,  0.60224912]])

In [29]:
import pickle
with open('preprocessor_pipeline.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

PicklingError: Can't pickle <function <lambda> at 0x0000028810D18400>: attribute lookup <lambda> on __main__ failed

In [29]:
cat_df = pd.concat([transformed_df, X_df[['children']]], axis=1)

In [30]:
cat_df

Unnamed: 0,region,gender_male,smoker_yes,age,bmi,children
0,1.0,0.0,0.0,0.472227,-1.748572,2
1,0.0,0.0,0.0,0.543313,-1.036704,0
2,2.0,0.0,0.0,0.898745,-0.937373,0
3,2.0,0.0,0.0,-0.025379,0.618804,5
4,1.0,0.0,0.0,1.040918,-1.500246,3
...,...,...,...,...,...,...
1065,0.0,0.0,0.0,-1.518194,0.138707,4
1066,2.0,0.0,0.0,-0.025379,-1.102924,5
1067,0.0,1.0,0.0,1.325264,-0.887708,0
1068,3.0,0.0,1.0,-0.167551,2.820630,2


In [25]:
cat_df.isna().sum()

cat__gender_male    0
cat__smoker_yes     0
num__age            0
num__bmi            0
region              0
children            0
dtype: int64

In [34]:
# Models import 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
# from sklearn.svm import SVR
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
# from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [35]:
linreg = LinearRegression()

In [37]:
linreg.fit(cat_df, Y_df)

In [42]:
data_point.reshape(1, -1)

array([[ 0.        ,  1.        ,  1.        , -1.02058858, -1.08636921,
         3.        ]])

In [43]:
linreg.predict(data_point.reshape(1, -1))



array([[27441.4352716]])

In [39]:
# Model initialization
linreg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
# net = ElasticNet(random_state=0)
polynomial_pipe = make_pipeline(PolynomialFeatures(), LinearRegression())
# svr = SVR()
# decisiontree = DecisionTreeRegressor(random_state=0)
randomforest = RandomForestRegressor(random_state=0)
gradientboost = GradientBoostingRegressor(random_state=0)
# xgbreg = xgb.XGBRegressor(verbosity=0)
# lgbreg = lgb.LGBMRegressor()
# catboost = CatBoostRegressor(silent=True)

In [41]:
# Hyperparameters

"""No hyperparameter for linear regression"""

ridge_params = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}
lasso_params = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0],
    'selection': ['cyclic', 'random'] 
}
elasticnet_params = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9], 
    'selection': ['cyclic', 'random'] 
}
polynomial_params= {
    'polynomialfeatures__degree': [2, 3, 4]  
}
svr_params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    'degree': [2, 3],  # Degree of the polynomial kernel
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'C': [0.1, 1.0, 10.0],  # Regularization parameter
}
decision_tree_params = {
    'splitter': ['best', 'random'],  
    'max_depth': [None, 5, 10, 20],
    'max_features': ['auto', 'sqrt', 'log2', None]
}
random_forest_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'max_features': ['auto', 'sqrt', 'log2']
}
gradientboost_params = {
    'n_estimators': [100, 200, 300], 
    'learning_rate': [0.05, 0.1, 0.2],  
    'max_depth': [3, 5, 7]
}

# Singly params

xgb_params = {
    'verbosity': 0,
    'colsample_bytree': 0.3,
    'learning_rate': 0.7,
    'max_depth': 15,
    'alpha': 1,
    'n_estimators': 100
}

lgb_params = {
    'verbose': -1,
    'objective': 'regression',  # specify the objective for regression
    'metric': 'mse',  # evaluation metric
    'verbosity': -1,  # suppress output
    'boosting_type': 'gbdt',  # gradient boosting decision tree
    'learning_rate': 0.1,
    'num_leaves': 31,  # maximum number of leaves in one tree
    'max_depth': -1,  # no limit on tree depth
    'n_estimators': 100
}

catboostparams = {
    'silent': True,
    'loss_function': 'RMSE',  # specify the loss function for regression
    'iterations': 100,  # number of boosting iterations
    'learning_rate': 0.1,
    'depth': 16,  # depth of the trees
    'random_seed': 42
}

In [42]:
# Grid search cv
ridge_grid_search = GridSearchCV(estimator=ridge, param_grid=ridge_params, cv=5)
lasso_grid_search = GridSearchCV(estimator=lasso, param_grid=lasso_params, cv=5)
# elasticnet_grid_search = GridSearchCV(estimator=net, param_grid=elasticnet_params, cv=5)
polynomial_grid_search = GridSearchCV(estimator=polynomial_pipe, param_grid=polynomial_params, cv=5)
# svr_grid_search = GridSearchCV(estimator=svr, param_grid=svr_params, cv=5)
# decision_tree_grid_search = GridSearchCV(estimator=decisiontree, param_grid=decision_tree_params, cv=5)
random_forest_grid_search = GridSearchCV(estimator=randomforest, param_grid=random_forest_params, cv=5)
gradient_boost_grid_search = GridSearchCV(estimator=gradientboost, param_grid=gradientboost_params, cv=5)
# xgboost_grid_search = GridSearchCV(estimator=xgbreg, param_grid=xgboost_params, cv=5)
# lgb_grid_search = GridSearchCV(estimator=lgbreg, param_grid=lgb_grid_params, cv=5)
# catboost_grid_search = GridSearchCV(estimator=catboost, param_grid=catboost_grid_params, cv=5)

xgbreg = xgb.XGBRegressor(**xgb_params)
lgbreg = lgb.LGBMRegressor(**lgb_params)
catboostreg = CatBoostRegressor(**catboostparams)

In [43]:
# Fit
import time
t = time.time()
import warnings
warnings.filterwarnings('ignore')

start_time = time.time()
linreg.fit(cat_df, Y_df)
end_time = time.time()
print(f"Time taken for Linear: {(end_time-start_time):.2f} SECONDS\n")

ridge_start_time = time.time()
ridge_grid_search.fit(cat_df, Y_df)
ridge_best_params = ridge_grid_search.best_params_
ridge_best_model = ridge_grid_search.best_estimator_
print("Best hyperparameters for Ridge:", ridge_best_params)
print(f"Best model for ridge:{ridge_best_model}")
ridge_end_time = time.time()
print(f"Time taken for Ridge: {(ridge_end_time-ridge_start_time):.2f} SECONDS\n")

lasso_start_time = time.time()
lasso_grid_search.fit(cat_df, Y_df)
lasso_best_params = lasso_grid_search.best_params_
lasso_best_model = lasso_grid_search.best_estimator_
print("Best hyperparameters for Lasso:", lasso_best_params)
print(f"Best model for lasso:{lasso_best_model}")
lasso_end_time = time.time()
print(f"Time taken for Lasso: {(lasso_end_time-lasso_start_time):.2f} SECONDS\n")

# elasticnet_grid_search.fit(cat_df, Y_df)
# elasticnet_best_params = elasticnet_grid_search.best_params_
# elasticnet_best_model = elasticnet_grid_search.best_estimator_
# print("Best hyperparameters for ElasticNet:", elasticnet_best_params)
# print(f"Best model for ElasticNet:{elasticnet_best_model}")

start_time = time.time()
polynomial_grid_search.fit(cat_df, Y_df)
ploynomial_best_params = polynomial_grid_search.best_params_
polynomial_best_model = polynomial_grid_search.best_estimator_
print("Best hyperparameters for Polynomial:", ploynomial_best_params)
print(f"Best model for Polynomial:{polynomial_best_model}")
end_time = time.time()
print(f"Time taken for Polynomial: {(end_time-start_time):.2f} SECONDS\n")

# svr_grid_search.fit(cat_df, Y_df)
# svr_best_params = svr_grid_search.best_params_
# svr_best_model = svr_grid_search.best_estimator_
# print("Best hyperparameters for SVR:", svr_best_params)
# print(f"Best model for SVR:{svr_best_model}")

# decision_tree_grid_search.fit(cat_df, Y_df)
# decision_tree_best_params = decision_tree_grid_search.best_params_
# decision_tree_best_model = decision_tree_grid_search.best_estimator_
# print("Best hyperparameters for Decision tree:", decision_tree_best_params)
# print(f"Best model for Decision tree:{decision_tree_best_model}")

start_time = time.time()
random_forest_grid_search.fit(cat_df, Y_df)
random_forest_best_params = random_forest_grid_search.best_params_
random_forest_best_model = random_forest_grid_search.best_estimator_
print("Best hyperparameters for Random forest:", random_forest_best_params)
print(f"Best model for Random forest:{random_forest_best_model}")
end_time = time.time()
print(f"Time taken for Random Forest: {(end_time-start_time):.2f} SECONDS\n")

start_time = time.time()
gradient_boost_grid_search.fit(cat_df, Y_df)
gradient_boost_best_params = gradient_boost_grid_search.best_params_
gradient_boost_best_model = gradient_boost_grid_search.best_estimator_
print("Best hyperparameters for Gradient Boost:", gradient_boost_best_params)
print(f"Best model for Gradient Boost:{gradient_boost_best_model}")
end_time = time.time()
print(f"Time taken for Gradient Boost: {(end_time-start_time):.2f} SECONDS\n")

# start_time = time.time()
# xgboost_grid_search.fit(cat_df, Y_df)
# xgboost_best_params = xgboost_grid_search.best_params_
# xgboost_best_model = xgboost_grid_search.best_estimator_
# print("Best hyperparameters for XG Boost:", xgboost_best_params)
# print(f"Best model for XG Boost:{xgboost_best_model}")
# end_time = time.time()
# print(f"Time taken for XG Boost: {end_time-start_time}SECONDS")

# lgb_grid_search.fit(cat_df, Y_df)
# lgb_best_params = lgb_grid_search.best_params_
# lgb_best_model = lgb_grid_search.best_estimator_
# print("Best hyperparameters for LGB:", lgb_best_params)
# print(f"Best model for LGB:{lgb_best_model}")

# start_time = time.time()
# catboost_grid_search.fit(cat_df, Y_df)
# catboost_best_params = catboost_grid_search.best_params_
# catboost_best_model = catboost_grid_search.best_estimator_
# print("Best hyperparameters for Catboost:", catboost_best_params)
# print(f"Best model for Catboost:{catboost_best_model}")
# end_time = time.time()
# print(f"Time taken for Cat Boost: {end_time-start_time}SECONDS")

start_time = time.time()
xgbreg.fit(cat_df, Y_df)
end_time = time.time()
print(f"Time taken for XG Boost: {(end_time-start_time):.2f} SECONDS\n")


start_time = time.time()
lgbreg.fit(cat_df, Y_df)
end_time = time.time()
print(f"Time taken for LGB: {(end_time-start_time):.2f} SECONDS\n")


start_time = time.time()
catboostreg.fit(cat_df, Y_df)
end_time = time.time()
print(f"Time taken for CatBoost: {(end_time-start_time):.2f} SECONDS\n")

y=time.time()
print(y-t)

Best hyperparameters for Ridge: {'alpha': 1.0, 'solver': 'sag'}
Best model for ridge:Ridge(solver='sag')
Time taken for Ridge: 3.884105682373047SECONDS
Best hyperparameters for Lasso: {'alpha': 10.0, 'selection': 'random'}
Best model for lasso:Lasso(alpha=10.0, selection='random')
Time taken for Lasso: 0.8590822219848633SECONDS
Best hyperparameters for Polynomial: {'polynomialfeatures__degree': 2}
Best model for Polynomial:Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])
Time taken for Polynomial: 1.0887155532836914SECONDS
Best hyperparameters for Random forest: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 300}
Best model for Random forest:RandomForestRegressor(max_depth=10, max_features='sqrt', n_estimators=300,
                      random_state=0)
Time taken for Random Forest: 214.01729607582092SECONDS
Best hyperparameters for Gradient Boost: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 1

In [50]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

In [56]:
# print(f"Linear regression on training set MAE: {mae(Y_df, linreg.predict(cat_df)):.2f}")
print(f"Ridge regression on training set MAE: {mae(Y_df, ridge_grid_search.predict(cat_df)):.2f}")
print(f"Lasso regression on training set MAE: {mae(Y_df, lasso_grid_search.predict(cat_df)):.2f}")
# print(f"ElasticNet regression on training set MAE: {mae(Y_df, net.predict(cat_df))}")
print(f"Polynomial regression on training set MAE: {mae(Y_df, polynomial_grid_search.predict(cat_df)):.2f}")
# print(f"SVR regression on training set MAE: {mae(Y_df, svr.predict(cat_df))}")
# print(f"DecisionTree regression on training set MAE: {mae(Y_df, decisiontreereg.predict(cat_df))}") # Not useful
print(f"Random Forest regression on training set MAE: {mae(Y_df, random_forest_grid_search.predict(cat_df)):.2f}")
print(f"Gradient Boosting regression on training set MAE: {mae(Y_df, gradient_boost_grid_search.predict(cat_df)):.2f}")
# print(f"Ada Boosting regression on training set MAE: {mae(Y_df, adaboost.predict(cat_df))}")
print(f"XG Boost regression on training set MAE: {mae(Y_df, xgbreg.predict(cat_df)):.2f}")
print(f"LightGBM regression on training set MAE: {mae(Y_df, lgbreg.predict(cat_df)):.2f}")
print(f"Catboost regression on training set MAE: {mae(Y_df, catboostreg.predict(cat_df)):.2f}")
# print(f"K-Nearest Neighbour regression on training set MAE: {mae(Y_df, knnreg.predict(cat_df))}")
# print(f"Random Forest regression on training set MAE: {mae(Y_df, random_forest_grid_search.predict(cat_df)):.2f}")

Ridge regression on training set MAE: 4217.58
Lasso regression on training set MAE: 4210.37
Polynomial regression on training set MAE: 2938.44
Random Forest regression on training set MAE: 1424.73
Gradient Boosting regression on training set MAE: 2294.19
XG Boost regression on training set MAE: 3939.74
LightGBM regression on training set MAE: 1748.54
Catboost regression on training set MAE: 1040.09


In [57]:
# print(f"Linear regression on training set MAE: {r2_score(Y_df, linreg.predict(cat_df)):.2f}")
print(f"Ridge regression on training set MAE: {r2_score(Y_df, ridge_grid_search.predict(cat_df)):.2f}")
print(f"Lasso regression on training set MAE: {r2_score(Y_df, lasso_grid_search.predict(cat_df)):.2f}")
# print(f"ElasticNet regression on training set MAE: {r2_score(Y_df, net.predict(cat_df))}")
print(f"Polynomial regression on training set MAE: {r2_score(Y_df, polynomial_grid_search.predict(cat_df)):.2f}")
# print(f"SVR regression on training set MAE: {r2_score(Y_df, svr.predict(cat_df))}")
# print(f"DecisionTree regression on training set MAE: {r2_score(Y_df, decisiontreereg.predict(cat_df))}") # Not useful
print(f"Random Forest regression on training set MAE: {r2_score(Y_df, random_forest_grid_search.predict(cat_df)):.2f}")
print(f"Gradient Boosting regression on training set MAE: {r2_score(Y_df, gradient_boost_grid_search.predict(cat_df)):.2f}")
# print(f"Ada Boosting regression on training set MAE: {r2_score(Y_df, adaboost.predict(cat_df))}")
print(f"XG Boost regression on training set MAE: {r2_score(Y_df, xgbreg.predict(cat_df)):.2f}")
print(f"LightGBM regression on training set MAE: {r2_score(Y_df, lgbreg.predict(cat_df)):.2f}")
print(f"Catboost regression on training set MAE: {r2_score(Y_df, catboostreg.predict(cat_df)):.2f}")
# print(f"K-Nearest Neighbour regression on training set MAE: {r2_score(Y_df, knnreg.predict(cat_df))}")
# print(f"Random Forest regression on training set MAE: {r2_score(Y_df, random_forest_grid_search.predict(cat_df)):.2f}")

Ridge regression on training set MAE: 0.74
Lasso regression on training set MAE: 0.74
Polynomial regression on training set MAE: 0.84
Random Forest regression on training set MAE: 0.96
Gradient Boosting regression on training set MAE: 0.88
XG Boost regression on training set MAE: 0.80
LightGBM regression on training set MAE: 0.94
Catboost regression on training set MAE: 0.97


In [None]:
import time
t = time.time()

In [None]:

# poly = PolynomialFeatures(2)
# X_poly = poly.fit_transform(cat_df)
# ployreg = LinearRegression()
# ployreg.fit(X_poly, Y_df)


# decisiontreereg = DecisionTreeRegressor(random_state=0)

gradientboost = GradientBoostingRegressor(random_state=0)
adaboost = AdaBoostRegressor(random_state=0, n_estimators=100)


xgb_params = {
    'colsample_bytree': 0.3,
    'learning_rate': 0.7,
    'max_depth': 15,
    'alpha': 1,
    'n_estimators': 100
}
xgbreg = xgb.XGBRegressor(**params)

lgb_params = {
    'objective': 'regression',  # specify the objective for regression
    'metric': 'mse',  # evaluation metric
    'verbosity': -1,  # suppress output
    'boosting_type': 'gbdt',  # gradient boosting decision tree
    'learning_rate': 0.1,
    'num_leaves': 31,  # maximum number of leaves in one tree
    'max_depth': -1,  # no limit on tree depth
    'n_estimators': 100
}
lgbreg = lgb.LGBMRegressor(**lgb_params)


catboostparams = {
    'loss_function': 'RMSE',  # specify the loss function for regression
    'iterations': 100,  # number of boosting iterations
    'learning_rate': 0.1,
    'depth': 16,  # depth of the trees
    'random_seed': 42
}
catboostreg = CatBoostRegressor(**catboostparams)

knnreg = KNeighborsRegressor(n_neighbors=5)

In [None]:
linreg.fit(cat_df, Y_df)
ridge.fit(cat_df, Y_df)
lasso.fit(cat_df, Y_df)
net.fit(cat_df, Y_df)
svr.fit(cat_df, Y_df)
decisiontreereg.fit(cat_df, Y_df)
randomforest.fit(cat_df, Y_df)
gradientboost.fit(cat_df, Y_df)
adaboost.fit(cat_df, Y_df)
xgbreg.fit(cat_df, Y_df)
lgbreg.fit(cat_df, Y_df)
catboostreg.fit(cat_df, Y_df)
knnreg.fit(cat_df, Y_df)

In [None]:
Y_df, lgbreg.predict(cat_df)

In [None]:
# age	gender	bmi	children	smoker	region
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print(f"Linear regression MAE: {mean_absolute_error(Y_df, linreg.predict(cat_df))}")
print(f"Ridge regression MAE: {mean_absolute_error(Y_df, ridge.predict(cat_df))}")
print(f"Lasso regression MAE: {mean_absolute_error(Y_df, lasso.predict(cat_df))}")
print(f"ElasticNet regression MAE: {mean_absolute_error(Y_df, net.predict(cat_df))}")
print(f"Polynomial regression MAE: {mean_absolute_error(Y_df, ployreg.predict(X_poly))}")
print(f"SVR regression MAE: {mean_absolute_error(Y_df, svr.predict(cat_df))}")
print(f"DecisionTree regression MAE: {mean_absolute_error(Y_df, decisiontreereg.predict(cat_df))}") # Not useful
print(f"Random Forest regression MAE: {mean_absolute_error(Y_df, randomforest.predict(cat_df))}")
print(f"Gradient Boosting regression MAE: {mean_absolute_error(Y_df, gradientboost.predict(cat_df))}")
print(f"Ada Boosting regression MAE: {mean_absolute_error(Y_df, adaboost.predict(cat_df))}")
print(f"XG Boost regression MAE: {mean_absolute_error(Y_df, xgbreg.predict(cat_df))}")
print(f"LightGBM regression MAE: {mean_absolute_error(Y_df, lgbreg.predict(cat_df))}")
print(f"Catboost regression MAE: {mean_absolute_error(Y_df, catboostreg.predict(cat_df))}")
print(f"K-Nearest Neighbour regression MAE: {mean_absolute_error(Y_df, knnreg.predict(cat_df))}")

In [None]:
!pip install lightgbm

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  # Replace with your model
from sklearn.metrics import accuracy_score  # Example metric

X = cat_df
y = Y_df

# Define the model (replace with your actual model)
model = SVR()


# Define the parameter grid
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],  
    "kernel": ["linear", "rbf"]  
}

# Perform grid search with cross-validation (corrected)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the results
print("Best Model:", best_model)
print("Best Parameters:", best_params)


In [None]:
print(f"SVR regression MAE: {mean_absolute_error(Y_df, best_model.predict(cat_df))}")