In [82]:
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [87]:
df_900 = pd.read_csv('dataset/medical900.csv')
df_1300 = pd.read_csv('dataset/medical1300.csv')

In [84]:
df_1300.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [85]:
df_900.shape, df_1300.shape

((986, 11), (1338, 7))

In [81]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# import xgboost as xgb
# from sklearn.metrics import mean_absolute_error, mean_squared_error

# # Sample DataFrame
# data = pd.DataFrame({
#     'age': [19, 18, 28, 33, 32],
#     'sex': ['female', 'male', 'male', 'male', 'male'],
#     'bmi': [27.9, 33.77, 33.0, 22.705, 28.88],
#     'children': [0, 1, 3, 0, 0],
#     'smoker': ['yes', 'no', 'no', 'no', 'no'],
#     'region': ['southwest', 'southeast', 'southeast', 'northwest', 'northwest'],
#     'charges': [16884.924, 1725.5523, 4449.462, 21984.47061, 3866.8552]
# })

# # Separate features and target
# X = data.drop('charges', axis=1)
# y = data['charges']

# # Identify categorical columns
# categorical_cols = ['sex', 'smoker', 'region']

# # Define the ColumnTransformer with OneHotEncoder for categorical columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
#     ],
#     remainder='passthrough'  # Keep other columns as they are
# )

# # Define the pipeline
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5))
# ])

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train the model
# pipeline.fit(X_train, y_train)

# # Make predictions
# y_pred = pipeline.predict(X_test)

# # Evaluate the model
# mae = mean_absolute_error(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred, squared=False)

# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Root Mean Squared Error (RMSE): {rmse}")


Mean Absolute Error (MAE): 2732.7675242187497
Root Mean Squared Error (RMSE): 2732.7675242187497


In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Sample DataFrame
data = df_1300.copy()

# Separate features and target
X = data.drop('charges', axis=1)
y = data['charges']

# Identify categorical columns
categorical_cols = ['sex', 'smoker', 'region']
numerical_cols = ['age', 'bmi', 'children']

# Handle Outliers
# For simplicity, let's use the IQR method to identify outliers in 'charges' and cap them
Q1 = data['charges'].quantile(0.25)
Q3 = data['charges'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data['charges'] = np.where(data['charges'] < lower_bound, lower_bound, data['charges'])
data['charges'] = np.where(data['charges'] > upper_bound, upper_bound, data['charges'])

# Re-separate features and target after outlier handling
X = data.drop('charges', axis=1)
y = data['charges']

# Feature Engineering: Polynomial Features for numerical data
polynomial_transformer = PolynomialFeatures(degree=2, include_bias=False)

# Define the ColumnTransformer with OneHotEncoder for categorical columns and StandardScaler for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', polynomial_transformer, numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # Keep other columns as they are
)

# Model Pipelines
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(objective='reg:squarederror'))
])

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_distributions_xgb = {
    'model__n_estimators': np.arange(50, 301, 50),
    'model__learning_rate': np.logspace(-3, 0, 10),
    'model__max_depth': np.arange(3, 10, 1),
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

param_distributions_rf = {
    'model__n_estimators': np.arange(50, 301, 50),
    'model__max_depth': np.arange(3, 20, 2),
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

param_distributions_gb = {
    'model__n_estimators': np.arange(50, 301, 50),
    'model__learning_rate': np.logspace(-3, 0, 10),
    'model__max_depth': np.arange(3, 10, 1),
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Randomized search for hyperparameter tuning
random_search_xgb = RandomizedSearchCV(estimator=pipeline_xgb, param_distributions=param_distributions_xgb, n_iter=100, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, random_state=42)
random_search_rf = RandomizedSearchCV(estimator=pipeline_rf, param_distributions=param_distributions_rf, n_iter=100, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, random_state=42)
random_search_gb = RandomizedSearchCV(estimator=pipeline_gb, param_distributions=param_distributions_gb, n_iter=100, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, random_state=42)

# Fit the random search to the data
random_search_xgb.fit(X_train, y_train)
random_search_rf.fit(X_train, y_train)
random_search_gb.fit(X_train, y_train)

# Best parameters
best_params_xgb = random_search_xgb.best_params_
best_params_rf = random_search_rf.best_params_
best_params_gb = random_search_gb.best_params_

print(f"Best parameters for XGB: {best_params_xgb}")
print(f"Best parameters for RF: {best_params_rf}")
print(f"Best parameters for GB: {best_params_gb}")

# Use the best models
best_model_xgb = random_search_xgb.best_estimator_
best_model_rf = random_search_rf.best_estimator_
best_model_gb = random_search_gb.best_estimator_

# Make predictions
y_pred_xgb = best_model_xgb.predict(X_test)
y_pred_rf = best_model_rf.predict(X_test)
y_pred_gb = best_model_gb.predict(X_test)

# Evaluate the best models
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)

mae_gb = mean_absolute_error(y_test, y_pred_gb)
rmse_gb = mean_squared_error(y_test, y_pred_gb, squared=False)

print(f"Mean Absolute Error (MAE) for XGB: {mae_xgb}")
print(f"Root Mean Squared Error (RMSE) for XGB: {rmse_xgb}")

print(f"Mean Absolute Error (MAE) for RF: {mae_rf}")
print(f"Root Mean Squared Error (RMSE) for RF: {rmse_rf}")

print(f"Mean Absolute Error (MAE) for GB: {mae_gb}")
print(f"Root Mean Squared Error (RMSE) for GB: {rmse_gb}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters for XGB: {'model__subsample': 0.9, 'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.046415888336127774, 'model__colsample_bytree': 0.8}
Best parameters for RF: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_depth': 3, 'model__bootstrap': True}
Best parameters for GB: {'model__subsample': 0.6, 'model__n_estimators': 250, 'model__max_depth': 3, 'model__learning_rate': 0.01}
Mean Absolute Error (MAE) for XGB: 2286.0895941911144
Root Mean Squared Error (RMSE) for XGB: 4036.520034448741
Mean Absolute Error (MAE) for RF: 2281.260385364732
Root Mean Squared Error (RMSE) for RF: 4109.312165684919
Mean Absolute Error (MAE) for GB: 2651.4728697998285
Root Mean Squared Error (RMSE) for GB: 4196.074184386716


In [95]:
# Baseline model: Mean Predictor
y_mean_pred = np.mean(y_train)

# Calculate MAE and RMSE for the baseline model
baseline_mae = mean_absolute_error(y_test, [y_mean_pred] * len(y_test))
baseline_rmse = mean_squared_error(y_test, [y_mean_pred] * len(y_test), squared=False)

print(f"Baseline Mean Absolute Error (MAE): {baseline_mae}")
print(f"Baseline Root Mean Squared Error (RMSE): {baseline_rmse}")


Baseline Mean Absolute Error (MAE): 8454.806521277427
Baseline Root Mean Squared Error (RMSE): 10428.199030100852
