# Model Training on Medical Expenses dataset

## Objective: Train and evaluate several regression models and identify the one that best fits the dataset.

In [None]:
#pip install catboost

In [None]:
# Basic Import
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Load the medical-expenses CSV from its remote location and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/raviteja-padala/Datasets/main/med_exp.csv'
df_med = pd.read_csv(DATA_URL)
df_med.head()

Unnamed: 0,Age,Gender,Cold,Cough,Fever,BP,Diabetes,Thyroid,Arthritis,Acidity,Others,Expense
0,36,M,no,no,no,yes,yes,no,no,no,no,433
1,37,M,no,no,no,yes,yes,no,no,no,no,427
2,38,M,no,no,no,yes,yes,no,no,no,no,421
3,39,M,no,no,no,yes,yes,no,no,no,no,444
4,40,M,no,no,no,yes,yes,no,no,no,no,427


In [None]:
# Keep df_med untouched; all later work happens on this independent (deep) copy.
df = df_med.copy(deep=True)

In [None]:
# Number of missing values in each column of the raw frame
df_med.isna().sum()

Age          0
Gender       0
Cold         3
Cough        2
Fever        2
BP           1
Diabetes     0
Thyroid      0
Arthritis    0
Acidity      0
Others       8
Expense      0
dtype: int64

In [None]:
# Total number of missing cells across the whole working frame
df.isna().to_numpy().sum()

16

In [None]:
# Inspect the most frequent value of 'Others' — the value a mode-based
# imputation of its missing entries would fill in.
df['Others'].mode()

0    no
Name: Others, dtype: object

In [None]:
# First (and here only) modal value of 'Others', as a plain scalar
df['Others'].mode().iloc[0]

'no'

In [None]:
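# mode imputation of na values (disabled: missing values are imputed later by the SimpleImputer steps of the preprocessing pipeline)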
#df['Cold']= df['Cold'].fillna(df['Others'].mode()[0])
#df['Cough']= df['Cough'].fillna(df['Others'].mode()[0])
#df['Fever']= df['Fever'].fillna(df['Others'].mode()[0])
#df['BP']= df['BP'].fillna(df['Others'].mode()[0])
#df['Others'] = df['Others'].fillna(df['Others'].mode()[0])

In [None]:
# Total null values in the working frame. The manual imputation cell is
# commented out, so this count is unchanged at this point.
df.isna().to_numpy().sum()

16

In [None]:
# Feature frame: every column except the regression target 'Expense'
X = df.drop(columns=['Expense'])

In [None]:
# Preview the feature frame (target column removed)
X.head()

Unnamed: 0,Age,Gender,Cold,Cough,Fever,BP,Diabetes,Thyroid,Arthritis,Acidity,Others
0,36,M,no,no,no,yes,yes,no,no,no,no
1,37,M,no,no,no,yes,yes,no,no,no,no
2,38,M,no,no,no,yes,yes,no,no,no,no
3,39,M,no,no,no,yes,yes,no,no,no,no
4,40,M,no,no,no,yes,yes,no,no,no,no


In [None]:
# Regression target: the 'Expense' column of the working frame
y = df.loc[:, 'Expense']
y.head()

0    433
1    427
2    421
3    444
4    427
Name: Expense, dtype: int64

In [None]:
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric. The lists feed the ColumnTransformer.
cat_features = X.select_dtypes(include=["object"]).columns
num_features = X.select_dtypes(exclude=["object"]).columns

In [None]:
# Columns selected as numeric (non-object dtype)
num_features

Index(['Age'], dtype='object')

In [None]:
# Columns selected as categorical (object dtype)
cat_features

Index(['Gender', 'Cold', 'Cough', 'Fever', 'BP', 'Diabetes', 'Thyroid',
       'Arthritis', 'Acidity', 'Others'],
      dtype='object')

In [None]:

# Preprocessing definition (SimpleImputer comes from sklearn.impute, imported
# with the other dependencies in the import cell).

# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, one-hot encode,
# then scale without centering (with_mean=False keeps the encoder's sparse
# output sparse).
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category first seen at transform time is
        # encoded as all zeros instead of raising.
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, num_features),
        ("cat_pipelines", cat_pipeline, cat_features),
    ]
)

In [None]:
# Fit the preprocessing statistics (imputation values, scaling, categories) on
# the training rows only, then transform every row. Fitting on the full data
# would leak test-set information into the features.
#
# Starting from `df` instead of overwriting the previous `X` also makes this
# cell safe to re-run.
X_raw = df.drop(columns=['Expense'])

# The row selection depends only on the number of rows, test_size and
# random_state, so using the same values as the train/test split cell
# (test_size=0.2, random_state=42) selects exactly the rows that end up in
# X_train. Keep the two in sync.
train_rows, _ = train_test_split(np.arange(len(X_raw)), test_size=0.2, random_state=42)

preprocessor.fit(X_raw.iloc[train_rows])
X = preprocessor.transform(X_raw)

In [None]:
# (rows, columns) of the feature matrix produced by the preprocessor
X.shape

(799, 21)

In [None]:
# separate dataset into train and test (80% / 20%, fixed seed for reproducibility)
# train_test_split is imported with the other dependencies in the import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape

((639, 21), (160, 21))

In [None]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` — mean absolute error, root mean squared error
        and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE is computed once only.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Baseline comparison: fit every candidate regressor with its default
# hyperparameters and report train/test metrics.
# Estimators with internal randomness get a fixed seed so that
# Restart & Run All reproduces the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_seed=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}
model_list = []  # model names, in evaluation order
r2_list = []     # test-set R2 of each model, aligned with model_list

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 112.5210
- Mean Absolute Error: 78.3534
- R2 Score: 0.8087
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 85.8012
- Mean Absolute Error: 69.6130
- R2 Score: 0.8533


Lasso
Model performance for Training set
- Root Mean Squared Error: 112.5572
- Mean Absolute Error: 78.3094
- R2 Score: 0.8086
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 85.2011
- Mean Absolute Error: 69.0970
- R2 Score: 0.8553


Ridge
Model performance for Training set
- Root Mean Squared Error: 112.5213
- Mean Absolute Error: 78.3613
- R2 Score: 0.8087
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 85.7973
- Mean Absolute Error: 69.6193
- R2 Score: 0.8533


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 70.9850
- Mean Absolute Error: 28.2344
- R2 Score: 0.9239
------

In [None]:
# Rank the evaluated models by their test-set R2, best first.
score_rows = list(zip(model_list, r2_list))
score_table = pd.DataFrame(score_rows, columns=['Model Name', 'R2_Score'])
score_table.sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
4,Decision Tree,0.933316
7,CatBoosting Regressor,0.914108
3,K-Neighbors Regressor,0.913759
5,Random Forest Regressor,0.893264
6,XGBRegressor,0.892365
1,Lasso,0.855341
2,Ridge,0.85331
0,Linear Regression,0.853296
8,AdaBoost Regressor,0.799006


In [None]:

# Define the hyperparameter grid for Decision Tree Regressor
dt_param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the hyperparameter grid for CatBoost Regressor
catboost_param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 150, 200]
}

# Define the hyperparameter grid for Random Forest Regressor
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer accepted by RandomForestRegressor in current
    # scikit-learn. For regressors it meant "consider all features", which is
    # written as the fraction 1.0.
    'max_features': [1.0, 'sqrt']
}

# Estimators to tune; seeded so repeated runs select the same configuration.
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_seed=42)
    }

# Search space for each estimator, keyed by the same names as `models`.
# A model without a grid fails with a KeyError instead of silently reusing
# the previous iteration's search object.
param_grids = {
    "Decision Tree": dt_param_grid,
    "Random Forest Regressor": rf_param_grid,
    "CatBoosting Regressor": catboost_param_grid
}

model_list = []  # model names, in evaluation order
r2_list = []     # test-set R2 of each tuned model, aligned with model_list

for model_name, estimator in models.items():
    # 5-fold cross-validated exhaustive search over the model's grid
    grid_search = GridSearchCV(estimator, param_grid=param_grids[model_name], cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been re-trained
    # on the whole training set using the best parameters, so no further
    # fit call is needed.
    model = grid_search.best_estimator_

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('=' * 35)
    print('\n')


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 79.9330
- Mean Absolute Error: 39.5931
- R2 Score: 0.9035
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 62.3282
- Mean Absolute Error: 32.9264
- R2 Score: 0.9226


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 48.6906
- Mean Absolute Error: 19.7055
- R2 Score: 0.9642
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 67.9631
- Mean Absolute Error: 26.7847
- R2 Score: 0.9080


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 57.0100
- Mean Absolute Error: 30.2659
- R2 Score: 0.9509
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 64.5036
- Mean Absolute Error: 32.1090
- R2 Score: 0.9171




In [None]:
# Rank the tuned models by their test-set R2, best first.
tuned_rows = list(zip(model_list, r2_list))
tuned_table = pd.DataFrame(tuned_rows, columns=['Model Name', 'R2_Score'])
tuned_table.sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Decision Tree,0.922585
2,CatBoosting Regressor,0.917087
1,Random Forest Regressor,0.907955


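* The best test R2 score overall (Decision Tree, 0.933) was obtained with default parameters. Tuning lowered the Decision Tree's test R2 slightly (0.933 → 0.923), while Random Forest (0.893 → 0.908) and CatBoost (0.914 → 0.917) improved slightly.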

* Possible reasons why the R2 score can be better without hyperparameter tuning:

> Default Settings: Some machine learning algorithms, like the Decision Tree or Random Forest, have reasonable default hyperparameter settings. The default settings may be well-suited for the data and do not require further tuning.

> Data Characteristics: The performance of a model heavily depends on the characteristics of the dataset. In some cases, the data may have clear patterns or separability, making hyperparameter tuning less critical for achieving good results.

> Data Size: Hyperparameter tuning becomes more crucial when dealing with large and complex datasets. For smaller datasets, the default settings might be sufficient to obtain good performance.

> Limited Hyperparameter Grid: The hyperparameter grid used for tuning might be limited, and the best combination of hyperparameters may not be explored effectively.

> Overfitting: Aggressive hyperparameter tuning can lead to overfitting on the training data, resulting in poor generalization to unseen data.

It is important to note that hyperparameter tuning is not a one-size-fits-all solution and should be performed carefully. Sometimes, the default hyperparameters or minimal tuning can be sufficient to achieve satisfactory results.

Ultimately, the choice of whether to use hyperparameter tuning or not depends on the specific problem, dataset size, and characteristics. It is good practice to experiment with different hyperparameter settings and compare the results to determine the most suitable approach for each scenario.


**Thank you for reading all the way to the end.**