# Keep it simple, stupid


I wanted to test a "simpler" model against the popular and powerful XGBoost on the heart-attack prediction dataset, which has around 300 samples. The code below searches a small parameter grid for each of Logistic Regression and XGBoost to draw out some good scores.

XGBoost did not outperform Logistic Regression on this problem over the parameters searched; AUC was similar and accuracy was slightly worse. Logistic Regression wins this round, beating XGBoost in interpretability and training speed. Gradient boosting could be worthwhile on larger and more complex data, or if hyperparameters are searched more comprehensively.

A good reminder that there is no free lunch and that, for small datasets, it may be best to KISS :)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression


In [None]:
# Load the heart-attack dataset CSV into a DataFrame.
heart_csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df_train = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first ten rows.
df_train.head(n=10)

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df_train.describe().T

In [None]:
# Count how often each value of the "output" column occurs.
df_train.loc[:, "output"].value_counts()

In [None]:
# Target is the "output" column; every other column is a feature.
y = df_train["output"]
X = df_train.drop(columns="output")

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=88,
)

In [None]:
# Standardise the features, then fit a logistic regression on them.
logistic_pipeline = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('model', LogisticRegression())
])

# Inverse regularisation strengths searched for every solver/penalty pair.
log_C_values = np.logspace(-4, 4, 20)

# The grid is a list of per-solver dicts rather than one cross-product:
# 'lbfgs' does not support the 'l1' penalty, so an l1+lbfgs candidate can
# never be fitted. Depending on the scikit-learn version that either aborts
# the search or yields NaN scores whose warnings are hidden by the global
# warnings filter. Only valid combinations are searched here.
log_param_grid = [
    {
        'model__solver': ['liblinear'],
        'model__penalty': ['l1', 'l2'],
        'model__C': log_C_values,
    },
    {
        'model__solver': ['lbfgs'],
        'model__penalty': ['l2'],
        'model__C': log_C_values,
    },
]

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
log_grid = GridSearchCV(logistic_pipeline, log_param_grid, cv=5, n_jobs=-1, scoring='roc_auc')

log_grid.fit(X_train, y_train)

In [None]:
# Same preprocessing as the logistic model, with XGBoost as the estimator.
xgb_pipeline = Pipeline(
    steps=[
        ('standard_scaler', StandardScaler()),
        ('model', XGBClassifier()),
    ]
)

# Small grid over three of the booster's hyperparameters.
xgb_param_grid = dict(
    model__min_child_weight=[1, 2],
    model__max_depth=[3, 5, 7],
    model__gamma=[0.5, 1, 3],
)

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='roc_auc',
)

xgb_grid.fit(X_train, y_train)

In [None]:
# Report the best candidate's cross-validation score (mean and standard
# deviation across folds) for each search, with a separator line in between.
searches = (("LR", log_grid), ("XGB", xgb_grid))

for position, (label, search) in enumerate(searches):
    if position:
        print("###############################################################")

    best = search.best_index_
    mean_score = search.cv_results_["mean_test_score"][best]
    std_score = search.cv_results_["std_test_score"][best]

    print(f"Best {label} parameters: {search.best_params_}")
    print(f"Mean {label} CV score: {mean_score: .6f}")
    print(f"Standard deviation of {label} CV score: {std_score: .6f}")

In [None]:
# Held-out evaluation of the logistic regression search.
log_y_pred = log_grid.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC AUC.
log_y_proba = log_grid.predict_proba(X_test)[:, 1]

log_accuracy = accuracy_score(y_test, log_y_pred)
print(f'LR accuracy score: {log_accuracy:0.4f}')
log_auc = roc_auc_score(y_test, log_y_proba)
print(f'LR AUC: {log_auc:.3f}')


# Held-out evaluation of the XGBoost search, computed the same way.
xgb_y_pred = xgb_grid.predict(X_test)
xgb_y_proba = xgb_grid.predict_proba(X_test)[:, 1]

xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print(f'XGBoost model accuracy score: {xgb_accuracy:0.4f}')
xgb_auc = roc_auc_score(y_test, xgb_y_proba)
print(f'XGB AUC: {xgb_auc:.3f}')
