In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Load the raw survey data.
df = pd.read_csv('data/Cardiovascular_Diseases_Risk_Prediction_Dataset.csv')

# Separate the label from the predictors, then hold out a test split that is
# stratified on the label so both splits keep the same class balance.
y = df['Heart_Disease']
X = df.drop(columns='Heart_Disease')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=12,
)

# Encode the target as integer class codes; the encoder is fitted on the
# training labels only and then reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Column groups, by the kind of preprocessing each one receives.
num_cols = [
    'Height_(cm)',
    'Weight_(kg)',
    'BMI',
    'Alcohol_Consumption',
    'Fruit_Consumption',
    'Green_Vegetables_Consumption',
    'FriedPotato_Consumption',
]
ord_cols = ['General_Health', 'Checkup', 'Age_Category']
dum_cols = [
    'Exercise',
    'Skin_Cancer',
    'Depression',
    'Arthritis',
    'Other_Cancer',
    'Smoking_History',
    'Sex',
    'Diabetes',
]

# Explicit low-to-high category orderings for the ordinal columns; the list
# order below must line up with the order of `ord_cols`.
health = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
check = [
    'Never',
    '5 or more years ago',
    'Within the past 5 years',
    'Within the past 2 years',
    'Within the past year',
]
age = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80+',
]

# Ordinal-encode the ordered categoricals, one-hot encode the nominal ones,
# and pass every remaining column through untouched.
col_transformer = ColumnTransformer(
    transformers=[
        ('oe', OrdinalEncoder(categories=[health, check, age]), ord_cols),
        ('ohe', OneHotEncoder(), dum_cols),
    ],
    remainder="passthrough",
)


def Model_eval(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, print cross-validated metrics, and plot a confusion matrix.

    The model is fitted on the full training set, then scored with 5-fold
    cross-validation on that same training set (``cross_validate`` works on
    clones, so the fitted ``model`` is not disturbed). The mean accuracy, F1
    and log-loss across folds are printed. Finally, the fitted model's
    predictions on the test set are shown as a row-normalised confusion matrix.

    Parameters
    ----------
    model : estimator
        Classifier or pipeline. It must expose ``predict_proba`` because
        ``neg_log_loss`` is one of the scorers.
    X_train, y_train
        Training features and labels (used for fitting and cross-validation).
    X_test, y_test
        Held-out features and labels (used only for the confusion matrix).

    Returns
    -------
    None
        Results are printed and plotted.
    """
    model.fit(X_train, y_train)
    score = cross_validate(model,
                           X_train,
                           y_train,
                           scoring=['accuracy', 'f1', 'neg_log_loss'],
                           cv=5)
    # 'accuracy' is requested explicitly; previously `acc` was printed without
    # ever being computed, which raised NameError after the CV had run.
    acc = score['test_accuracy'].mean()
    f1 = score['test_f1'].mean()
    # The scorer returns negated log-loss (higher is better); flip the sign back.
    log_loss = -1 * score['test_neg_log_loss'].mean()
    print(f'Accuracy: {acc}')
    print(f'F1 Score: {f1}')
    print(f'Log-loss: {log_loss}')

    y_preds = model.predict(X_test)
    # normalize='true' divides each row by the number of true samples in it.
    cm = confusion_matrix(y_test, y_preds, normalize='true')
    disp = ConfusionMatrixDisplay(cm)
    fig, ax = plt.subplots(figsize=(3.2, 2.4))
    disp.plot(ax=ax)
    plt.show()


def data_prep(X_train, y_train, X_test, y_test, smote=True, scale=True):
    """Encode (and optionally scale / SMOTE-resample) the train and test sets.

    The shared module-level ``col_transformer`` is always applied. When
    ``scale`` is truthy a ``StandardScaler`` follows it. All preprocessing is
    fitted on ``X_train`` only and then applied to ``X_test``. When ``smote``
    is truthy the preprocessed training data is oversampled with
    ``SMOTE(random_state=42)``; the test data is never resampled.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Raw feature frames.
    y_train, y_test : array-like
        Matching labels.
    smote : bool, default True
        Oversample the training set after preprocessing.
    scale : bool, default True
        Standardise the encoded features.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train_prepared, X_test_prepared, y_test)``.
        Both feature frames use the transformer's output feature names as
        columns. The test frame always keeps ``X_test.index``. The training
        frame keeps ``X_train.index`` unless SMOTE is applied, in which case
        rows have been added and a fresh default index is used.
    """
    # One preprocessing pipeline for every flag combination, instead of four
    # copy-pasted branches. Plain `if` truthiness replaces the bitwise
    # `smote & scale`, which misbehaves for non-bool flags (1 & 2 == 0).
    steps = [('ct', col_transformer)]
    if scale:
        steps.append(('ss', StandardScaler()))
    preprocessor = Pipeline(steps=steps)

    X_train_values = preprocessor.fit_transform(X_train)
    feature_names = preprocessor.get_feature_names_out()

    # Keep the original test index in every mode so rows stay traceable.
    X_test_prepared = pd.DataFrame(preprocessor.transform(X_test),
                                   columns=feature_names,
                                   index=X_test.index)

    if smote:
        # Resampling happens after encoding/scaling and only on training data.
        X_train_values, y_train = SMOTE(random_state=42).fit_resample(
            X_train_values, y_train)
        X_train_prepared = pd.DataFrame(X_train_values, columns=feature_names)
    else:
        X_train_prepared = pd.DataFrame(X_train_values,
                                        columns=feature_names,
                                        index=X_train.index)

    return X_train_prepared, y_train, X_test_prepared, y_test

In [None]:
# NOTE(review): unexecuted scratch cell. `clf` and `X_ts` are not defined
# anywhere in the visible part of this notebook, so this line will raise
# NameError on a fresh kernel. Presumably left over from an earlier
# experiment; define both names before this cell or delete it.
f1_score(y_test,clf.predict(X_ts))


In [4]:
# RandomForestClassifier 1
# Randomised search over forest and SMOTE hyper-parameters, scored by F1.
# Scaling now comes BEFORE SMOTE. SMOTE synthesises points from nearest
# neighbours, so features left on their raw scales would dominate the
# distance computation. This order also matches `data_prep`.
rfc_pipe1 = ImPipeline(steps=[('ct', col_transformer),
                              ('ss', StandardScaler()),
                              ('sm', SMOTE(random_state=12)),
                              ('rfc', RandomForestClassifier(random_state=12))])
param_grid = {
    'rfc__n_estimators': np.linspace(200, 2000, 10, dtype=int),
    'rfc__criterion': ['gini', 'entropy', 'log_loss'],
    'rfc__max_depth': [None, 2, 5, 10, 30, 50, 70, 90],
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 4],
    'rfc__bootstrap': [True, False],
    'rfc__max_features': [None, 1, 10, 'sqrt'],
    'sm__k_neighbors': [3, 5, 9],
}

# 100 sampled candidates x 3 folds = 300 pipeline fits.
# NOTE(review): forests of up to 2000 trees on the resampled training set make
# this very slow; the last recorded run of this cell ended in KeyboardInterrupt.
gs1 = RandomizedSearchCV(estimator=rfc_pipe1, param_distributions=param_grid,
                         n_iter=100, scoring='f1', cv=3, n_jobs=-1, random_state=12)
rsearch1 = gs1.fit(X_train, y_train)
rsearch1.best_params_

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


KeyboardInterrupt: 

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [None]:
# RandomForestClassifier 2
# Small exhaustive check of `max_features` only.
# Scaling comes before SMOTE so the nearest-neighbour interpolation works on
# standardised features (same order as `data_prep`).
rfc_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('ss', StandardScaler()),
                             ('sm', SMOTE(random_state=12)),
                             ('rfc', RandomForestClassifier(random_state=12))])

param_grid = {
    'rfc__max_features': [None, 1],  # sklearn default: 'sqrt'
}  # 2 candidates x 2 folds = 4 fits
# RandomizedSearchCV has no `param_grid` argument (it takes
# `param_distributions`), so the previous call raised TypeError. With only two
# candidates an exhaustive GridSearchCV is the right tool.
gs = GridSearchCV(estimator=rfc_pipe, param_grid=param_grid, scoring='f1', verbose=4, cv=2, n_jobs=3)
gs.fit(X_train, y_train)

In [None]:
# RandomForestClassifier
# Exhaustive grid over forest and SMOTE hyper-parameters. Both F1 and log-loss
# are recorded; the best-F1 candidate is the one refitted.
# Scaling comes before SMOTE so the nearest-neighbour interpolation works on
# standardised features (same order as `data_prep`).
rfc_pipe = ImPipeline(steps=[('ct', col_transformer),
                             ('ss', StandardScaler()),
                             ('sm', SMOTE(random_state=12)),
                             ('rfc', RandomForestClassifier(random_state=12))])

# Trailing comments give the library default for each parameter.
param_grid = {
    'rfc__n_estimators': [5, 100],  # default: 100
    'rfc__criterion': ['gini', 'entropy', 'log_loss'],  # default: 'gini'
    'rfc__max_depth': [None, 2, 5],  # default: None
    'rfc__min_samples_split': [2],  # default: 2
    'rfc__min_samples_leaf': [1, 5, 10],  # default: 1
    'rfc__max_features': [None, 1, 10, 'sqrt'],  # default: 'sqrt'
    'sm__k_neighbors': [3, 5, 9],  # default: 5
}  # 648 candidates x 5 folds = 3240 fits
# A comma was missing between `refit='f1'` and `verbose=4` (SyntaxError).
gs = GridSearchCV(rfc_pipe, param_grid, scoring=['f1', 'neg_log_loss'], cv=5, refit='f1', verbose=4, n_jobs=3)
