In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

In [11]:
# only to test models
# Load the raw cardiovascular disease dataset.
df = pd.read_csv("data/raw/Cardiovascular_Disease_Dataset/Cardiovascular_Disease_Dataset.csv")

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)

# Separate the features from the 'target' column for each split.
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
X_test = test_df.drop(columns=['target'])
y_test = test_df['target']

# Column groups, one per preprocessing strategy.
binary = ['gender', 'fastingbloodsugar', 'exerciseangia']   # passed through unchanged
ohe = ['chestpain', 'restingrelectro']                      # one-hot encoded
numerical = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak', 'noofmajorvessels']  # standardized
ordinal = ['slope']                                         # ordinal encoded
drop = ['patientid']                                        # excluded from the feature matrix

preprocessor = make_column_transformer(
    (StandardScaler(), numerical),
    # handle_unknown="ignore": a category that is absent from a training fold
    # but shows up in a validation fold or in the test set is encoded as all
    # zeros instead of raising an error during transform.
    (OneHotEncoder(handle_unknown="ignore"), ohe),
    (OrdinalEncoder(), ordinal),
    ('passthrough', binary),
    ('drop', drop)
)

In [12]:
# Function adapted from DSCI 571
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross validation and summarize every numeric result as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate`` (e.g. ``cv``,
        ``return_train_score``)

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)" with three
        decimals, indexed by the numeric keys returned by ``cross_validate``
    """

    # Build the per-fold table once and reuse it for both statistics.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    # numeric_only=True skips non-numeric entries (for example the fitted
    # estimators returned when return_estimator=True), which would otherwise
    # make the aggregation fail.
    mean_scores = scores.mean(numeric_only=True)
    std_scores = scores.std(numeric_only=True)

    # Both Series come from the same numeric columns, so they line up 1:1.
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [13]:
# Candidate classifiers, keyed by the label used as the row name in the results table.
models = dict([
    ("dummy clf", DummyClassifier(strategy="most_frequent")),
    ("decision tree", DecisionTreeClassifier(random_state=123)),
    ("RBF SVM", SVC(random_state=123)),
    ("Logistic Regression", LogisticRegression(max_iter=2000, random_state=123)),
])

# Formatted cross-validation summary for each candidate.
results_dict = dict()

for name, model in models.items():
    # Chain the shared preprocessor with the classifier so the preprocessing
    # is fitted inside each cross-validation fold.
    pipe = make_pipeline(preprocessor, model)
    results_dict[name] = mean_std_cross_val_scores(
        pipe,
        X_train,
        y_train,
        cv=5,
        return_train_score=True,
    )

# Transpose so each model is a row and each metric a column.
results_df = pd.DataFrame(results_dict).T
results_df

Unnamed: 0,fit_time,score_time,test_score,train_score
dummy clf,0.007 (+/- 0.005),0.003 (+/- 0.001),0.565 (+/- 0.003),0.565 (+/- 0.001)
decision tree,0.004 (+/- 0.002),0.002 (+/- 0.000),0.954 (+/- 0.014),1.000 (+/- 0.000)
RBF SVM,0.007 (+/- 0.001),0.003 (+/- 0.000),0.959 (+/- 0.009),0.984 (+/- 0.002)
Logistic Regression,0.006 (+/- 0.002),0.002 (+/- 0.000),0.956 (+/- 0.014),0.969 (+/- 0.003)
