In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.impute import SimpleImputer
# Seed NumPy's global RNG so the stochastic steps that follow start from a known state.
np.random.seed(34)

arrhythmia_df = pd.read_csv('1_data_exploration_arrhythmia.csv')
print(arrhythmia_df.shape)

# Distribution of the two-class target.
# NOTE(review): 'Class_binary' presumably already holds classes 2-16 collapsed
# into a single 'Arrhythmia' label (done in part 1) -- nothing is collapsed here.
sns.histplot(arrhythmia_df['Class_binary'])

# Feature matrix: everything from part 1 except the target columns and the
# categorical copy of Sex.
non_feature_columns = ['Class', 'Class_binary', 'Sex_categorical']
X = arrhythmia_df.drop(columns=non_feature_columns)
# Imputation is currently disabled; re-enable the line below if X contains gaps.
#X = SimpleImputer(strategy='most_frequent').fit_transform(X)

# Target vector: 0 = 'No Arrhythmia', 1 = 'Arrhythmia' (order given by `classes`).
y_column = label_binarize(arrhythmia_df['Class_binary'], classes=['No Arrhythmia', 'Arrhythmia'])
y = y_column.ravel()

arrhythmia_df.head()

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer, recall_score


# Arrhythmia is a serious condition, so classifiers are scored by recall.
# NOTE(review): with average='weighted' on a two-class target the score is
# numerically the same as accuracy; if sensitivity on the Arrhythmia class is
# the real goal, average='binary' is the variant to use -- confirm intent.
recall_scorer = make_scorer(recall_score, average='weighted')

# Standardising does not depend on the estimator, so do it once, outside the loop.
# NOTE(review): the scaler is fitted on every row before cross-validation, so
# each CV fold sees statistics computed from its own validation rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Run recursive feature elimination with two different linear models and
# collect, per model, the column indices of the features it decided to keep.
best_feature_indices = []
for clf in [LogisticRegression(), LinearSVC()]:
    selector = RFECV(estimator=clf, cv=5, step=1, scoring=recall_scorer)
    selector.fit(X_scaled, y)

    # RFECV gives rank 1 to the features it selects; any rank >= 2 was eliminated.
    rank1_feature_indices = np.where(selector.ranking_ == 1)[0]
    best_feature_indices.append(rank1_feature_indices)

# Keep only the features that both models agree on.
features_indices_intersection = np.intersect1d(best_feature_indices[0], best_feature_indices[1])
print(features_indices_intersection)

# TODO: map these indices back to column names (X.columns[features_indices_intersection]),
# build the reduced feature matrix from them, and re-inspect its covariance matrix.
# The classification cells currently still train on the full X.

After viewing the features' covariance matrix, I considered PCA. However, because feature elimination already removes a chunk of the highly correlated features, I will first investigate classification performance on the existing features and keep PCA as an alternative in case the results are unsatisfactory.

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 25% of the rows for the final evaluation.
# random_state pins the shuffle (same seed as np.random.seed in the first cell),
# so re-running this cell always produces the same split instead of depending
# on how much of the global RNG earlier cells have consumed.
split_dict = {'test_size': 0.25, 'shuffle': True, 'random_state': 34}

# stratify=y keeps the class proportions equal in the train and test parts,
# which matters when the evaluation metric is recall.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_dict)

# Hyper-parameter grid shared by the linear classifiers; the 'clf__' prefix
# addresses the pipeline step named 'clf'.
# NOTE: LogisticRegression only supports dual=True with solver='liblinear';
# with any other solver those grid points fail to fit.
param_grid = {
    'clf__dual': [True, False],
    'clf__tol': [1e-3, 1e-4, 1e-5],
    'clf__C': [0.1, 1, 10],
    'clf__class_weight': [None, 'balanced'],
}
gridsearch_params = {'param_grid': param_grid, 'scoring': recall_scorer,
                     'return_train_score': True}

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


# Linear models are scale-sensitive, so each one sits behind a StandardScaler
# inside a Pipeline; the scaler is then re-fitted on the training part of every
# CV fold during the grid search.
svc_pipe = Pipeline([('sc', StandardScaler()), ('clf', LinearSVC(random_state=34))])
# solver='liblinear' is required because the shared grid tries clf__dual=True,
# which the default solver rejects; without it half of the logistic-regression
# candidates fail to fit and are scored as NaN.
lr_pipe = Pipeline([('sc', StandardScaler()),
                    ('clf', LogisticRegression(solver='liblinear', random_state=34))])

# Untuned random-forest baseline for comparison (trees need no scaling).
rf = RandomForestClassifier(random_state=34)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(recall_score(y_test, y_pred, average='weighted'))

pipeline_dict = {'svc': svc_pipe, 'lr': lr_pipe}

# Tune each pipeline on the training split, then report recall on the held-out split.
for classifier_variation, estimator in pipeline_dict.items():
    search = GridSearchCV(estimator=estimator, **gridsearch_params)
    print(classifier_variation)
    search.fit(X_train, y_train)
    # GridSearchCV refits the best parameter set on all of X_train by default,
    # so predict() already uses the best estimator.
    y_pred = search.predict(X_test)
    print(recall_score(y_test, y_pred, average='weighted'))
        

    
