### Classification
Notebook to use and test the machine learning models.

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold,cross_val_score, cross_validate,RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix,make_scorer, balanced_accuracy_score,recall_score,roc_auc_score, roc_curve
from sklearn.utils.class_weight import compute_sample_weight
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
import graphviz
import random
import pickle
import os
import gc
from scipy.stats import entropy
from sgkf_split import StratifiedGroupKFold     # Locally defined .py file.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and later
# releases dropped the old aliases, so use whichever of the two names this install provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

In [None]:
def print_results(y_test, y_pred, label_names):
    """Print the confusion matrix, then the per-class classification report.

    y_test      -- true labels
    y_pred      -- predicted labels
    label_names -- display names for the classes, forwarded as ``target_names``
    """
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)
    report = classification_report(y_test, y_pred, target_names=label_names)
    print(report)

### Read data

In [None]:
# Every model input is read from the Data/Final directory.
final_dir = os.path.join('Data', 'Final')

# Labels are cast to integers (1 and 0 rather than True or False) and flattened to 1-D.
y = np.load(os.path.join(final_dir, 'y.npy')).astype(int).ravel()
ids = np.load(os.path.join(final_dir, 'ids.npy'))
X = np.load(os.path.join(final_dir, 'X.npy'))

# The first column of ids is the grouping key passed to the grouped cross-validation.
groups = ids[:, 0]

# Column names are stored one per line.
# NOTE(review): presumably in the same order as the columns of X -- confirm.
with open(os.path.join(final_dir, 'col_names.txt')) as names_file:
    col_names = names_file.read().splitlines()

### Decision tree with K-fold CV
Use the regular stratified K-fold approach.

In [None]:
# Baseline model: depth-limited decision tree with balanced class weights.
mod_basic = DecisionTreeClassifier(
    max_depth=10,
    class_weight='balanced',
    random_state=1,
)

# Specificity is scored as recall on the negative class (label 0).
scoring = {
    'balanced_accuracy': 'balanced_accuracy',
    'sensitivity': 'recall',
    'specificity': make_scorer(recall_score, pos_label=0),
}

# Plain 5-fold cross-validation (no grouping); keep the fitted tree of every fold.
mod_basic_output = cross_validate(
    mod_basic,
    X,
    y,
    cv=5,
    scoring=scoring,
    return_estimator=True,
)
print([
    mod_basic_output[key]
    for key in ['test_balanced_accuracy', 'test_sensitivity', 'test_specificity']
])

# Store the scores together with the fitted estimators.
with open(os.path.join('Data', 'Final', 'mod_basic_1.pkl'), 'wb') as f:
    pickle.dump(mod_basic_output, f)

In [None]:
# `model` was not defined anywhere in the notebook, so this cell failed on a fresh kernel.
# Bind it explicitly to the tree fitted on the first fold of the basic cross-validation.
# NOTE(review): the first fold is an assumption -- pick another index if a different tree was intended.
model = mod_basic_output['estimator'][0]

# Draw only the root and its direct children, labelling the splits with the real column names.
plot_tree(model, max_depth=1, feature_names=col_names)

# Look up the names of a few hand-picked feature indices.
for i in [5,6,124,12,112,1]:
    print(col_names[i])

In [None]:
tree.export_graphviz(model, out_file='tree_cut.dot', max_depth=2, feature_names=feature_cols, \
                             class_names=['No AF','AF'], filled=True)
!dot -Tpng tree_cut.dot -o tree_cut.png   

### Stratified Group K-Fold Cross Validation

#### Decision Tree
First use a decision tree to compare the results of the two K-fold methods.

In [None]:
from sgkf_split import StratifiedGroupKFold     # Locally defined .py file, taken from sklearn Github.

In [None]:
# Splitter that is given the group labels in addition to the class labels.
cv = StratifiedGroupKFold(n_splits=5)

# Same tree configuration as the basic model, apart from the random seed.
mod_grouped = DecisionTreeClassifier(
    max_depth=10,
    class_weight='balanced',
    random_state=15,
)

# Specificity is scored as recall on the negative class (label 0).
scoring = {
    'balanced_accuracy': 'balanced_accuracy',
    'sensitivity': 'recall',
    'specificity': make_scorer(recall_score, pos_label=0),
}

dt_grouped_output = cross_validate(
    mod_grouped,
    X,
    y,
    groups=groups,
    cv=cv,
    scoring=scoring,
    verbose=3,
    return_estimator=True,
)
print([
    dt_grouped_output[key]
    for key in ['test_balanced_accuracy', 'test_sensitivity', 'test_specificity']
])

# Store the scores together with the fitted estimators.
with open(os.path.join('Data', 'Final', 'dt_grouped_1.pkl'), 'wb') as f:
    pickle.dump(dt_grouped_output, f)

### XGBoost

#### Exploration
First run a number of models with different parameters to gauge risk of overfitting and the training times.

In [None]:
# Account for class imbalance by weighting the positive class by (# negative) / (# positive).
n_positive = np.sum(y)
scale_pos_weight = (len(y) - n_positive) / n_positive

# Exploratory model: 200 boosting rounds, with row and column subsampling.
xgb = XGBClassifier(
    learning_rate=0.3,
    n_estimators=200,
    objective='binary:logistic',
    nthread=1,
    scale_pos_weight=scale_pos_weight,
    colsample_bytree=0.5,
    subsample=0.8,
)

In [None]:
# Take the first train/test split produced by the grouped CV splitter.
train_indx, test_indx = next(cv.split(X, y, groups))

# Train on a random quarter of the training rows to keep the exploration fast.
# A dedicated, seeded generator makes the subsample identical on every run; the original
# drew from the unseeded global RNG, so results changed between kernel restarts.
subsample_rng = random.Random(1)
train_sub_indx = subsample_rng.sample(list(train_indx), len(train_indx) // 4)

X_train, X_test = X[train_sub_indx], X[test_indx]
y_train, y_test = y[train_sub_indx], y[test_indx]

In [None]:
# Evaluate AUC on both the training subsample and the held-out fold while fitting.
# NOTE(review): recent XGBoost releases expect eval_metric on the estimator constructor
# rather than in fit() -- confirm against the installed version.
eval_set = [
    (X_train, y_train),
    (X_test, y_test),
]
xgb.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="auc",
    verbose=True,
)

#### XGBoost with Group K-fold

In [None]:
# Model for the full grouped cross-validation: 250 boosting rounds and verbose logging.
xgb = XGBClassifier(
    learning_rate=0.3,
    n_estimators=250,
    objective='binary:logistic',
    nthread=1,
    scale_pos_weight=scale_pos_weight,
    colsample_bytree=0.5,
    subsample=0.8,
    verbosity=3,
)

# Same metrics as for the decision trees, plus ROC AUC.
scoring = {
    'balanced_accuracy': 'balanced_accuracy',
    'sensitivity': 'recall',
    'specificity': make_scorer(recall_score, pos_label=0),
    'roc_auc': 'roc_auc',
}

# Keep the fitted estimator of every fold for the evaluation cells further down.
cv_output = cross_validate(
    xgb,
    X,
    y,
    groups=groups,
    cv=cv,
    scoring=scoring,
    verbose=10,
    return_estimator=True,
)
cv_output

In [None]:
# Persist the cross-validation output (scores and models) to disk.
results_path = os.path.join('Data', 'Final', 'XGB_results_1.pkl')
with open(results_path, 'wb') as results_file:
    pickle.dump(cv_output, results_file)
# To read the results back in a later session:
# with open(results_path, 'rb') as results_file:
#     cv_output_from_file = pickle.load(results_file)

In [None]:
# Print the per-fold value of each metric; the tabs in the labels line up the arrays.
metric_labels = [
    ('test_balanced_accuracy: ', 'test_balanced_accuracy'),
    ('test_sensitivity: \t', 'test_sensitivity'),
    ('test_specificity: \t', 'test_specificity'),
    ('test_roc_auc: \t\t', 'test_roc_auc'),
]
for label, metric_key in metric_labels:
    print(label, cv_output[metric_key])

#### Model Evaluation
First get more details on one of the models - take the model with median balanced accuracy, which is the second model.

In [None]:
# Select the fold whose balanced accuracy is the median of the CV scores. The original
# hard-coded index 1, which is only the median fold for one particular set of scores.
fold_scores = np.asarray(cv_output['test_balanced_accuracy'])
median_fold = int(np.argsort(fold_scores)[len(fold_scores) // 2])

# Regenerate the K-fold splits to recover the held-out rows of that fold.
# NOTE(review): assumes cv.split yields the same folds that cross_validate used -- confirm
# the splitter is deterministic.
cv_list = list(cv.split(X, y, groups))
xgb_train_idx, xgb_test_idx = cv_list[median_fold]

X_test_xgb, y_test_xgb = X[xgb_test_idx], y[xgb_test_idx]

# Predicted labels and positive-class probabilities from the model fitted on that fold.
median_model = cv_output['estimator'][median_fold]
y_pred_xgb = median_model.predict(X_test_xgb)
y_pred_xgb_p = median_model.predict_proba(X_test_xgb)[:, 1]

In [None]:
# Confusion matrix plus precision and recall for the selected fold.
print_results(y_test_xgb, y_pred_xgb, label_names=['No AF', 'AF'])

#### ROC Curve

In [None]:
# ROC curve computed from the positive-class probabilities of the selected fold.
fpr, tpr, thresholds = roc_curve(y_test_xgb, y_pred_xgb_p)

roc_ax = plt.gca()
roc_ax.plot(fpr, tpr)
# Zoom in on the low false-positive-rate end of the curve.
roc_ax.set_xlim(0, 0.02)

roc_ax.set_xlabel('FPR (1-Specificity)', fontsize=16)
roc_ax.set_ylabel('TPR (Sensitivity)', fontsize=16)
for tick_kind, tick_size in (('major', 12), ('minor', 10)):
    roc_ax.tick_params(axis='both', which=tick_kind, labelsize=tick_size)

#### Feature Importance
Get average feature importance across the 5 models in the cross validation.

In [None]:
def mean_imp(imp, estimators=None):
    """Average one XGBoost importance type over a set of fitted models.

    Parameters
    ----------
    imp : str
        Importance type forwarded to ``get_score`` (e.g. 'weight', 'gain').
    estimators : iterable, optional
        Fitted models exposing ``get_booster()``. When omitted, every estimator stored in
        the notebook-level ``cv_output['estimator']`` is used, however many folds were run
        (the original hard-coded exactly five).

    Returns
    -------
    pandas.Series
        Mean score per feature key. A feature absent from one model's scores is NaN for
        that model and is skipped by the mean rather than counted as zero.
    """
    if estimators is None:
        estimators = cv_output['estimator']
    per_model_scores = [
        est.get_booster().get_score(importance_type=imp) for est in estimators
    ]
    return pd.DataFrame(per_model_scores).mean()

# Importance types to average across the cross-validation models.
imp_type = ['weight', 'gain', 'cover', 'total_cover', 'total_gain']

# One column per importance type, one row per feature key.
mean_importance_df = pd.DataFrame()
for importance_name in imp_type:
    averaged_scores = mean_imp(importance_name)
    mean_importance_df[importance_name] = averaged_scores

In [None]:
# The importance table is indexed by keys of the form 'f<i>'; map each key to the i-th column name.
keys = [f'f{i}' for i in range(len(col_names))]
names_dict = {key: name for key, name in zip(keys, col_names)}

# Attach the readable column name to every row of the importance table.
mean_importance_df['col_name'] = list(map(names_dict.__getitem__, mean_importance_df.index))

In [None]:
# The twenty features with the highest mean gain.
top_20 = mean_importance_df.sort_values(by='gain', ascending=False).head(20)

# Two horizontal bar charts sharing the feature axis: gain on the left, weight on the right.
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(8, 8))

panels = [
    ('gain', 'Gain', 'tab:blue'),
    ('weight', 'Weight', 'tab:brown'),
]
for panel_ax, (column, xlabel, colour) in zip(ax, panels):
    panel_ax.barh(top_20['col_name'], top_20[column], color=colour, alpha=0.9)
    panel_ax.set_xlabel(xlabel, fontsize=16)
    panel_ax.tick_params(axis='both', which='major', labelsize=12)
    panel_ax.tick_params(axis='both', which='minor', labelsize=10)

# Only the left panel carries the shared y-axis label.
ax[0].set_ylabel('Feature Name', fontsize=16)
# Flip the y-axis so the first (highest-gain) row is drawn at the top.
ax[1].invert_yaxis()

fig.tight_layout()