# Fit and export the sleep staging classifier

In [None]:
import os
import glob
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
# Scale up the fonts of the seaborn/matplotlib figures
sns.set(font_scale=1.25)

# Input / output folders, all located under the current working directory
parent_dir = os.getcwd()
wdir = f"{parent_dir}/output/features/"
wdir_demo = f"{parent_dir}/output/demo/"
outdir = f"{parent_dir}/output/classifiers/"

# Stop right away if one of the expected folders does not exist
assert all(os.path.isdir(folder) for folder in (wdir, wdir_demo, outdir))

## Load the feature files

**Method 1: Loop across all datasets**

In [None]:
# feat_files = glob.glob(wdir + "features_nsrr*.parquet")

# df = []
# for f in tqdm(feat_files):
#     # Load current file
#     print(f)
#     tmp = pd.read_parquet(f)
#     # Convert dtypes and downcast float
#     tmp['age'] = tmp['age'].astype('int8')
#     tmp['male'] = tmp['male'].astype('category')
#     cols_float = tmp.select_dtypes(np.float64).columns.tolist()
#     tmp[cols_float] = tmp[cols_float].astype(np.float32)
#     # Append to main dataframe and delete tmp
#     df.append(tmp)
#     del tmp
    
# df = pd.concat(df)
# print("There are %i unique nights" % df.index.get_level_values(0).nunique())
# df.head().round(2)

In [None]:
# Save the concatenated file
# df.to_parquet(wdir + "features_all.parquet")

**Method 2: Concatenated file**

Method 1 must be run at least once (and its output saved to `features_all.parquet`) before this method can be used.

In [None]:
# Load the pre-saved concatenated feature file instead of looping over datasets
df = pd.read_parquet(os.path.join(wdir, "features_all.parquet"))

# print("There are %i unique nights" % df.index.get_level_values(0).nunique())
# df.head().round(2)

In [None]:
# Summary statistics of `eeg_iqr` per dataset, to check the units of each dataset
eeg_iqr_by_dataset = df.groupby('dataset')['eeg_iqr']
eeg_iqr_by_dataset.describe().round(2)

### Add demographics (race, BMI)

In [None]:
# Load the demographics table, without the columns that `df` already provides
df_demo = pd.read_csv(wdir_demo + "demo_nsrr_all.csv").drop(columns=['male', 'age'])
df_demo

In [None]:
# One row per first-level index value of `df`, keeping the first age / male / dataset
grp_subj = (
    df.groupby(level=0, as_index=True)[['age', 'male', 'dataset']]
    .first()
    .reset_index()
)

# Harmonize the columns before merging with the demographics table
grp_subj['subj'] = grp_subj['subj'].astype(str)
grp_subj['dataset'] = grp_subj['dataset'].str.upper()
grp_subj['dataset'] = grp_subj['dataset'].replace({'SHHS1': 'SHHS'})

# Left merge: only the rows of the training set are kept
grp_subj = (
    grp_subj.merge(df_demo, how="left")
    .sort_values(by=['dataset', 'subj'])
    .reset_index(drop=True)
)
grp_subj

### Descriptive statistics of the training set

In [None]:
# Count the missing values in each column
grp_subj.isnull().sum()

In [None]:
# How many nights each dataset contributes (not sorted by count)
nights_per_dataset = grp_subj['dataset'].value_counts(sort=False)
nights_per_dataset

In [None]:
# Total number of rows in the feature table
len(df)

In [None]:
# Number of hours = number of epochs / 120
# (presumably 30-sec epochs, i.e. 120 epochs per hour — TODO confirm)
len(df) / 120

In [None]:
# Relative frequency of each value of `male`
male_props = grp_subj['male'].value_counts(normalize=True)
male_props

In [None]:
# Plot age distribution
def mean_std(x):
    """Print a one-line numeric summary of `x`.

    The line contains the mean ± standard deviation, followed by the minimum,
    median and maximum, all formatted with two decimals.

    Parameters
    ----------
    x : object exposing ``mean``, ``std``, ``min``, ``median`` and ``max``
        methods (e.g. a :py:class:`pandas.Series`).

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    # The median uses the same `.2f` format as the other statistics so that
    # the printed summary is consistent.
    print(
        f"{x.mean():.2f} ± {x.std():.2f} "
        f"(min = {x.min():.2f}, median = {x.median():.2f}, max = {x.max():.2f})"
    )

# Print the summary of age, then draw its histogram
ages = grp_subj['age']
ages.agg(mean_std)
ages.hist()
plt.xlabel("Age");

In [None]:
# Print the summary statistics of the `ahi` column
grp_subj['ahi'].aggregate(mean_std)

In [None]:
# Print the summary statistics of the `bmi` column
grp_subj['bmi'].aggregate(mean_std)

In [None]:
# Percentage of each ethnicity (proportions rounded to 3 decimals, then x100)
grp_subj['ethnicity'].value_counts(normalize=True).round(3).mul(100)

### Create different combinations of predictors

In [None]:
# Group the columns of `df` by the prefix of their name
cols_all = df.columns
cols_time = [col for col in cols_all if col.startswith('time_')]
# The EEG set also carries the time columns (appended after the `eeg_` ones)
cols_eeg = [col for col in cols_all if col.startswith('eeg_')] + cols_time
cols_eog = [col for col in cols_all if col.startswith('eog_')]
cols_emg = [col for col in cols_all if col.startswith('emg_')]
cols_demo = ['age', 'male']

In [None]:
# Collapse the 5 sleep stages into 4 classes (N1 and N2 are merged into LIGHT).
# The class weights and the exported file names (`*_4classes.joblib`) used
# further down assume this 4-class mapping: if this cell is disabled, make
# sure to change both accordingly!
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['stage']`: an in-place call on a column
# selection is a chained assignment, which leaves `df` unchanged when pandas
# copy-on-write is enabled.
df['stage'] = df['stage'].replace(
    {'N1': 'LIGHT', 'N2': 'LIGHT', 'N3': 'DEEP', 'R': 'REM', 'W': 'WAKE'})

In [None]:
# Columns used by each combination of predictors (insertion order matters:
# the models are fitted in this order)
feature_sets = {
    'eeg': cols_eeg,
    'eeg+demo': cols_eeg + cols_demo,
    'eeg+eog': cols_eeg + cols_eog,
    'eeg+eog+demo': cols_eeg + cols_eog + cols_demo,
    'eeg+eog+emg': cols_eeg + cols_eog + cols_emg,
    'eeg+eog+emg+demo': cols_eeg + cols_eog + cols_emg + cols_demo,
}

# One predictor matrix per combination
X_all = {label: df[columns] for label, columns in feature_sets.items()}

# Target (sleep stage) and groups (first level of the index of `df`)
y = df['stage']
subjects = df.index.get_level_values(0).to_numpy()

In [None]:
# Write the column names of the full model to "features.csv", one per row
full_model_columns = X_all['eeg+eog+emg+demo'].columns
features = pd.Series(full_model_columns, name="Features")
features.to_csv("features.csv", index=False)

In [None]:
# Horizontal bar plot of the proportion of each sleep stage in the target
stage_props = y.value_counts(normalize=True)
stage_props.plot.barh(xlabel="Stage", ylabel="Proportion");

In [None]:
# Hyper-parameters passed to every LGBMClassifier fitted in this notebook
params = {
    'boosting_type': 'gbdt',
    'n_estimators': 300,
    'max_depth': 7,
    'num_leaves': 70,
    'colsample_bytree': 0.8,
    'importance_type': 'gain',
}

In [None]:
# from sklearn.utils.class_weight import compute_class_weight
# compute_class_weight('balanced', np.unique(y), y)

# Class-weight strategy: None, "balanced", or "custom" (manually defined)
# class_weight = None
# class_weight = "balanced"
class_weight = "custom"

# "custom" is expanded to explicit per-class weights; any other value is
# passed to the classifier unchanged.
# See output/classifiers/gridsearch_class_weights_4classes.csv
params['class_weight'] = (
    {'LIGHT': 1, 'DEEP': 1.2, 'REM': 1.2, 'WAKE': 1}
    if class_weight == "custom"
    else class_weight
)

****

## Fit the training set and export the trained classifier

In [None]:
# Use 12 parallel jobs when building the trees.
params['n_jobs'] = 12

# Fit and export one classifier per combination of predictors
for name, X in tqdm(X_all.items()):

    # Skip to full model
    # if name != "eeg+eog+emg+demo":
    #    continue

    # Fit on the whole training set
    clf = LGBMClassifier(**params)
    clf.fit(X, y)

    # Accuracy on the training dataset: shouldn't be too high..!
    n_features = X.shape[1]
    train_acc = clf.score(X, y)
    print(f"{name} ({n_features:d} features) - training accuracy: {train_acc:.3f}")

    # Output file name: the class-weight strategy is part of the name only
    # when class weights are used
    boosting = params['boosting_type']
    if params['class_weight'] is None:
        basename = f"clf_{name}_lgb_{boosting}_4classes.joblib"
    else:
        basename = f"clf_{name}_lgb_{boosting}_{class_weight}_4classes.joblib"
    fname = outdir + basename

    # Export the trained classifier
    joblib.dump(clf, fname, compress=True)

    # Export LGBM feature importance
    # df_imp = pd.Series(clf.feature_importances_, index=clf.feature_name_, name='Importance').round()
    # df_imp.sort_values(ascending=False, inplace=True)
    # df_imp.index.name = 'Features'
    # df_imp.to_csv(fname[:-7] + ".csv")