# EEG States
Training a model on manually labelled EEG data to find various states.

In [None]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import run_yasa
import logging
import mne
import yasa
import os
import argparse
import os

import mne
import numpy as np
import pandas as pd
from brainflow.board_shim import BoardShim, BoardIds
from brainflow.data_filter import DataFilter
log = lambda msg: logging.info(msg)


# Load data

In [None]:
input_dir = "C:\\dev\\play\\brainwave-data"
stats_df = pd.read_csv(input_dir + os.path.sep + "stats.csv")


In [70]:
from models.eeg_states.eeg_states import load_events

nights = load_events()

In [72]:
nights["event"].value_counts()

event
tired         187
tired_long     74
wired          48
wired_long     24
Name: count, dtype: int64

## EEG events - debounce
A long event registers as an initial short event

In [68]:
from models.eeg_states.eeg_states import debounce_events

nights = debounce_events(nights)

In [69]:
nights["event"].value_counts()

event
tired         114
tired_long     74
wired          25
wired_long     24
Name: count, dtype: int64

## EEG events - find the duration

In [None]:
from models.eeg_states.eeg_states import find_durations

nights = find_durations(nights)

In [None]:
from models.eeg_states.eeg_states import convert_timestamps

nights = convert_timestamps(nights)

## Load other data

In [None]:
import os

dfs = []

for root, dirs, files in os.walk(input_dir):
    for idx, dir_name in enumerate(dirs):
        input_file = os.path.join(root, dir_name, "raw.post_human.csv")
        if os.path.exists(input_file):
            df = pd.read_csv(input_file)
            dfs.append(df)

yasa_df = pd.concat(dfs, ignore_index=True)

In [None]:
yasa_df_orig = yasa_df.copy()
assert yasa_df.index.is_unique, "Index is not unique"

In [None]:
yasa_df.shape

In [None]:
yasa_df['dayAndNightOf'].value_counts()

In [None]:
yasa_df[yasa_df['Epoch'] == 100].head()

# Prepare data - find if in state

In [None]:
from sleep_events import convert_timestamps_to_uk

convert_timestamps_to_uk(yasa_df, 'Timestamp', 'TimestampUK')

In [None]:
from models.eeg_states.eeg_states import process_row
import pandas as pd
from tqdm import tqdm

# Pre-filter the data as it takes ages
first_timestamp_uk = nights['TimestampUK'].min()
filtered_yasa_df = yasa_df[yasa_df['TimestampUK'] >= first_timestamp_uk]

# Add new columns to yasa_df to store the epoch type
yasa_df = filtered_yasa_df.copy() # defragment
yasa_df['epoch_type'] = None
yasa_df['matched_night_event'] = None

# Iterate over each row in yasa_df
for i, yasa_row in tqdm(yasa_df.iterrows(), total=yasa_df.shape[0]):
    epoch_type, matched_night_event = process_row(yasa_row, nights)
    yasa_df.at[i, 'epoch_type'] = epoch_type
    yasa_df.at[i, 'matched_night_event'] = matched_night_event

In [None]:
yasa_df['epoch_type'].value_counts()

In [None]:
from memory import garbage_collect

garbage_collect(log)

In [None]:
from models.eeg_states.eeg_states_model import model_pipeline

models_and_data = [model_pipeline('main', yasa_df, "epoch_type")]

In [None]:
from models.eeg_states.eeg_states_model import ModelAndData
from sklearn.model_selection import train_test_split

def split(modelAndData: ModelAndData):
    X_train_to_sleep, X_val_to_sleep, y_train_to_sleep, y_val_to_sleep = train_test_split(modelAndData.X, modelAndData.y, test_size=0.2, random_state=42)
    modelAndData.X_train = X_train_to_sleep
    modelAndData.y_train = y_train_to_sleep
    modelAndData.X_val = X_val_to_sleep
    modelAndData.y_val = y_val_to_sleep

for md in models_and_data:
    split(md)
    print(f"Training set size {md.name}: {len(md.X_train)}, validation set size: {len(md.X_val)}")

# Train Catboost model

In [74]:
from catboost import CatBoostClassifier

def train(md: ModelAndData):
    md.model = CatBoostClassifier(
        eval_metric='Logloss',    
        loss_function='CrossEntropy',
        iterations=1000,                # Number of boosting iterations
        learning_rate=0.03,             # Learning rate
        depth=6,                        # Depth of the tree
        l2_leaf_reg=3,                  # L2 regularization term on weights
        early_stopping_rounds=50        # Early stopping rounds
    )

    log(f"Training model for {md.name}")
    md.model.fit(md.X_train, md.y_train, verbose=100)

for md in models_and_data:
    train(md)

0:	learn: 0.6704429	total: 5.72ms	remaining: 5.72s
100:	learn: 0.0578121	total: 422ms	remaining: 3.75s
200:	learn: 0.0155215	total: 812ms	remaining: 3.23s
300:	learn: 0.0073068	total: 1.22s	remaining: 2.83s
400:	learn: 0.0046872	total: 1.58s	remaining: 2.36s
500:	learn: 0.0034136	total: 1.98s	remaining: 1.97s
600:	learn: 0.0027426	total: 2.38s	remaining: 1.58s
700:	learn: 0.0025003	total: 2.79s	remaining: 1.19s
800:	learn: 0.0025003	total: 3.22s	remaining: 801ms
900:	learn: 0.0025003	total: 3.59s	remaining: 394ms
999:	learn: 0.0025003	total: 3.94s	remaining: 0us


## Evaluate the model

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

def evaluate_model(md: ModelAndData, model, X, y):
    # Predict the target values using the trained model
    predictions = model.predict(X)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y, predictions)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y, predictions)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)

    print(f"{md.name} Mean Absolute Error (MAE): {mae}")
    print(f"{md.name} Mean Squared Error (MSE): {mse}")
    print(f"{md.name} Root Mean Squared Error (RMSE): {rmse}")

    return mae, mse, rmse

def evaluate_classification_model(md: ModelAndData, X_train, y_train, X_val, y_val):
    model = md.model
    print("Evaluation for model: ", md.name)
    # Evaluate the model on the training set
    print(f"{md.name} Training Set Evaluation:")
    train_mae, train_mse, train_rmse = evaluate_model(md, model, X_train, y_train)
    
    # Evaluate the model on the validation set
    print(f"{md.name} Validation Set Evaluation:")
    val_mae, val_mse, val_rmse = evaluate_model(md, model, X_val, y_val)

    val_train = model.predict(X_train)

    train_results_df = pd.DataFrame({
        'Actual': y_train,
        'Predicted': val_train
    })

    cm_train = confusion_matrix(train_results_df['Actual'], train_results_df['Predicted'])

    # Plot the confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm_train, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix (training)')
    plt.show()

    
    # Predict the target values using the trained model
    val_predictions = model.predict(X_val)
    print(val_predictions)
    
    # Create a DataFrame with y_val and the predictions
    val_results_df = pd.DataFrame({
        'Actual': y_val,
        'Predicted': val_predictions
    })

    cm_val = confusion_matrix(val_results_df['Actual'], val_results_df['Predicted'])

    # Plot the confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm_val, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix (validation)')
    plt.show()
    
for md in models_and_data:
    evaluate_classification_model(md, md.X_train, md.y_train, md.X_val, md.y_val)
#eval2(model_to_ready_to_sleep, X_train_to_ready_to_sleep, y_train_to_ready_to_sleep, X_val_to_ready_to_sleep, y_val_to_ready_to_sleep)

In [None]:
models_and_data[0].model.predict_proba(models_and_data[0].X_val)

## Most useful features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def importances(md: ModelAndData):
    # Get feature importances
    feature_importances = md.model.get_feature_importance()
    
    # Create a DataFrame to display the feature importances
    feature_names = md.X_train.columns
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    })
    importance_df = importance_df[importance_df['Importance'] > 1]
    
    # Sort the DataFrame by importance
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    
    # Plot the feature importances
    plt.figure(figsize=(12, 8))
    plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(md.name + ' Feature Importances')
    plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
    plt.show()

for md in models_and_data:
    importances(md)

# Save model

In [75]:
for md in models_and_data:
    model_filename = f"{md.name}_catboost_model.cbm"
    md.model.save_model(model_filename)
    print(f"Model saved to {model_filename}")

Model saved to main_catboost_model.cbm
