## Intended purpose:
Demonstrate the effect of changing the window size on the CNN baseline.
Change the validation dataset to find out how the validation set is sampled.

## Issues fixed in this notebook:


In [None]:
# Install the experiment-tracking (wandb) and hyperparameter-search (keras-tuner)
# packages into the runtime. NOTE(review): versions are not pinned here.
!pip install wandb -qU
!pip install keras-tuner
import wandb
# Log this session in to Weights & Biases before any run is created.
wandb.login()

In [2]:
import os
from typing import Dict
import warnings

from google.colab import drive
import numpy as np
import pandas as pd
import keras
from keras import Sequential
from keras.layers import (
    Conv1D,
    MaxPooling1D,
    LSTM,
    Dense,
    ReLU,
    Dropout,
    GlobalAveragePooling1D,
    BatchNormalization,
)
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical, plot_model
from keras.metrics import Precision, Recall, AUC, F1Score
from keras.losses import CategoricalCrossentropy
from sklearn.metrics import confusion_matrix, classification_report

import keras_tuner
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

warnings.filterwarnings("ignore")

In [None]:
# Recordings held out for testing: runs ADL4 and ADL5 of subjects S1 to S4.
test_filenames = [
    f"S{subject}-ADL{adl_run}_sensors_data.txt"
    for subject in range(1, 5)
    for adl_run in (4, 5)
]

# Experiment settings read throughout the notebook and attached to the W&B run.
config = {
    "objective": "CNN-LSTM check - Ordonez and Roggen's method",
    "architecture": "CNN combined with LSTMs",
    "method_name": "Ordonez_Roggen",
    "dataset": "Opportunity",
    "empty_fill_method": "linear",
    "learning_rate": 0.0001,
    "window_size": 16,
    "epochs": 20,
    "batch_size": 128,
    "results_directory": "results",
    "validation_split": 0.1,
    "test_filenames": test_filenames,
    "project_name": "opportunity",
    "num_classes": 5,
}

# Start the W&B run that all later `wandb.log` calls report to.
run = wandb.init(project="opportunity-sensors", config=config)

In [36]:
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint
from keras.callbacks import ModelCheckpoint

# Callbacks handed to `model.fit`.
# Keras expands the checkpoint path with `str.format`, so the epoch placeholder
# must use single braces. With doubled braces in a plain (non-f) string the
# placeholder is emitted literally and every epoch overwrites the same file.
wandb_callbacks = [
    WandbMetricsLogger(),
    WandbModelCheckpoint(filepath=config.get("method_name") + "_{epoch:02d}.keras"),
    # todo: define these callbacks elsewhere
    # EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    # Keeps only the weights with the lowest validation loss seen so far.
    ModelCheckpoint(filepath="best_model.keras", save_best_only=True, monitor="val_loss"),
]

# Pass `wandb_callbacks` as the `callbacks=` argument when training the model.

In [None]:
# Mount Google Drive so the raw recordings and the YAML configuration files
# stored there can be read from the Colab runtime.
drive.mount("/content/drive")
data_path = "/content/drive/My Drive/opportunity_raw/"
config_path = "/content/drive/My Drive/opportunity_config/"
# Display the directory contents as a quick check that the data is reachable.
os.listdir(data_path)

In [6]:
def load_variable_names(filename):
    """Parse the YAML file at `filename` and return its contents."""
    with open(filename, "r") as yaml_handle:
        return yaml.safe_load(yaml_handle)

In [7]:
# Load, in this order: all column names of the raw files, the column subset
# that is kept when reading a file, and the columns used as model features.
column_names, locomotion_set, body_features = (
    load_variable_names(os.path.join(config_path, yaml_name))
    for yaml_name in (
        "unique_column_names.yaml",
        "locomotion_set.yaml",
        "body_features.yaml",
    )
)
assert type(locomotion_set) is list, "Yaml not loaded correctly"

Load data


In [8]:
def get_file(filepath: str):
    """
    Read one space-delimited, header-less data file and keep only the columns
    listed in `locomotion_set`.
    """
    full_frame = pd.read_csv(
        filepath,
        delimiter=" ",
        names=column_names,
        header=None,
    )
    return full_frame[locomotion_set]

In [9]:
def remove_missing_values(df: pd.DataFrame, method="linear", order=None):
    """
    Handle missing values in the dataset.

    Supported values of `method`:
        drop   - drop every row that contains a missing value.
        spline - spline interpolation of the given `order`.
        any other value (the default is "linear") is forwarded unchanged to
        `DataFrame.interpolate`.

    Interpolation is applied in both directions, so gaps at the start and end
    of a column are filled as well (for "linear" they take the nearest valid
    value). Interpolation runs on every column of `df`.
    NOTE(review): a column with no valid value at all has nothing to
    interpolate from and presumably stays NaN - confirm if that can occur.

    Params:
    - df (Pandas Dataframe): Dataframe from which missing values need to be removed.
    - method (str): Method used to remove missing values (drop, linear, spline, ...).
    - order (int): Order of the spline; required when method is "spline".

    Returns:
    - A pandas dataframe with the missing values dropped or interpolated.
      An empty dataframe is returned unchanged.

    Raises:
    - ValueError: if method is "spline" and no order is given.
    """
    # todo allow interpolation only if certain number of columns have missing values
    if df.empty:
        return df
    if method == "drop":
        return df.dropna(axis=0)
    if method == "spline":
        if order is None:
            raise ValueError("An integer `order` is required for spline interpolation")
        # Fill in both directions, like the generic path below, so that leading
        # gaps do not survive the spline fit.
        return df.interpolate(method=method, order=order, limit_direction="both")
    return df.interpolate(method=method, limit_direction="both")

In [10]:
def apply_sliding_window_combine(
    filepath: str, target_var: str = "Locomotion", window_size=16, overlap=0.5
):
    """
    Apply sliding window transforms to features and target of the given file.

    The window length stored under `config["window_size"]` takes precedence;
    the `window_size` argument is only used when the config has no such entry.
    The stride between windows is derived from the window length that is
    actually used, so changing the configured window size keeps the requested
    overlap.

    Params:
    - filepath (str): Path of the csv file that data is contained in.
    - target_var (str): Target variable chosen for the given csv file.
    - window_size (int): Fallback size of one sliding window (see above).
    - overlap (float): Fraction (0 <= overlap < 1) shared by two consecutive windows.

    Returns:
    - A numpy array of shape (samples, window_size, len(body_features)).
    - A numpy array of shape (samples, window_size) holding the target value
      of every time step in each window.

    Raises:
    - TypeError: if `target_var` is not a single string.
    - ValueError: if the window size is below 2 or the overlap leaves no stride.
    """
    # todo: allow alternative saving as a pandas dataframe.

    if not isinstance(target_var, str):
        raise TypeError(
            "Unexpected target variable passed, check for none or multiple target"
        )

    configured_size = config.get("window_size")
    if configured_size is not None:
        window_size = configured_size
    if window_size <= 1:
        raise ValueError("Window size cannot be less than 2, fix config")

    # Derive the stride only after the final window size is known.
    shift_by = int(window_size * (1 - overlap))
    if shift_by < 1:
        # A zero stride would never advance the window and loop forever.
        raise ValueError("Overlap leaves no stride between windows, lower the overlap")

    df = get_file(filepath)
    df = remove_missing_values(df, method=config.get("empty_fill_method"))

    def sliding_window_processing(data, window_size, shift_by):
        """
        Cut a 1-D array into windows taken every `shift_by` steps.
        Shape of returned array is (samples, window_size); a trailing partial
        window is discarded.
        """
        windows = [
            data[start : start + window_size]
            for start in range(0, len(data) - window_size + 1, shift_by)
        ]
        if not windows:
            # Keep the rank stable for recordings shorter than one window so the
            # per-file results can still be concatenated.
            return np.empty((0, window_size), dtype=data.dtype)
        return np.array(windows)

    combined_data = np.stack(
        [
            sliding_window_processing(df[feature].values, window_size, shift_by)
            for feature in body_features
        ],
        axis=-1,
    )
    target = sliding_window_processing(df[target_var].values, window_size, shift_by)

    return combined_data, target

In [11]:
# test filenames according to opportunity challenge
# one issue remains - subject wise training needs to be checked as well

combined_Xtrain = []
combined_train_target = []
combined_Xtest = []
combined_test_target = []

# `os.listdir` returns entries in arbitrary order. Sorting keeps the sample
# order reproducible, which matters here because training runs with
# `shuffle=False` and `validation_split`: Keras takes the validation samples
# from the end of the arrays.
for filename in sorted(os.listdir(data_path)):
    windows, window_targets = apply_sliding_window_combine(
        os.path.join(data_path, filename)
    )
    if filename in test_filenames:
        combined_Xtest.append(windows)
        combined_test_target.append(window_targets)
    else:
        combined_Xtrain.append(windows)
        combined_train_target.append(window_targets)

# Stack the per-file windows along the sample axis.
final_combined_train = np.concatenate(combined_Xtrain)
final_combined_test = np.concatenate(combined_Xtest)
final_combined_train_target = np.concatenate(combined_train_target)
final_combined_test_target = np.concatenate(combined_test_target)

# `np.concatenate` already returns arrays, so no extra copy is needed.
X_train = final_combined_train
X_test = final_combined_test

In [12]:
def majority_voting_labels(y_train):
    """
    Collapse each window of per-time-step labels into its most frequent label.

    Labels greater than 2 are first shifted down by one. The vote is taken
    along axis 1; on a tie the smallest label wins, because `argmax` returns
    the first maximum of the bin counts.
    """

    def most_frequent(window_labels):
        return np.bincount(window_labels).argmax()

    shifted_labels = np.where(y_train > 2, y_train - 1, y_train)
    return np.apply_along_axis(most_frequent, 1, shifted_labels)

In [13]:
y_train_majority = majority_voting_labels(final_combined_train_target)
y_test_majority = majority_voting_labels(final_combined_test_target)

# Fix the one-hot width explicitly. Without `num_classes`, `to_categorical`
# infers it from the largest label present, so a split that lacks the highest
# class would get narrower targets than the model's output layer.
y_train_majority_encoded = to_categorical(
    y_train_majority, num_classes=config.get("num_classes")
)
y_test_majority_encoded = to_categorical(
    y_test_majority, num_classes=config.get("num_classes")
)

# Sanity checks on the prepared arrays before any model is built.
assert y_train_majority_encoded.shape[1] == config.get("num_classes"), "Label Encoding is incorrect"
assert y_test_majority_encoded.shape[1] == y_train_majority_encoded.shape[1], "Train and test label widths differ"
assert not np.any(np.isnan(y_train_majority_encoded)), "y_train contains NaN values"
assert X_train.shape[1] == config.get("window_size"), "Incorrect sliding window transform"
assert X_train.shape[2] == 77, "Incorrect number of features in training set"
assert not np.any(np.isnan(X_train)), "X_train still has missing values"

In [None]:
# Count how many windows fall into each majority-voted class, per split.
y_train_classes, y_train_counts = np.unique(y_train_majority, return_counts=True)
y_test_classes, y_test_counts = np.unique(y_test_majority, return_counts=True)

# One figure with two side-by-side panels: training (left) and test (right).
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.bar(y_train_classes, y_train_counts, color="blue", alpha=0.7)
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("Class Distribution in Training Data")

# NOTE(review): `plt` is logged here while only the left panel has been drawn;
# presumably wandb captures the current figure at this moment - confirm that
# the logged chart is the intended one.
wandb.log({'train data label distribution':plt})

plt.subplot(1, 2, 2)
plt.bar(y_test_classes, y_test_counts, color="green", alpha=0.7)
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("Class Distribution in Test Data")

# NOTE(review): the same figure is logged again, now with both panels drawn.
wandb.log({'test data label distribution':plt})

plt.tight_layout()
plt.show()

In [None]:
# Report the shapes of the windowed feature arrays for both splits.
for split_label, split_array in (("X_train", X_train), ("X_test", X_test)):
    print(f"{split_label} shape: {split_array.shape}")

In [39]:
def build_model(hp=None):
    """
    Build and compile the CNN-LSTM classifier.

    Architecture: five blocks of Conv1D(64 filters, kernel 5, same padding) +
    BatchNormalization + ReLU, followed by two LSTM(128) layers that return
    full sequences (each followed by dropout), global average pooling over the
    time axis and a softmax output layer with `config["num_classes"]` units.

    The output layer emits probabilities rather than logits, because the
    Precision, Recall and AUC metrics threshold their inputs as probabilities.

    Params:
    - hp: Optional KerasTuner `HyperParameters` object. KerasTuner calls the
      hypermodel with this argument. When given, the learning rate and the
      dropout rate are registered as tunable, with defaults equal to the fixed
      values used otherwise. When omitted, the learning rate comes from
      `config` and the dropout rate is 0.5.

    Returns:
    - A compiled keras `Sequential` model.
    """
    input_shape = (config.get("window_size"), 77)
    base_learning_rate = config.get("learning_rate")

    if hp is None:
        learning_rate = base_learning_rate
        dropout_rate = 0.5
    else:
        # Search one decade either side of the configured learning rate.
        learning_rate = hp.Float(
            "learning_rate",
            min_value=base_learning_rate / 10,
            max_value=base_learning_rate * 10,
            sampling="log",
            default=base_learning_rate,
        )
        dropout_rate = hp.Choice("dropout_rate", [0.3, 0.4, 0.5], default=0.5)

    model = Sequential()

    # Convolutional feature extractor: five identical blocks; only the first
    # one declares the input shape.
    for block_index in range(5):
        first_layer_kwargs = {"input_shape": input_shape} if block_index == 0 else {}
        model.add(Conv1D(filters=64, kernel_size=5, padding="same", **first_layer_kwargs))
        model.add(BatchNormalization())
        model.add(ReLU())

    # Recurrent part: both LSTMs keep the time axis so it can be averaged below.
    for _ in range(2):
        model.add(LSTM(128, return_sequences=True))
        model.add(Dropout(dropout_rate))
    model.add(GlobalAveragePooling1D())

    model.add(Dense(config.get("num_classes"), activation="softmax"))

    # The former `decay` argument is left out: current Keras optimizers no
    # longer support it (it is ignored or rejected, depending on the version).
    optimizer = Adam(
        learning_rate=learning_rate,
        clipnorm=1.0,
        amsgrad=True,
    )
    loss_fn = CategoricalCrossentropy()

    model.compile(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=["accuracy", Precision(), Recall(), AUC(), F1Score(average="weighted")],
    )
    return model

In [None]:
# Build the compiled network and print its layer-by-layer summary.
model = build_model()
model.summary()

In [None]:
# Train with the configured schedule. Shuffling is disabled, and the validation
# share is given as a fraction of the training arrays via `validation_split`.
history = model.fit(
    X_train,
    y_train_majority_encoded,
    epochs=config.get('epochs'),
    batch_size=config.get('batch_size'),
    validation_split=config.get('validation_split'),
    callbacks=wandb_callbacks,
    shuffle=False,
)

In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluate `model` on the test split, then print and log the results.

    Prints every compiled metric, the classification report and the confusion
    matrix, logs the report and the matrix to W&B, and shows and logs a heatmap
    of the confusion matrix.

    Params:
    - model: Compiled keras model to evaluate.
    - X_test: Test features accepted by `model.evaluate` / `model.predict`.
    - y_test: One-hot encoded test labels, one row per sample.

    Returns:
    - None.
    """
    # Ask Keras for a name -> value mapping instead of zipping with
    # `model.metrics_names`, which on newer Keras versions does not list the
    # compiled metrics one by one and so mislabels or drops values.
    results = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    for name, value in results.items():
        print(f'{name}: {value}')

    # Class decisions: index of the largest score per sample.
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)
    y_true = np.argmax(y_test, axis=1)

    class_report = classification_report(y_true, y_pred)
    print('Classification Report:')
    print(class_report)

    cm = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:')
    print(cm)

    wandb.log({'classification_report':class_report, 'confusion_matrix':cm})

    fig = plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')

    # Log the figure by handle so the logged chart does not depend on which
    # figure pyplot currently treats as active.
    wandb.log({'confusion matrix':fig})
    plt.show()

In [None]:
# Per-epoch metric values recorded by `model.fit`.
history_dict = history.history

# One figure with two panels: loss curves (left) and accuracy curves (right).
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history_dict['loss'], label='Training Loss')
plt.plot(history_dict['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
# NOTE(review): `plt` is logged while only the loss panel has been drawn;
# presumably wandb captures the current figure at this moment - confirm.
wandb.log({'loss_curves':plt})

plt.subplot(1, 2, 2)
plt.plot(history_dict['accuracy'], label='Training Accuracy')
plt.plot(history_dict['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
# NOTE(review): the same figure is logged again, now holding both panels.
wandb.log({'accuracy_curves':plt})

plt.tight_layout()
plt.show()

In [None]:
# Score the test windows with the model as it stands after the last epoch.
y_pred_prob = model.predict(X_test)

# Reduce model outputs and one-hot labels to class indices.
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test_majority_encoded, axis=1)

cm = confusion_matrix(y_true, y_pred)

# Draw the confusion matrix as an annotated heatmap and log the figure to W&B.
# NOTE(review): this repeats the plotting done inside `evaluate_model`.
fig = plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
wandb.log({'confusion_matrix':fig})
plt.show()

In [None]:
# Test-set metrics of the model as it stands after the final training epoch,
# returned as a name -> value dict and logged to W&B.
last_model_metrics = model.evaluate(X_test, y_test_majority_encoded, return_dict=True)
wandb.log({'last_model_metrics':last_model_metrics})

In [None]:
# Reload the checkpoint written by the `ModelCheckpoint` callback, which keeps
# only the best model according to `val_loss`.
best_model = keras.models.load_model('best_model.keras')

# Test-set metrics of that checkpoint, logged next to the last-epoch metrics.
best_model_test_metrics = best_model.evaluate(X_test, y_test_majority_encoded, return_dict=True)
wandb.log({'best_model_test_metrics':best_model_test_metrics})

In [None]:
def train_model(model, X_val, y_val, epochs=10, batch_size=64):
    """Placeholder for a training routine; not implemented, always returns None."""
    return None

### The tuner is not yet initialized for this notebook; the model-building function needs to be changed before it can be used

In [None]:
# Random search over the hypermodel's search space, ranked by validation
# accuracy. KerasTuner calls `hypermodel` with a `HyperParameters` argument,
# so `build_model` has to accept one.
# The output location is read from `config` instead of repeating the literals.
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_accuracy",
    max_trials=3,
    executions_per_trial=1,
    overwrite=True,
    directory=config.get("results_directory"),
    project_name=config.get("project_name"),
)

In [None]:
# Run the hyperparameter search on the training arrays.
# NOTE(review): the validation share here is a literal 0.2, while the main
# training run reads `validation_split` from `config` - confirm this is intended.
# NOTE(review): the shared callbacks include a checkpoint with a fixed file
# name, so trials presumably overwrite each other's checkpoint - confirm.
tuner.search(
    X_train,
    y_train_majority_encoded,
    epochs=config.get("epochs"),
    validation_split=0.2,
    callbacks=wandb_callbacks,
)

In [None]:
# Print the hyperparameters registered with the tuner.
tuner.search_space_summary()

In [None]:
# Fetch the two best models found by the search and inspect the top one.
# NOTE: this rebinds `best_model`, which an earlier cell loaded from
# 'best_model.keras'.
models = tuner.get_best_models(num_models=2)
best_model = models[0]
best_model.summary()

In [None]:
# Print a summary of the search results.
tuner.results_summary()

In [None]:
def convert_history_to_dict(history) -> Dict:
    """Placeholder for converting a training history; not implemented, returns None."""
    return None

In [None]:
def plot_loss_curves(history_dict):
    """Placeholder for plotting loss curves; not implemented, returns None."""
    return None