# Tutorial Notebook



In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Directories

In [None]:
import os
import sys

# Working directory of the tutorial; a later cell creates it if needed and
# switches into it with %cd.
base_directory = "."
# Root of the repository checkout; used throughout to build paths such as
# <repo>/data_preparation/all_data and <repo>/simclr/saved_logs.
repo = "."

In [None]:
%%capture
!pip install numpy scipy scikit-learn seaborn matplotlib pandas lightly

In [None]:
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import random
import glob
from datetime import date
import itertools
import pickle

# progress bar
from tqdm.auto import tqdm

# Various metrics
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Pin the global state of Python's `random` module and of NumPy's legacy
# generator so repeated runs draw the same pseudo-random numbers.
random.seed(42)
np.random.seed(42)

In [None]:
# Create the working directory if it does not exist yet, then make it the
# notebook's current directory ($base_directory expands the Python variable
# inside the %cd magic).
os.makedirs(base_directory, exist_ok=True)
%cd $base_directory

In [None]:
def plot_accelerometer_timeseries(timeseries, title="", figsize=(15,6), xlim=None, ylim=None):
    """Plot an accelerometer trace in a new figure and show it.

    Args:
        timeseries: Array-like exposing ``shape`` and supporting ``[:, i]``
            indexing. A 1-D input is drawn as a single unlabelled line;
            otherwise the leading columns (at most three) are drawn as the
            "x", "y" and "z" channels with a legend.
        title: Figure title.
        figsize: Figure size passed to ``plt.figure``.
        xlim: Optional ``(left, right)`` x-axis limits. When ``None`` the
            axis spans ``0 .. len(timeseries)``.
        ylim: Optional ``(bottom, top)`` y-axis limits. When ``None`` the
            limits chosen by matplotlib are kept.
    """
    axis_names = ("x", "y", "z")

    plt.figure(figsize=figsize)
    if len(timeseries.shape) == 1:
        # Single channel: nothing to label, so no legend is requested
        # (calling plt.legend() without labelled artists only emits a warning).
        plt.plot(timeseries)
    else:
        # Plot only the channels that exist instead of assuming exactly three.
        n_channels = min(len(axis_names), timeseries.shape[1])
        for channel in range(n_channels):
            plt.plot(timeseries[:, channel], label=axis_names[channel])
        if n_channels:
            plt.legend()

    if xlim is None:
        plt.xlim(0, len(timeseries))
    else:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)

    plt.title(title, fontsize=16)
    plt.show()

# Activity name for every integer class id (ids follow the listing order).
label_mapping = dict(enumerate((
    "Downstairs",
    "Upstairs",
    "Walking",
    "Jogging",
    "Standing",
    "Sitting",
)))

# Tick positions and the matching class names used on confusion-matrix axes.
num_ticks = len(label_mapping)
cm_ticks = np.arange(num_ticks)
cm_ticklabels = [label_mapping[tick] for tick in cm_ticks]


# 0 Data Preparation

## 0.1 **Downloading the Motionsense data**

The MotionSense data is already included in the GitHub repository, so there is no need to download it.

## 0.2 Data pre-processing

In [None]:
import data_preparation.prepare_motionsense as prepare

# Load the arguments of the data-preparation module and print them for
# inspection.
args = prepare.load_args()
print(args)

# Run the preparation step. The result is indexed below as
# processed[split]['data'] and processed[split]['labels'] for the
# 'train', 'val' and 'test' splits.
processed = prepare.prepare_data(args)

In [None]:
# One bar chart per split showing how many samples carry each class label.
plt.style.use('ggplot')
fig, axs = plt.subplots(1, 3, figsize=(15, 6))
for ax, (split, title) in zip(axs, [('train', "Train"), ('val', "Val"), ('test', "Test")]):
    labels, counts = np.unique(processed[split]['labels'], return_counts=True)
    ax.bar(labels, counts)
    ax.set_title(title + " set")
    ax.set_xticks(range(6), range(6))
# Shared axis captions: y on the left-most panel, x on the middle one.
axs[0].set_ylabel("Count")
axs[1].set_xlabel("Classes")
plt.show()

In [None]:
# Plot the first 100 samples of the training stream. The activity in the title
# is read from the first plotted sample so that it belongs to the displayed
# window (index 100 would be the sample just past the slice).
# NOTE(review): assumes 'labels' holds one entry per row of 'data' — confirm.
plot_accelerometer_timeseries(
    processed['train']['data'][:100],
    f"Training accelerometer trace - Activity {processed['train']['labels'][0]}",
    figsize=(10, 4),
)


# 1 Activity Recognition Chain

## 1.0 **Segmentation: obtaining windows of sensor data through sliding window**

### Goal: take stream of sensor data and return windows + labels


In [None]:
# Collect every prepared MotionSense pickle, ordered from oldest to newest by
# modification time, and load the newest one.
processed_data_files = sorted(
    glob.glob(os.path.join(repo, "data_preparation", "all_data", "*", "motionsense.pkl")),
    key=os.path.getmtime,
)
processed_data_file = processed_data_files[-1]
processed = pd.read_pickle(processed_data_file)
# Display which file was picked.
processed_data_file

In [None]:
import ecdf.extract_ecdf_train_classifier as ecdf

# Segment the prepared streams into windows. The result is indexed below as
# segmented_data[split]['data'][i] and segmented_data[split]['labels'][i],
# i.e. one label per window.
segmented_data = ecdf.generate_windowed_data(processed=processed)

### Before Segmentation

In [None]:
# Plot the first 150 samples of the unsegmented training stream. The activity
# in the title is read from the first plotted sample so that it belongs to the
# displayed window (index 150 would be the sample just past the slice).
# NOTE(review): assumes 'labels' holds one entry per row of 'data' — confirm.
plot_accelerometer_timeseries(
    processed['train']['data'][:150],
    f"Before segmentation - Activity {processed['train']['labels'][0]}",
    ylim=(-3, 2),
    figsize=(10, 3),
)

### After Segmentation

In [None]:
# Show the first three training windows. The x-limits move 50 samples to the
# left per window, so each window is drawn shifted right within the same
# 150-sample-wide view.
for segment_idx in range(3):
    shift = segment_idx * 50
    plot_accelerometer_timeseries(
        segmented_data['train']['data'][segment_idx],
        f"After segmentation - Segment {segment_idx} - Activity {segmented_data['train']['labels'][segment_idx]}",
        xlim=(-shift, 150 - shift),
        ylim=(-3, 2),
        figsize=(10, 3),
    )

## 1.1 **Extracting features: ECDF**

In [None]:
# Compute the ECDF features from the segmented windows. The result is indexed
# per split further down (e.g. ecdf_features["test"]).
ecdf_features = ecdf.compute_ecdf_features(segmented_data=segmented_data)

## 1.2 **Training a Random Forest classifier with ECDF features for Activity Recognition**

In [None]:
# Train the random-forest classifier on the ECDF features. The call returns
# the fitted classifier (used through .predict below) and a log object that is
# stored under the "f1" key of the evaluation pickle; its "test" entry is read
# back in the result comparison.
trained_classifier, log_ecdf = ecdf.train_rf_classifier(ecdf=ecdf_features, segmented_data=segmented_data)

In [None]:
# Confusion matrix of the ECDF classifier on the test windows, followed by a
# row-wise normalisation (each row is divided by its own sum).
confusion_matrix_ecdf = confusion_matrix(
    segmented_data["test"]["labels"],
    trained_classifier.predict(ecdf_features["test"]),
)
confusion_matrix_ecdf_norm = confusion_matrix_ecdf / confusion_matrix_ecdf.sum(axis=1, keepdims=True)

# Persist the normalised matrix together with the training log so the result
# comparison at the end of the notebook can reload them.
ecdf_log_dir = os.path.join(repo, "ecdf", "saved_logs", "current")
os.makedirs(ecdf_log_dir, exist_ok=True)
with open(os.path.join(ecdf_log_dir, "ecdf_eval_log.pkl"), 'wb') as f:
    pickle.dump({"cm": confusion_matrix_ecdf_norm, "f1": log_ecdf}, f)

In [None]:
# Heatmap of the row-normalised ECDF confusion matrix. The matrix comes from
# scikit-learn's confusion_matrix, which places the true classes on the rows
# and the predicted classes on the columns — hence the axis captions below.
plt.figure(figsize=(8, 8))
ax = sns.heatmap(
    confusion_matrix_ecdf_norm,
    annot=True,
    fmt='.1%',
    cmap='Blues',
    annot_kws={"fontsize": 8},
    yticklabels=cm_ticklabels,
    xticklabels=cm_ticklabels,
)
ax.set_xlabel("Predicted activity")
ax.set_ylabel("True activity")
ax.set_title("ECDF + Random Forest - test confusion matrix")
plt.xticks(rotation=45)
plt.show()

# 2 Convolutional Classifier

In [None]:
from conv_classifier import evaluate_with_classifier
from conv_classifier import model
from conv_classifier import arguments_dict
from conv_classifier import utils


In [None]:
# Load the arguments of the convolutional classifier (this replaces the `args`
# of the data-preparation step) and print them. The object is used like a
# dictionary below, e.g. args['root_dir'].
args = arguments_dict.load_args()
print(args)

In [None]:
# Prepared MotionSense pickles ordered from oldest to newest by modification
# time; the newest one is used for this section.
processed_data_files = sorted(
    glob.glob(os.path.join(repo, "data_preparation", "all_data", "*", "motionsense.pkl")),
    key=os.path.getmtime,
)
processed_data_file = processed_data_files[-1]

In [None]:
# Point the classifier at the folder that contains the selected data file.
args['root_dir'] = os.path.dirname(processed_data_file)

In [None]:
# Instantiate the classifier from the current arguments and print it
# (presumably its layer structure — depends on the class's repr).
print(model.Classifier(args=args))

In [None]:
# Seed through the project helper using the configured seed, then set the
# number of training epochs for this run to 50.
utils.set_all_seeds(args['random_seed'])
args['num_epochs'] = 50

In [None]:
# Run the conv_classifier entry point with the arguments assembled above.
# NOTE(review): presumably trains/evaluates the classifier and writes the logs
# under conv_classifier/saved_logs that the next cell loads — confirm.
evaluate_with_classifier(args=args)

In [None]:
# Load the most recently modified log of the convolutional classifier.
log_files = sorted(
    glob.glob(os.path.join(repo, "conv_classifier", "saved_logs", "*", "classifier*_log.pkl")),
    key=os.path.getmtime,
)
if not log_files:
    raise FileNotFoundError(
        "No classifier log found under conv_classifier/saved_logs; run the training cell first."
    )
log_file = log_files[-1]
logs_conv_classifier = pd.read_pickle(log_file)

plt.style.use('ggplot')
fig, (loss_ax, f1_ax) = plt.subplots(1, 2, dpi=200, figsize=(8, 4))

# Left panel: loss curves (x defaults to 0..len-1, i.e. the entry index).
for split, split_name in (('train', 'Train'), ('val', 'Validation')):
    loss_ax.plot(logs_conv_classifier.loss[split], label=f'{split_name} loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: F1-score curves for all three splits.
for split, split_name in (('train', 'Train'), ('val', 'Validation'), ('test', 'Test')):
    f1_ax.plot(logs_conv_classifier.f1_score[split], label=f'{split_name} F1-score')

# Mark the entry stored in best_meter: 'x' for its validation F1-score and
# '+' for its test F1-score.
best = logs_conv_classifier.best_meter
f1_ax.plot(best.epoch, best.f1_score['val'], 'rx')
f1_ax.plot(best.epoch, best.f1_score['test'], 'r+')

f1_ax.set_xlabel('Epoch')
f1_ax.set_ylabel('F1-score')
f1_ax.legend()

# Keep the axis captions of the two panels from overlapping.
fig.tight_layout()
plt.show()

# 3 SimCLR for learning representations

## 3.0 Import & Data loading

In [None]:
from simclr import arguments_dict
from simclr import utils
from simclr import pretrainer
from simclr import model

In [None]:
# Load the SimCLR arguments (`arguments_dict` now refers to the simclr module
# imported in the previous cell, and `args` replaces the classifier's
# arguments) and print them.
args = arguments_dict.load_args()
print(args)

Updating the location of the processed data.

In [None]:
# Prepared MotionSense pickles ordered from oldest to newest by modification
# time; the newest one is used for pre-training.
processed_data_files = sorted(
    glob.glob(os.path.join(repo, "data_preparation", "all_data", "*", "motionsense.pkl")),
    key=os.path.getmtime,
)
processed_data_file = processed_data_files[-1]

In [None]:
# Point SimCLR at the folder that contains the selected data file.
args['root_dir'] = os.path.dirname(processed_data_file)

## 3.1 Pre-training using SimCLR

Setting the seeds for pre-training

In [None]:
# Seed through the simclr helper using the configured seed.
utils.set_all_seeds(args['random_seed'])

Next, let us print the SimCLR model architecture

In [None]:
# Build the SimCLR model from the current arguments and print it
# (presumably its layer structure — depends on the class's repr).
simclr_model = model.SimCLR(args=args)
print(simclr_model)

### Starting the pre-training with SimCLR

In [None]:
# Start pre-training with the arguments assembled above.
# NOTE(review): presumably writes the logs under simclr/saved_logs and the
# weights under simclr/saved_weights that later cells look up — confirm.
pretrainer.learn_model(args=args)

### Plotting the loss values to see the trends

In [None]:
# Newest SimCLR log that is not a classifier-evaluation log
# (files ending in "_eval_log.pkl" are skipped).
log_files = sorted(
    (
        path
        for path in glob.glob(os.path.join(repo, "simclr", "saved_logs", "*", "simclr_*.pkl"))
        if not path.endswith("_eval_log.pkl")
    ),
    key=os.path.getmtime,
)
log_file = log_files[-1]
logs = pd.read_pickle(log_file)

plt.figure(dpi=200)
plt.style.use('ggplot')

# Training and validation loss against their entry index.
for split, curve_label in (('train', 'Train loss'), ('val', 'Validation loss')):
    curve = logs.loss[split]
    plt.plot(np.arange(len(curve)), curve, label=curve_label)

plt.legend()

## 3.2 Classification with the learned features.

To do so, we first set the location of the pre-trained model so that its learned weights can be loaded.

In [None]:
from simclr import evaluate_with_classifier

In [None]:
# Instantiate and print the classifier. `model` is simclr.model here: the
# import in section 3.0 replaced the conv_classifier module of the same name.
print(model.Classifier(args=args))

In [None]:
# Entries under simclr/saved_weights ordered from oldest to newest by
# modification time; the newest one is handed to the evaluation via `args`.
saved_model_folders = sorted(
    glob.glob(os.path.join(repo, "simclr", "saved_weights", "*")),
    key=os.path.getmtime,
)
saved_model_folder = saved_model_folders[-1]

args['saved_model_folder'] = saved_model_folder

In [None]:
# Run the simclr evaluation entry point (imported from `simclr` two cells
# above, replacing the conv_classifier function of the same name).
# NOTE(review): presumably trains a classifier on the pre-trained weights in
# args['saved_model_folder'] and writes a *_eval_log.pkl file — confirm.
evaluate_with_classifier(args=args)

### We can now plot the loss and f1-scores to see how performance improved.

In [None]:
# Load the most recently modified SimCLR classifier-evaluation log.
log_files = sorted(
    glob.glob(os.path.join(repo, "simclr", "saved_logs", "*", "simclr*_eval_log.pkl")),
    key=os.path.getmtime,
)
if not log_files:
    raise FileNotFoundError(
        "No SimCLR evaluation log found under simclr/saved_logs; run the evaluation cell first."
    )
log_file = log_files[-1]
logs_simclr = pd.read_pickle(log_file)

plt.style.use('ggplot')
fig, (loss_ax, f1_ax) = plt.subplots(1, 2, dpi=200, figsize=(8, 4))

# Left panel: loss curves (x defaults to 0..len-1, i.e. the entry index).
for split, split_name in (('train', 'Train'), ('val', 'Validation')):
    loss_ax.plot(logs_simclr.loss[split], label=f'{split_name} loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: F1-score curves for all three splits.
for split, split_name in (('train', 'Train'), ('val', 'Validation'), ('test', 'Test')):
    f1_ax.plot(logs_simclr.f1_score[split], label=f'{split_name} F1-score')

# Mark the entry stored in best_meter: 'x' for its validation F1-score and
# '+' for its test F1-score.
best = logs_simclr.best_meter
f1_ax.plot(best.epoch, best.f1_score['val'], 'rx')
f1_ax.plot(best.epoch, best.f1_score['test'], 'r+')

f1_ax.set_xlabel('Epoch')
f1_ax.set_ylabel('F1-score')
f1_ax.legend()

# Keep the axis captions of the two panels from overlapping.
fig.tight_layout()
plt.show()

We can also visualize the confusion matrix to see where the classes get confused.

In [None]:
# Annotated heatmap of the test confusion matrix stored in the best_meter of
# the SimCLR evaluation log, with activity names on both axes.
plt.figure(figsize=(8, 8))
sns.heatmap(
    logs_simclr.best_meter.confusion_matrix['test'],
    annot=True,
    fmt='.1%',
    cmap='Blues',
    annot_kws={"fontsize": 8},
    xticklabels=cm_ticklabels,
    yticklabels=cm_ticklabels,
)
plt.xticks(rotation=45)
plt.show()

# 4 Result comparison

In [None]:
# ECDF baseline: reload the evaluation log written in section 1.2.
# NOTE: pickle.load can execute arbitrary code from the file; it is used here
# only on a file this notebook wrote itself — never point it at untrusted data.
with open(os.path.join(repo, "ecdf", "saved_logs", "current", "ecdf_eval_log.pkl"), 'rb') as f:
    ecdf_obj = pickle.load(f)
confusion_matrix_ecdf_norm = ecdf_obj["cm"]
f1_ecdf_test = ecdf_obj["f1"]["test"]

# Convolutional classifier: most recently modified evaluation log.
log_files = sorted(
    glob.glob(os.path.join(repo, "conv_classifier", "saved_logs", "*", "classifier*_eval_log.pkl")),
    key=os.path.getmtime,
)
log_file = log_files[-1]
logs_conv_classifier = pd.read_pickle(log_file)

# SimCLR: most recently modified classifier-evaluation log.
log_files = sorted(
    glob.glob(os.path.join(repo, "simclr", "saved_logs", "*", "simclr*_eval_log.pkl")),
    key=os.path.getmtime,
)
log_file = log_files[-1]
logs_simclr = pd.read_pickle(log_file)

# Test confusion matrix per method. The keys become the panel titles of the
# comparison figure, so they use the same method names as the F1 bar chart.
confusion_matrices = {
    "ECDF": confusion_matrix_ecdf_norm,
    "Conv Classifier": logs_conv_classifier.best_meter.confusion_matrix['test'],
    "SimCLR": logs_simclr.best_meter.confusion_matrix['test']
}

In [None]:
# Test F1-score of each method, in display order.
f1_by_method = {
    "ECDF": f1_ecdf_test,
    "Conv Classifier": logs_conv_classifier.best_meter.f1_score['test'],
    "SimCLR": logs_simclr.best_meter.f1_score['test'],
}

plt.bar(
    list(f1_by_method.keys()),
    list(f1_by_method.values()),
    color=['tab:blue', 'tab:red', 'tab:green'],
)
# The y-axis starts at 0.5, so bar heights are not proportional to the scores.
plt.ylim([0.5, 1.0])
plt.ylabel("F1 Score")
plt.xlabel("Training Method")
plt.show()

In [None]:
# One heatmap per method on a shared 0..1 colour scale (no colour bars), so
# the three panels can be compared directly.
fig, axs = plt.subplots(ncols=3, figsize=(20, 6))
for ax, (method, matrix) in zip(axs, confusion_matrices.items()):
    sns.heatmap(
        matrix,
        ax=ax,
        annot=True,
        fmt='.1%',
        cmap='Blues',
        annot_kws={"fontsize": 8},
        vmin=0,
        vmax=1,
        cbar=False,
        xticklabels=cm_ticklabels,
        yticklabels=cm_ticklabels,
    )
    # Re-apply the current tick positions before restyling their labels.
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title(method)
fig.tight_layout()
