# Table of Contents

* [0. Data loading](#section_0)
* [1. EDA](#section_1)
* [2. Setup validation and baselines](#section_2)
* [3. Feature engineering](#section_3)
* [4. Modeling](#section_4)
    * [4.1 Logistic regression](#section_4_1)
    * [4.2 Random Forest](#section_4_2)
* [5. Other experiments](#section_5)
    * [5.1 Using raw data + smaller window size](#section_5_1)
    * [5.2 Median filter visualization](#section_5_2)

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import seaborn as sns
from matplotlib import pyplot as plt
sns.set(
    style="whitegrid",
    font_scale=2
);

# 0. Data loading <a class="anchor" id="section_0"></a>

In [None]:
DATA_FOLDER = Path('../input/data-for-activity-recognition/data/data')
# Number of accelerometer readings per axis in one recorded frame.
FRAME_LENGTH = 30
# Every sub-folder of DATA_FOLDER holds the recordings of one activity class.
# Sort the names so the class order (and therefore the row order of the loaded
# data) is deterministic, and skip stray non-directory entries that would
# otherwise be treated as classes and break the loading loop.
classes = sorted(f.name for f in DATA_FOLDER.iterdir() if f.is_dir())
classes

In [None]:
# Column names of the flattened frame: all X readings, then all Y, then all Z.
x_acc_cols = [f'acc_x_{i}' for i in range(FRAME_LENGTH)]
y_acc_cols = [f'acc_y_{i}' for i in range(FRAME_LENGTH)]
z_acc_cols = [f'acc_z_{i}' for i in range(FRAME_LENGTH)]

frames = []
labels = []

# Each CSV file is one frame; the name of its parent folder is its label.
for class_name in classes:
    class_folder = DATA_FOLDER / class_name
    for file in class_folder.iterdir():
        df = pd.read_csv(file)
        # Concatenate the three axis series into a single flat row.
        # NOTE(review): assumes every file holds exactly FRAME_LENGTH readings
        # per axis, otherwise the column count below will not match - confirm.
        flat_frame = np.hstack([df['accelerometer_X'], df['accelerometer_Y'], df['accelerometer_Z']]).astype(float)
        frames.append(flat_frame)
        labels.append(class_name)

# One row per frame, with the activity label appended as the last column.
raw_data = pd.DataFrame(frames, columns = x_acc_cols+y_acc_cols+z_acc_cols)
raw_data = pd.concat([raw_data, pd.Series(labels, name='label')], axis=1)
raw_data.head(3)

In [None]:
raw_data.shape

# 1. EDA <a class="anchor" id="section_1"></a>

We can see that there is class imbalance: only 165 samples of 'stairs' class

In [None]:
raw_data.label.value_counts()

## Distribution of readings across each axis

In [None]:
# One panel per axis, pooling the readings of that axis over all frames.
fig, axes = plt.subplots(1, 3, figsize=(16,6))
axes = axes.flatten()

for label, cols, ax in zip(['X', 'Y', 'Z'], [x_acc_cols, y_acc_cols, z_acc_cols], axes):
    # `sns.distplot` is deprecated; a density-normalised histogram with a KDE
    # overlay is its documented replacement.
    sns.histplot(x=raw_data[cols].values.flatten(), kde=True, stat='density',
                 kde_kws=dict(cut=3), ax=ax)
    ax.set_title(f'Distr. of {label} axis readings');
    ax.set_xlabel('m/$s^2$');
    ax.set_ylabel('')

We can see that all readings are centered around zero. The X axis is approximately normally distributed, while the Y and Z axes look like bimodal distributions.

## t-SNE visualization

In [None]:
from sklearn.manifold import TSNE

In [None]:
# 2-D t-SNE embedding of the flattened raw frames (all 3 * FRAME_LENGTH readings).
# NOTE(review): no random_state is set, so the embedding differs between runs;
# newer scikit-learn releases also rename `n_iter` to `max_iter` - confirm
# against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(raw_data[x_acc_cols+y_acc_cols+z_acc_cols])

In [None]:
# Pair the two embedding coordinates with the activity label of each frame.
df_subset = pd.DataFrame({
    'tsne-2d-0': tsne_results[:,0],
    'tsne-2d-1': tsne_results[:,1],
    'label': raw_data['label'],
})

# Scatter of the embedding, coloured by activity; the palette is sized for
# four classes and a low alpha keeps overlapping points readable.
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-0", y="tsne-2d-1",
    hue="label",
    palette=sns.color_palette("hls", 4),
    data=df_subset,
    legend="full",
    alpha=0.3
);

The t-SNE embedding of the raw accelerometer data (frames of length 30) is visualized on a 2D plane. We can see 3 clusters here:
- running
- idle
- stairs + walking

## Sample activities

Visualize one sample for each activity

In [None]:
# One panel per activity, showing the first frame of that class.
fig, axes = plt.subplots(2, 2, figsize=(20, 20))
axes = axes.flatten()

# Time axis derived from the frame length instead of a duplicated literal.
steps = np.arange(FRAME_LENGTH)

for ax, label in zip(axes, classes):
    sample = raw_data[raw_data['label'] == label].iloc[0]
    # The row mixes floats with the string label, hence the explicit cast.
    sns.lineplot(y=sample[x_acc_cols].astype(float), x=steps, label='X', ax=ax)
    sns.lineplot(y=sample[y_acc_cols].astype(float), x=steps, label='Y', ax=ax)
    sns.lineplot(y=sample[z_acc_cols].astype(float), x=steps, label='Z', ax=ax).set_title(label)
    # Shared y-range so the panels are visually comparable.
    ax.set_ylim(-40, 40)
    ax.set_xlabel('time step')
    ax.set_ylabel('m/$s^2$')

## Mean for each axis

We can see that the per-axis mean alone is enough to separate all classes from each other, except for 'walking' vs 'stairs'.

In [None]:
# Per-frame mean of each axis (row-wise mean over the FRAME_LENGTH readings).
mean_x = raw_data[x_acc_cols].mean(axis=1)
mean_y = raw_data[y_acc_cols].mean(axis=1)
mean_z = raw_data[z_acc_cols].mean(axis=1)

# Minimal feature set used for the pairplot and the baseline models.
simple_features = pd.DataFrame({
    'mean_x': mean_x,
    'mean_y': mean_y,
    'mean_z': mean_z,
    'label': raw_data['label']
})

In [None]:
print('Pairplot of mean accelerometer value during frame of length 30');
sns.pairplot(data=simple_features, hue='label', height=5);


# 2. Setup validation and baselines <a class="anchor" id="section_2"></a>

## StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
# Sanity check of the stratified split: every validation fold should contain a
# similar number of samples of each class. The content of X is irrelevant here;
# the splitter only stratifies on y.
X = raw_data['acc_x_0']
y = raw_data['label']

for i, (_, valid_ix) in enumerate(kfold.split(X, y)):
    # kfold yields positional indices; .loc works because y has the default
    # 0..n-1 index.
    y_valid = y.loc[valid_ix]
    value_counts = y_valid.value_counts()
    print(f'split#{i}, we have {value_counts.stairs} "stairs", {value_counts.running} "running" samples')

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# target label encoder
target_le = LabelEncoder()
target_le.fit(classes)

## Baselines

In [None]:
from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import f1_score, confusion_matrix, multilabel_confusion_matrix, accuracy_score
from IPython.display import display, HTML

In [None]:
numerical_metrics = ['accuracy', 'f1_macro', 'f1_min']

def evaluate_performance(y_true, y_pred):
    """Compute the metrics tracked for one set of predictions.

    Parameters:
            y_true: ground-truth labels
            y_pred: predicted labels, aligned with y_true

    Returns:
            metrics (dict): 'accuracy', 'f1_macro', 'f1_min' (worst per-class F1),
            'CM' (confusion matrix normalised over the true labels) and
            'CM_not_normalized' (raw counts)
    """
    per_class_f1 = f1_score(y_true, y_pred, average=None)
    row_normalized_cm = confusion_matrix(y_true, y_pred, normalize='true')
    count_cm = confusion_matrix(y_true, y_pred)

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'f1_min': per_class_f1.min(),
        'CM': row_normalized_cm,
        'CM_not_normalized': count_cm,
    }

def plot_confusion_matrix(df, title=None):
    """Plot the confusion matrix averaged over the rows (folds) of `df`.

    Each cell is coloured by the mean normalised rate and annotated with that
    rate as a percentage plus the mean raw count.

    Parameters:
            df (DataFrame): one row per fold with the columns 'CM' (normalised
            confusion matrix) and 'CM_not_normalized' (raw counts)
            title (str, optional): figure title; no title is set when None
    """
    # C_ij - i is the true class, j is the predicted class
    cm = np.stack(df['CM'].values, axis=0).mean(axis=0)
    cm_nn = np.stack(df['CM_not_normalized'].values, axis=0).mean(axis=0)
    n_rows, n_cols = cm.shape
    # Build the annotations directly as strings so nothing is truncated by a
    # fixed-width string dtype.
    cm_print = np.array([
        [("%.2f" % (cm[i, j] * 100)) + '%\n' + str(cm_nn[i, j]) for j in range(n_cols)]
        for i in range(n_rows)
    ])
    # Tick labels follow the encoder's class order, sized from the matrix
    # instead of a hard-coded class count.
    labels_ordered = target_le.inverse_transform(np.arange(n_rows))
    # Scale fonts for this figure only; calling sns.set() here would reset the
    # notebook-wide theme for every later plot.
    with sns.plotting_context('notebook', font_scale=1.3):
        plt.figure(figsize=(6,6))
        sns.heatmap(cm,
                    annot=cm_print,
                    fmt='',
                    cmap='Blues',
                    cbar=False,
                    xticklabels=labels_ordered,
                    yticklabels=labels_ordered,
                   )
        plt.xlabel('predicted')
        plt.ylabel('actual')
        if title is not None:
            plt.title(title)
    
def print_cv_metrics(cv_metrics, title=None):
    """Display cross-validation results: confusion matrix plot and a metric table.

    Parameters:
            cv_metrics (list of dict): one metrics dict per fold, as returned
            by `evaluate_performance`
            title (str, optional): title of the confusion matrix plot
    """
    df = pd.DataFrame(cv_metrics)
    plot_confusion_matrix(df, title=title)
    plt.show()
    # Aggregate the scalar metrics only: the frame also holds confusion-matrix
    # arrays, which newer pandas refuses to average instead of silently
    # dropping them.
    numeric_df = df[numerical_metrics]
    print_df = pd.concat([numeric_df.mean(), numeric_df.std()], axis=1).round(3)
    print_df.columns = ['mean', 'std']
    display(HTML(print_df.T.to_html()))
    

In [None]:
# Baseline: logistic regression on the three per-axis means, evaluated with
# the shared stratified 5-fold split.
X = simple_features[['mean_x', 'mean_y', 'mean_z']]
y = simple_features['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    # kfold yields positional indices; .loc works because the frames carry the
    # default 0..n-1 index.
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # The scaler is fitted on the training fold only and then applied to the
    # validation fold.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    cls = linear_model.LogisticRegression()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Baseline logistic regression')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline: random forest with default hyperparameters on the three per-axis
# means, evaluated with the shared stratified 5-fold split.
X = simple_features[['mean_x', 'mean_y', 'mean_z']]
y = simple_features['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # Unlike the logistic-regression baseline, the features are not scaled here.

    # NOTE(review): no random_state is passed, so scores vary between runs.
    cls = RandomForestClassifier()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Baseline Random Forest')

# 3. Feature engineering <a class="anchor" id="section_3"></a>

In [None]:
def _rowwise_pearson(a, b):
    """Pearson correlation between matching rows of two (n_samples, n_steps) arrays."""
    a_centered = a - a.mean(axis=1, keepdims=True)
    b_centered = b - b.mean(axis=1, keepdims=True)
    covariance = (a_centered * b_centered).sum(axis=1)
    norm = np.sqrt((a_centered ** 2).sum(axis=1) * (b_centered ** 2).sum(axis=1))
    # A constant row has zero variance; report NaN for it, as np.corrcoef does.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = covariance / norm
    # Guard against round-off pushing values marginally outside [-1, 1].
    return np.clip(corr, -1.0, 1.0)


def extract_features(x_part, y_part, z_part):
    """
    Extract features from accelerometer readings.

    First, the magnitude across the 3 axes is calculated and added as a 4th
    time series 'm'. Then the following features are calculated:
    - mean: simple average for each of x, y, z, m
    - minmax: difference between max value and min value for each of x, y, z, m
    - min: minimum value for each of x, y, z, m
    - rms: root mean square for each of x, y, z, m
    - corr_xy, corr_yz, corr_xz: Pearson correlation coefficients between the
      corresponding pairs of axes, computed per sample

    as proposed in 'A Study on Human Activity Recognition Using Accelerometer Data from Smartphones'
    https://www.sciencedirect.com/science/article/pii/S1877050914008643

    Parameters:
            x_part, y_part, z_part: Arrays of shape n_samples x FRAME_LENGTH
            with accelerometer readings

    Returns:
            features (DataFrame): DataFrame with generated features (19 features in total)
    """
    x_part, y_part, z_part = (np.asarray(part, dtype=float) for part in (x_part, y_part, z_part))
    magnitude_part = np.sqrt(x_part**2 + y_part**2 + z_part**2)
    # Shape: n_samples x 4 series x FRAME_LENGTH
    ts_matrix = np.stack([x_part, y_part, z_part, magnitude_part], axis=1)
    ts_ax_names = ['x', 'y', 'z', 'm']
    assert len(ts_ax_names) == ts_matrix.shape[1]

    def _per_series(values, prefix):
        # One column per time series, named '<prefix>_<series>'.
        return pd.DataFrame(values, columns=[f'{prefix}_{ax}' for ax in ts_ax_names])

    min_m = ts_matrix.min(axis=2)
    mean_features = _per_series(ts_matrix.mean(axis=2), 'mean')
    minmax_features = _per_series(ts_matrix.max(axis=2) - min_m, 'minmax')
    min_features = _per_series(min_m, 'min')
    rms_features = _per_series(np.sqrt(np.square(ts_matrix).mean(axis=2)), 'rms')

    # Correlate each sample's own pair of axes directly; building the full
    # sample-by-sample correlation matrix only to read one diagonal would need
    # memory quadratic in the number of samples.
    corr_features = pd.DataFrame({
        'corr_xy': _rowwise_pearson(x_part, y_part),
        'corr_yz': _rowwise_pearson(y_part, z_part),
        'corr_xz': _rowwise_pearson(x_part, z_part),
    })

    features = pd.concat([mean_features, minmax_features, min_features, rms_features, corr_features], axis=1)
    return features

In [None]:
# Engineered features for every frame, built from the three per-axis blocks of
# the flattened raw data (each of shape n_samples x FRAME_LENGTH).
feature_df = extract_features(
    raw_data[x_acc_cols].values,
    raw_data[y_acc_cols].values,
    raw_data[z_acc_cols].values)

In [None]:
feature_df['label'] = target_le.transform(raw_data['label'])

In [None]:
all_features = [c for c in feature_df.columns if c != 'label']
print('len(all_features): ', len(all_features))
all_features

# 4. Modeling <a class="anchor" id="section_4"></a>

## 4.1 Logistic regression <a class="anchor" id="section_4_1"></a>

In [None]:
# Logistic regression on all engineered features, same CV protocol as the
# baselines; the target is the integer-encoded label.
X = feature_df[all_features]
y = feature_df['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # The scaler is fitted on the training fold only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    # Iteration cap raised above the library default.
    cls = linear_model.LogisticRegression(max_iter=1_000)
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Logistic Regression on all features')

### Adding polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly_transform = PolynomialFeatures(2, include_bias=False)

In [None]:
# Logistic regression on the degree-2 polynomial expansion of all features.
# X and y are plain NumPy arrays here, so folds are selected positionally.
X = poly_transform.fit_transform(feature_df[all_features])
y = feature_df['label'].values
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X[train_ix], X[valid_ix]
    y_train, y_valid = y[train_ix], y[valid_ix]

    # The scaler is fitted on the training fold only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    cls = linear_model.LogisticRegression(max_iter=1_000)
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Logistic Regressions on all features + Polynomial degree 2')

### Feature selection with L1 regularization

In [None]:
X = poly_transform.fit_transform(feature_df[all_features])
y = feature_df['label'].values

def l1_objective(C=1.0):
    """Cross-validate an L1-penalised logistic regression on the global X, y.

    Parameters:
            C (float): inverse regularization strength passed to the model

    Returns:
            cv_metrics (list of dict): one `evaluate_performance` result per fold
    """
    def _score_fold(train_ix, valid_ix):
        # Standardise with statistics from the training fold only.
        fold_scaler = StandardScaler()
        train_features = fold_scaler.fit_transform(X[train_ix])
        valid_features = fold_scaler.transform(X[valid_ix])

        model = linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=C)
        model.fit(train_features, y[train_ix])
        return evaluate_performance(y[valid_ix], model.predict(valid_features))

    return [_score_fold(train_ix, valid_ix) for train_ix, valid_ix in kfold.split(X, y)]

In [None]:
from tqdm.cli import tqdm

In [None]:
# Sweep the inverse regularization strength and record the mean and standard
# deviation of the macro F1 over the CV folds for each value.
Cs = np.linspace(1e-3, 1.5, num=20)
f1_values = []
f1_stds   = []

for C in tqdm(Cs):
    cv_metrics = l1_objective(C=C)
    df = pd.DataFrame(cv_metrics)
    # Aggregate the scalar metrics only: the frame also holds confusion-matrix
    # arrays, which newer pandas refuses to average instead of silently
    # dropping them.
    numeric_df = df[numerical_metrics]
    print_df = pd.concat([numeric_df.mean(), numeric_df.std()], axis=1)
    print_df.columns = ['mean', 'std']
    f1_macro = print_df.loc['f1_macro']
    f1_values.append(f1_macro['mean'])
    f1_stds.append(f1_macro['std'])


In [None]:
# Mean macro F1 per C; the error bars show the standard deviation across folds.
sns.lineplot(x=Cs, y=f1_values)
sns.scatterplot(x=Cs, y=f1_values, color='green')
plt.errorbar(Cs, f1_values, f1_stds, linestyle='None')

plt.xlabel('C')
plt.ylabel('F1 macro')
plt.title('Choosing inverse regularization strength C for Lasso with CV');

In [None]:
Cs[1]

In [None]:
# Fit the L1-penalised model on the whole (scaled) polynomial feature matrix to
# read feature importances from its coefficients.
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

# NOTE(review): C=.2 is presumably picked from the CV curve above - confirm.
lasso_cls = linear_model.LogisticRegression(penalty='l1', C=.2, solver='liblinear')
lasso_cls.fit(X_scaled, y)

In [None]:
# Importance of a feature = largest absolute coefficient it receives in any of
# the per-class coefficient rows.
importance_coefs = np.abs(lasso_cls.coef_).max(axis=0)
# `get_feature_names` was removed from recent scikit-learn releases; prefer its
# replacement when available and fall back for older installations.
if hasattr(poly_transform, 'get_feature_names_out'):
    poly_feature_names = poly_transform.get_feature_names_out(all_features)
else:
    poly_feature_names = poly_transform.get_feature_names(all_features)
feature_importance = list(zip(poly_feature_names, importance_coefs))

important_features = feature_importance
# Ranked table, most important feature first.
importance_df = pd.DataFrame(important_features, columns = ['feature_name', 'importance']
                            ).sort_values(by='importance', ascending=False
                            ).reset_index(drop=True)

In [None]:
importance_df.head()

In [None]:
importance_df[importance_df['importance'] > 1e-2].shape

#### Choosing optimal number of features

In [None]:
def l2_objective_n_features(selected_features):
    """Cross-validate an L2 logistic regression on a subset of polynomial features.

    Parameters:
            selected_features: names of the polynomial feature columns to keep

    Returns:
            cv_metrics (list of dict): one `evaluate_performance` result per fold
    """
    X = poly_transform.fit_transform(feature_df[all_features])
    # `get_feature_names` was removed from recent scikit-learn releases; prefer
    # its replacement when available and fall back for older installations.
    if hasattr(poly_transform, 'get_feature_names_out'):
        poly_feature_names = poly_transform.get_feature_names_out(all_features)
    else:
        poly_feature_names = poly_transform.get_feature_names(all_features)
    X = pd.DataFrame(X, columns=poly_feature_names)
    X = X[selected_features]
    y = feature_df['label']

    cv_metrics = []

    for train_ix, valid_ix in kfold.split(X, y):
        X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
        y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

        # The scaler is fitted on the training fold only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)

        cls = linear_model.LogisticRegression(penalty='l2', max_iter=2_000)
        cls.fit(X_train, y_train)
        y_pred = cls.predict(X_valid)

        fold_metrics = evaluate_performance(y_valid, y_pred)
        cv_metrics.append(fold_metrics)
    return cv_metrics

In [None]:
# Evaluate the model on the top-n features (by lasso importance) for n = 1..60.
n_features = np.arange(1, 61)

scores = {
    'selected_features': [],
    'f1_macro': [],
    'f1_macro_std': [],
    'f1_min': [],
    'f1_min_std': [],
}

for n in tqdm(n_features):
    selected_features = importance_df['feature_name'].head(n).values
    cv_metrics = l2_objective_n_features(selected_features)
    df = pd.DataFrame(cv_metrics)
    # Aggregate the scalar metrics only: the frame also holds confusion-matrix
    # arrays, which newer pandas refuses to average instead of silently
    # dropping them.
    numeric_df = df[numerical_metrics]
    print_df = pd.concat([numeric_df.mean(), numeric_df.std()], axis=1)
    print_df.columns = ['mean', 'std']
    f1_macro = print_df.loc['f1_macro']
    f1_min = print_df.loc['f1_min']
    scores['selected_features'].append(selected_features)
    scores['f1_macro'].append(f1_macro['mean'])
    scores['f1_macro_std'].append(f1_macro['std'])

    scores['f1_min'].append(f1_min['mean'])
    scores['f1_min_std'].append(f1_min['std'])



In [None]:
# Mean macro F1 and mean worst-class F1 as a function of the number of
# top-ranked features used.
plt.figure(figsize=(16,6))
sns.lineplot(x=n_features, y=scores['f1_macro'])
sns.scatterplot(x=n_features, y=scores['f1_macro'], label='f1_macro')

sns.lineplot(x=n_features, y=scores['f1_min'])
sns.scatterplot(x=n_features, y=scores['f1_min'], label='f1_min')

plt.xlabel('Number of features')
plt.ylabel('F1 score')
plt.title('Choosing number of features');

In [None]:
top_23_features = importance_df['feature_name'].head(23).values
top_23_features

### Final model

In [None]:
# Final logistic regression: the 23 most important polynomial features.
X = poly_transform.fit_transform(feature_df[all_features])
# `get_feature_names` was removed from recent scikit-learn releases; prefer its
# replacement when available and fall back for older installations.
if hasattr(poly_transform, 'get_feature_names_out'):
    poly_feature_names = poly_transform.get_feature_names_out(all_features)
else:
    poly_feature_names = poly_transform.get_feature_names(all_features)
X = pd.DataFrame(X, columns=poly_feature_names)
X = X[top_23_features]
y = feature_df['label']
# Start from an empty list: without this reset the report below would also
# average in the fold metrics left over from the last iteration of the
# feature-count sweep.
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # The scaler is fitted on the training fold only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    cls = linear_model.LogisticRegression(max_iter=1_000)
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Logistic Regression on top 23 features')

## 4.2 Random Forest <a class="anchor" id="section_4_2"></a>

In [None]:
# Random forest with default hyperparameters on all engineered features
# (unscaled), evaluated with the shared stratified 5-fold split.
X = feature_df[all_features]
y = feature_df['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # NOTE(review): no random_state is passed, so scores vary between runs.
    cls = RandomForestClassifier()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Random Forest on all features')

In [None]:
# Refit a forest on the full dataset; its feature importances are read in the
# following cells.
X = feature_df[all_features]
y = feature_df['label']

# NOTE(review): no random_state is passed, so the importances (and the feature
# ranking derived from them) can change between runs.
cls = RandomForestClassifier()
cls.fit(X, y)

In [None]:
cls.feature_importances_

In [None]:
# Bar chart of the fitted forest's importance for every engineered feature.
plt.figure(figsize=(16, 6));
plt.title('Impurity-based feature importances of the forest')
plot = sns.barplot(x=all_features, y=cls.feature_importances_)
# Rotate the tick labels so the feature names do not overlap.
for item in plot.get_xticklabels():
    item.set_rotation(45)

In [None]:
# Ranked importance table for the forest, most important feature first.
# This rebinds `importance_df`, replacing the lasso-based ranking built for the
# logistic regression.
importance_df = pd.DataFrame(zip(all_features, cls.feature_importances_), columns = ['feature_name', 'importance']
                            ).sort_values(by='importance', ascending=False
                            ).reset_index(drop=True)
importance_df.head()

In [None]:
top_10_features = importance_df['feature_name'].head(10).values
top_10_features

In [None]:
def RF_objective_n_features(selected_features):
    """Cross-validate a default random forest on a subset of engineered features.

    Parameters:
            selected_features: names of the `feature_df` columns to train on

    Returns:
            cv_metrics (list of dict): one `evaluate_performance` result per fold
    """
    features = feature_df[selected_features]
    target = feature_df['label']

    def _score_fold(train_ix, valid_ix):
        forest = RandomForestClassifier()
        forest.fit(features.loc[train_ix], target.loc[train_ix])
        predictions = forest.predict(features.loc[valid_ix])
        return evaluate_performance(target.loc[valid_ix], predictions)

    return [
        _score_fold(train_ix, valid_ix)
        for train_ix, valid_ix in kfold.split(features, target)
    ]

In [None]:
# Evaluate the forest on the top-n features (by forest importance) for every
# possible n.
n_features = np.arange(1, len(all_features)+1)

scores = {
    'selected_features': [],
    'f1_macro': [],
    'f1_macro_std': [],
    'f1_min': [],
    'f1_min_std': [],
}

for n in tqdm(n_features):
    selected_features = importance_df['feature_name'].head(n).values
    cv_metrics = RF_objective_n_features(selected_features)
    df = pd.DataFrame(cv_metrics)
    # Aggregate the scalar metrics only: the frame also holds confusion-matrix
    # arrays, which newer pandas refuses to average instead of silently
    # dropping them.
    numeric_df = df[numerical_metrics]
    print_df = pd.concat([numeric_df.mean(), numeric_df.std()], axis=1)
    print_df.columns = ['mean', 'std']
    f1_macro = print_df.loc['f1_macro']
    f1_min = print_df.loc['f1_min']
    scores['selected_features'].append(selected_features)
    scores['f1_macro'].append(f1_macro['mean'])
    scores['f1_macro_std'].append(f1_macro['std'])

    scores['f1_min'].append(f1_min['mean'])
    scores['f1_min_std'].append(f1_min['std'])



In [None]:
# Mean macro F1 and mean worst-class F1 of the forest as a function of the
# number of top-ranked features used.
plt.figure(figsize=(16,6))
sns.lineplot(x=n_features, y=scores['f1_macro'])
sns.scatterplot(x=n_features, y=scores['f1_macro'], label='f1_macro')

sns.lineplot(x=n_features, y=scores['f1_min'])
sns.scatterplot(x=n_features, y=scores['f1_min'], label='f1_min')

plt.xlabel('Number of features')
plt.ylabel('F1 score')
plt.title('Choosing number of features for Random Forest');

In [None]:
top_7_features = importance_df['feature_name'].head(7).values
top_7_features

In [None]:
# Random forest with default hyperparameters restricted to the 7 top-ranked
# features.
X = feature_df[top_7_features]
y = feature_df['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # NOTE(review): no random_state is passed, so scores vary between runs.
    cls = RandomForestClassifier()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Random Forest on top 7 features')

### out-of-bag errors

Each tree in a random forest is trained on a bootstrap subsample of the original data. We therefore expect that, for each training sample, there are trees in the forest that did not see this sample during training. Thus, we can make a good estimate of the **generalization ability** of our forest even without a holdout set or cross-validation.

In [None]:
# Fit on the whole dataset with out-of-bag scoring enabled, so that
# `oob_decision_function_` is available afterwards.
X = feature_df[top_7_features]
y = feature_df['label']

cls = RandomForestClassifier(n_estimators=100, oob_score=True)
cls.fit(X, y)

In [None]:
# Out-of-bag prediction per training sample: the column with the highest
# out-of-bag score. The column index is compared directly with y, which holds
# the integer codes produced by target_le.
# NOTE(review): presumably the column order matches those codes - confirm
# against cls.classes_.
oob_preds = cls.oob_decision_function_.argmax(axis=1)
confusion_matrix(y, oob_preds)

In [None]:
print_cv_metrics([evaluate_performance(y, oob_preds)], 'out-of-bag performance of RF on top 7 features')

### Tuning Random Forest on 5 features

In [None]:
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample

In [None]:
# Hyperopt search space for the random forest:
# - n_estimators: integer in [20, 1000]
# - max_depth: either unlimited (None) or an integer in [1, 20]
# - criterion: split quality measure
space = {
    "n_estimators": scope.int(hp.quniform("n_estimators", 20, 1000, 1)),
    "max_depth": hp.choice('max_depth', [None, scope.int(hp.quniform("max_depth_int", 1, 20,1))] ),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

# Number of objective evaluations for the (currently commented-out) search.
MAX_EVALS = 100

In [None]:
top_5_features = importance_df['feature_name'].head(5).values
top_5_features

In [None]:
# Reference score before tuning: default forest on the 5 top-ranked features,
# with a fixed seed so it is comparable with the tuned model.
X = feature_df[top_5_features]
y = feature_df['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    cls = RandomForestClassifier(
        random_state=1
    )

    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Random Forest on top 5 features with default hyperparams')

In [None]:
X = feature_df[top_5_features]
y = feature_df['label']

def RF_objective_hyperparams(params):
    """Hyperopt objective: cross-validated loss of a forest on the global X, y.

    Parameters:
            params (dict): must provide 'n_estimators', 'max_depth' and 'criterion'

    Returns:
            dict with 'loss' (1 - mean over folds of the worst per-class F1),
            the evaluated 'params' and the hyperopt 'status'
    """
    fold_results = []

    for train_ix, valid_ix in kfold.split(X, y):
        forest = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            criterion=params['criterion'],
            random_state=1
        )
        forest.fit(X.loc[train_ix], y.loc[train_ix])
        predictions = forest.predict(X.loc[valid_ix])
        fold_results.append(evaluate_performance(y.loc[valid_ix], predictions))

    mean_worst_f1 = pd.DataFrame(fold_results)['f1_min'].mean()
    return {'loss': 1 - mean_worst_f1, 'params': params, 'status': STATUS_OK}

In [None]:
%%time
RF_objective_hyperparams({'n_estimators': 100, 'criterion': 'gini', 'max_depth': None})

In [None]:
bayes_trials = Trials()

# Optimize
# best = fmin(fn = RF_objective_hyperparams, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = bayes_trials)

In [None]:
# best_params = space_eval(space, best)
best_params = {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 274}
best_params

In [None]:
# Forest with the hyperparameters stored in best_params, on the 5 top-ranked
# features; same seed as the untuned reference run.
X = feature_df[top_5_features]
y = feature_df['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    cls = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        criterion=best_params['criterion'],
        random_state=1
    )

    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Tuned Random Forest on top 5 features')

# 5. Other experiments <a class="anchor" id="section_5"></a>

## 5.1 Using raw data + smaller window size <a class="anchor" id="section_5_1"></a>

### With original window

In [None]:
# Random forest trained directly on the flattened raw readings (no feature
# engineering); the target is the string label.
X = raw_data[x_acc_cols+y_acc_cols+z_acc_cols]
y = raw_data['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # NOTE(review): no random_state is passed, so scores vary between runs.
    cls = RandomForestClassifier()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'Random Forest on raw data')

In [None]:
from sklearn import svm

In [None]:
# SVM with default settings trained directly on the flattened raw readings;
# the inputs are not scaled.
X = raw_data[x_acc_cols+y_acc_cols+z_acc_cols]
y = raw_data['label']
cv_metrics = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    cls = svm.SVC()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)

    fold_metrics = evaluate_performance(y_valid, y_pred)
    cv_metrics.append(fold_metrics)
    print(fold_metrics['CM_not_normalized'])
print_cv_metrics(cv_metrics, 'SVM (rbf) on raw data')

### Switch to binary classification

Let's work with binary classification (stairs vs walking) for simplicity, and set up a baseline on raw data.

In [None]:
# Keep only the two hard-to-separate classes. The index is reset so that the
# positional fold indices from kfold can be used with .loc.
raw_stairs = raw_data[raw_data['label'].isin(['stairs', 'walking'])].reset_index(drop=True).copy()
# Binary target: True for 'stairs', False for 'walking'.
raw_stairs['is_stairs'] = raw_stairs['label'] == 'stairs'
raw_stairs.shape

In [None]:
# Binary baseline: default SVM on full-length raw frames, scored with the F1 of
# the positive ('stairs') class.
X = raw_stairs[x_acc_cols+y_acc_cols+z_acc_cols]
y = raw_stairs['is_stairs']
# cv_metrics is initialised but not used in this cell; only f_scores is filled.
cv_metrics = []
f_scores = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    cls = svm.SVC()
    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_valid)
    f_scores.append(f1_score(y_valid, y_pred))
print(f'F1 score mean: {round(np.mean(f_scores), 2)}, std: {round(np.std(f_scores), 2)}')

### Split to smaller window, then assemble predictions

In [None]:
def split_frames_to_windows(X, window_size = 20):
    """Create all sliding windows of a smaller size from a batch of timeseries.

    Parameters:
            X: array of shape (n_frames, n_axes, frame_length)
            window_size (int): length of each window, between 1 and frame_length

    Returns:
            Array of shape (n_windows * n_frames, n_axes, window_size) where
            n_windows = frame_length - window_size + 1. The output is ordered
            window-major: the first n_frames entries are the windows starting at
            step 0 of every frame, the next n_frames start at step 1, and so on.

    Raises:
            ValueError: if window_size is not in [1, frame_length]
    """
    frame_length = X.shape[-1]
    if not 1 <= window_size <= frame_length:
        raise ValueError(
            f'window_size must be between 1 and the frame length ({frame_length}), got {window_size}'
        )
    n_windows = frame_length - window_size + 1

    windows = [X[:, :, start:start + window_size] for start in range(n_windows)]
    return np.vstack(windows)

In [None]:
np.random.seed(1)
sample_X = np.random.randint(6, size=(1,3,4))
sample_X

In [None]:
sample_windows = split_frames_to_windows(sample_X, window_size=3)
assert len(sample_windows)      == 2 # we expect to get 2 windows
assert sample_windows.shape[1:] == (3,3) # we expect each window to be 3x3

sample_windows

In [None]:
def assemble_ts_parts(X):
    """Rebuild per-axis timeseries from the flat column layout.

    Selects the X, Y and Z column groups of the DataFrame `X` and stacks them
    along a new second axis, giving an array of shape
    (n_rows, 3, columns per axis).
    """
    axis_blocks = [X[axis_cols].values for axis_cols in (x_acc_cols, y_acc_cols, z_acc_cols)]
    return np.stack(axis_blocks, axis=1)

In [None]:
def fold_predictions(y_pred, n_windows):
    """Average window-level predictions back into one score per frame.

    `y_pred` is expected in window-major order (all frames for window 0, then
    all frames for window 1, ...); the result holds, for every frame, the mean
    of its predictions over the `n_windows` windows.
    """
    per_window = y_pred.reshape((n_windows, -1))
    frame_scores = per_window.mean(axis=0)
    return frame_scores

sample_preds = np.array([1, 0, 1, 1, 0, 0])
expected_preds = np.array([1., 0., .5])

actual_preds = fold_predictions(sample_preds, n_windows=2)

assert np.allclose(actual_preds, expected_preds)

In [None]:
# Windowed variant of the binary SVM baseline: every frame is cut into all
# sliding windows of length window_size, the SVM is trained on the windows and
# a frame is predicted 'stairs' when more than `threshold` of its windows are.
window_size = 29
threshold = .5
# Number of sliding windows per frame (2 for a window of 29 in a frame of 30).
n_windows = FRAME_LENGTH - window_size + 1

X = raw_stairs[x_acc_cols+y_acc_cols+z_acc_cols]
y = raw_stairs['is_stairs']

f_scores = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # Training windows, flattened back to one row per window.
    ts_frames = assemble_ts_parts(X_train)
    X_train_extended = split_frames_to_windows(ts_frames, window_size=window_size)
    X_train_extended = X_train_extended.reshape(len(X_train_extended), -1)
    # The windows are ordered window-major, so tiling the labels keeps every
    # window aligned with the label of its source frame.
    y_train_extended = np.tile(y_train, n_windows)

    cls = svm.SVC()
    cls.fit(X_train_extended, y_train_extended)
    X_valid_extended = assemble_ts_parts(X_valid)
    X_valid_extended = split_frames_to_windows(X_valid_extended, window_size=window_size)
    X_valid_extended = X_valid_extended.reshape(len(X_valid_extended), -1)

    # Average the window-level votes per frame, then threshold.
    y_pred_extended = cls.predict(X_valid_extended)
    y_pred = fold_predictions(y_pred_extended, n_windows=n_windows)
    y_pred = y_pred > threshold
    f_scores.append(f1_score(y_valid, y_pred))

print(f'F1 score mean: {round(np.mean(f_scores), 2)}, std: {round(np.std(f_scores), 2)}')

In [None]:
# Same windowed experiment as the previous cell, with a shorter window of 20
# readings (11 windows per frame of 30).
window_size = 20
threshold = .5
n_windows = FRAME_LENGTH - window_size + 1

X = raw_stairs[x_acc_cols+y_acc_cols+z_acc_cols]
y = raw_stairs['is_stairs']

f_scores = []

for train_ix, valid_ix in kfold.split(X, y):
    X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
    y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

    # Training windows, flattened back to one row per window.
    ts_frames = assemble_ts_parts(X_train)
    X_train_extended = split_frames_to_windows(ts_frames, window_size=window_size)
    X_train_extended = X_train_extended.reshape(len(X_train_extended), -1)
    # Labels tiled to match the window-major ordering of the windows.
    y_train_extended = np.tile(y_train, n_windows)

    cls = svm.SVC()
    cls.fit(X_train_extended, y_train_extended)
    X_valid_extended = assemble_ts_parts(X_valid)
    X_valid_extended = split_frames_to_windows(X_valid_extended, window_size=window_size)
    X_valid_extended = X_valid_extended.reshape(len(X_valid_extended), -1)

    # Average the window-level votes per frame, then threshold.
    y_pred_extended = cls.predict(X_valid_extended)
    y_pred = fold_predictions(y_pred_extended, n_windows=n_windows)
    y_pred = y_pred > threshold
    f_scores.append(f1_score(y_valid, y_pred))

print(f'F1 score mean: {round(np.mean(f_scores), 2)}, std: {round(np.std(f_scores), 2)}')

In [None]:
def get_SVM_F1_on_window_size(window_size, threshold=.5):
    """Cross-validated F1 of the windowed binary SVM (stairs vs walking).

    Every frame is cut into all sliding windows of length `window_size`; an SVM
    is trained on the windows and a frame is predicted 'stairs' when the share
    of its windows predicted 'stairs' exceeds `threshold`.

    Parameters:
            window_size (int): window length, at most FRAME_LENGTH
            threshold (float): minimum share of positive windows (exclusive)
            for a frame to be predicted positive

    Returns:
            f_scores (list of float): F1 score of each validation fold
    """
    n_windows = FRAME_LENGTH - window_size + 1

    X = raw_stairs[x_acc_cols+y_acc_cols+z_acc_cols]
    y = raw_stairs['is_stairs']

    def _to_window_rows(frames_df):
        # All sliding windows of the frames, flattened to one row per window.
        windows = split_frames_to_windows(assemble_ts_parts(frames_df), window_size=window_size)
        return windows.reshape(len(windows), -1)

    f_scores = []

    for train_ix, valid_ix in kfold.split(X, y):
        X_train, X_valid = X.loc[train_ix], X.loc[valid_ix]
        y_train, y_valid = y.loc[train_ix], y.loc[valid_ix]

        X_train_extended = _to_window_rows(X_train)
        # The windows are ordered window-major, so tiling the labels keeps
        # every window aligned with the label of its source frame.
        y_train_extended = np.tile(y_train, n_windows)

        cls = svm.SVC()
        cls.fit(X_train_extended, y_train_extended)

        # Average the window-level votes per frame, then threshold.
        y_pred_extended = cls.predict(_to_window_rows(X_valid))
        y_pred = fold_predictions(y_pred_extended, n_windows=n_windows)
        y_pred = y_pred > threshold
        f_scores.append(f1_score(y_valid, y_pred))

    return f_scores


In [None]:
get_SVM_F1_on_window_size(30)

In [None]:
# Sweep the window size from the full frame (30) down to 10 and record the mean
# and standard deviation of the fold F1 scores for each size.
window_sizes = np.arange(30, 9, -1)
f_scores_means = []
f_scores_stds  = []

for window_size in tqdm(window_sizes):
    f_scores_cv = get_SVM_F1_on_window_size(window_size)
    f_scores_means.append(np.mean(f_scores_cv))
    f_scores_stds.append(np.std(f_scores_cv))

In [None]:
# Mean F1 per window size; the error bars show the standard deviation across
# folds.
plt.figure(figsize=(14,6))
plt.title('Binary classification performance (stairs vs walking) on raw data')
plt.xlabel('Window size');
plt.ylabel('F1 score')
sns.lineplot(x=window_sizes, y=f_scores_means)
sns.scatterplot(x=window_sizes, y=f_scores_means, s=100);
plt.errorbar(window_sizes, f_scores_means, f_scores_stds, linestyle='None')

## 5.2 Median filter visualization <a class="anchor" id="section_5_2"></a>

In [None]:
from scipy import signal as sig

In [None]:
# One panel per activity: the raw signal of the first frame of that class
# (faded) overlaid with its median-filtered version (solid, same colour).
fig, axes = plt.subplots(2, 2, figsize=(20, 20))
axes = axes.flatten()

# Time axis derived from the frame length instead of a duplicated literal.
steps = np.arange(FRAME_LENGTH)

for ax, label in zip(axes, classes):
    sample = raw_data[raw_data['label'] == label].iloc[0]
    # The row mixes floats with the string label, hence the explicit casts.
    sns.lineplot(y=sample[x_acc_cols].astype(float), color='g', x=steps, alpha=.3, label='X', ax=ax)
    sns.lineplot(y=sample[y_acc_cols].astype(float), color='r', x=steps, alpha=.3, label='Y', ax=ax)
    sns.lineplot(y=sample[z_acc_cols].astype(float), color='b', x=steps, alpha=.3, label='Z', ax=ax).set_title(label)

    # Median filter over 7 consecutive readings.
    sns.lineplot(y=sig.medfilt(sample[x_acc_cols].astype(float), kernel_size=7), x=steps, label='medfilt7 X', color='g', ax=ax)
    sns.lineplot(y=sig.medfilt(sample[y_acc_cols].astype(float), kernel_size=7), x=steps, label='medfilt7 Y', color='r', ax=ax)
    sns.lineplot(y=sig.medfilt(sample[z_acc_cols].astype(float), kernel_size=7), x=steps, label='medfilt7 Z', color='b', ax=ax)

    # Shared y-range so the panels are visually comparable.
    ax.set_ylim(-40, 40)
    ax.set_xlabel('time step')
    ax.set_ylabel('m/$s^2$')

We can see that a median filter can be used to denoise the accelerometer signal and provides a good basis for further feature engineering.