In [1]:
import pandas as pd
import numpy as np

# Feature Extraction From the Time Series

In [2]:
# Load the per-sample recordings; every CSV row carries the "id" of the
# recording it belongs to.
df = pd.read_csv("raw_deep_learning_data.csv")
# One placeholder slot (0 = "no rows collected yet") per recording id.
# Size the list by the largest id rather than by the id of the last row, so it
# is long enough even when the CSV is not sorted by id.
raw_data = [0] * (int(df["id"].max()) + 1)
df.head()

Unnamed: 0,id,gesture,raw_ax,raw_ay,raw_az,pitch,roll
0,0,A,2.373524,-9.51134,1.56321,-10.2797,-82.49077
1,0,A,2.627309,-10.11468,1.706863,-11.22794,-82.40876
2,0,A,3.048688,-9.805827,1.989378,-12.14,-82.24461
3,0,A,3.438943,-9.530494,2.451459,-12.95901,-82.03904
4,0,A,3.652027,-9.669357,2.949453,-13.72182,-81.82306


In [3]:
# Number of slots in raw_data: one per recording id, starting at id 0.
len(raw_data)

120

In [4]:
# Group the CSV rows by recording id: raw_data[i] becomes the list of rows
# (each a plain Python list in DataFrame column order) belonging to
# recording i, kept in their original file order.
#
# Each group is assigned in one step instead of being appended row by row, so
# the cell is idempotent: re-running it no longer appends duplicate rows to
# recordings that were already filled. It also avoids the slow per-row
# df.iterrows() loop.
for recording_id, group in df.groupby("id", sort=False):
    raw_data[recording_id] = group.values.tolist()

# Peek at the first ten samples of recording 0.
raw_data[0][:10]

[[0, 'A', 2.373524, -9.51134, 1.56321, -10.2797, -82.49077],
 [0, 'A', 2.627309, -10.11468, 1.706863, -11.22794, -82.40876],
 [0, 'A', 3.048688, -9.805827, 1.989378, -12.14, -82.24461],
 [0, 'A', 3.438943, -9.530494, 2.451459, -12.95901, -82.03904],
 [0, 'A', 3.652027, -9.669357, 2.949453, -13.72182, -81.82306],
 [0, 'A', 3.822016, -9.564013, 3.344497, -14.51854, -81.60957],
 [0, 'A', 3.836381, -9.877653, 3.686867, -15.28141, -81.41567],
 [0, 'A', 3.541894, -10.35889, 4.723557, -15.88499, -81.21661],
 [0, 'A', 3.204312, -11.02926, 5.906292, -16.42527, -80.95861],
 [0, 'A', 3.192341, -11.5512, 6.698773, -17.00583, -80.57973]]

In [5]:
def mean_abs_difference(x):
    """Return the mean absolute deviation of ``x`` about its own mean.

    Computes ``mean(|x - mean(x)|)`` over all elements of ``x``.
    """
    centre = np.mean(x)
    deviations = np.abs(x - centre)
    return np.mean(deviations)

In [6]:
def magnitude(ax, ay, az):
    """Return the element-wise Euclidean norm ``sqrt(ax^2 + ay^2 + az^2)``.

    Parameters
    ----------
    ax, ay, az : array-like or scalar
        The three components. They may be NumPy arrays, plain Python
        sequences, or scalars; they must broadcast against each other.

    Returns
    -------
    The magnitude, with the broadcast shape of the inputs.
    """
    # np.square (unlike the ``**`` operator) also accepts plain Python lists.
    return np.sqrt(np.square(ax) + np.square(ay) + np.square(az))

In [7]:
def energy(x):
    """Return the sum of the squared elements of ``x``.

    ``x`` may be a NumPy array or any array-like such as a plain list.
    """
    # np.square (unlike the ``**`` operator) also accepts plain Python lists.
    return np.sum(np.square(x))

In [8]:
from scipy.fft import rfft, rfftfreq

def dominant_frequency(x, hz):
    """Return the frequency of the strongest non-DC component of ``x``.

    Parameters
    ----------
    x : array-like
        The sampled signal.
    hz : float
        Sampling rate; the sample spacing passed to ``rfftfreq`` is ``1/hz``.

    Returns
    -------
    float
        The frequency of the largest-magnitude real-FFT bin, excluding bin 0.
        Returns ``0.0`` when the signal has fewer than two samples, because
        such a signal has no non-DC bin to pick from.
    """
    x = np.asarray(x, dtype=float)
    if x.size < 2:
        # Only the DC bin exists (or there is no data at all); argmax over the
        # empty non-DC slice would raise ValueError.
        return 0.0
    # Remove the mean so the DC offset does not dominate the spectrum.
    x = x - np.mean(x)
    fft_vals = np.abs(rfft(x))
    freqs = rfftfreq(len(x), 1/hz)
    # Skip bin 0 when searching, then shift the index back by one.
    return freqs[np.argmax(fft_vals[1:]) + 1]

In [9]:
def corr(x, y):
    """Return the correlation coefficient between ``x`` and ``y``.

    Takes the off-diagonal entry of the 2x2 matrix produced by
    ``np.corrcoef(x, y)``.
    """
    matrix = np.corrcoef(x, y)
    return matrix[0][1]

In [10]:
def orientation_features(x):
    """Summarise one orientation signal.

    Returns a dict with the keys ``"mean"``, ``"std"``, ``"range"``
    (max minus min) and ``"mad"`` (result of ``mean_abs_difference``).
    """
    lowest = np.min(x)
    highest = np.max(x)

    summary = {}
    summary["mean"] = np.mean(x)
    summary["std"] = np.std(x)
    summary["range"] = highest - lowest
    summary["mad"] = mean_abs_difference(x)
    return summary


In [11]:
def accelerometer_features(x, hz):
    """Summarise one accelerometer axis.

    Returns a dict with the keys ``"mean"``, ``"std"``, ``"energy"``,
    ``"mad"`` and ``"dom_freq"``; ``hz`` is forwarded to
    ``dominant_frequency`` as the sampling rate.
    """
    summary = {}
    summary["mean"] = np.mean(x)
    summary["std"] = np.std(x)
    summary["energy"] = energy(x)
    summary["mad"] = mean_abs_difference(x)
    summary["dom_freq"] = dominant_frequency(x, hz)
    return summary

In [12]:
def extract_features(ax, ay, az, roll, pitch, hz):
    """Build the flat feature dict for one recording.

    Parameters
    ----------
    ax, ay, az : accelerometer axis signals.
    roll, pitch : orientation signals (note the order: roll before pitch).
    hz : sampling rate, forwarded to ``accelerometer_features``.

    Returns
    -------
    dict
        ``<stat>_<signal>`` entries for each accelerometer axis and each
        orientation signal, followed by magnitude statistics, pairwise
        correlations and ``num_samples`` (the length of ``ax``).
    """
    feats = {}

    # Per-axis accelerometer statistics, keyed as "<stat>_<axis>".
    accel_axes = {"ax": ax, "ay": ay, "az": az}
    for axis_name, signal in accel_axes.items():
        stats = accelerometer_features(signal, hz)
        feats.update({f"{stat}_{axis_name}": value for stat, value in stats.items()})

    # Orientation statistics, pitch first and then roll.
    angles = {"pitch": pitch, "roll": roll}
    for angle_name, signal in angles.items():
        stats = orientation_features(signal)
        feats.update({f"{stat}_{angle_name}": value for stat, value in stats.items()})

    # Statistics of the acceleration magnitude.
    mag = magnitude(ax, ay, az)
    feats.update(
        mean_mag=np.mean(mag),
        std_mag=np.std(mag),
        energy_mag=energy(mag),
    )

    # Pairwise correlations between signals.
    feats.update(
        corr_ax_ay=corr(ax, ay),
        corr_ay_az=corr(ay, az),
        corr_ax_az=corr(ax, az),
        corr_roll_pitch=corr(roll, pitch),
    )

    feats["num_samples"] = len(ax)

    return feats

In [13]:
# Column order of the feature table X.
FEATURE_COLUMNS = [
    # Accelerometer (dynamic)
    "mean_ax", "std_ax", "energy_ax", "mad_ax", "dom_freq_ax",
    "mean_ay", "std_ay", "energy_ay", "mad_ay", "dom_freq_ay",
    "mean_az", "std_az", "energy_az", "mad_az", "dom_freq_az",

    # Orientation (smooth)
    "mean_roll", "std_roll", "range_roll", "mad_roll",
    "mean_pitch", "std_pitch", "range_pitch", "mad_pitch",

    # Magnitude
    "mean_mag", "std_mag", "energy_mag",

    # Correlations
    "corr_ax_ay", "corr_ay_az", "corr_ax_az",
    "corr_roll_pitch",

    # Length
    "num_samples"
]

# Sampling rate handed to extract_features as `hz`.
# TODO confirm this matches the rate the recordings were captured at.
SAMPLE_RATE_HZ = 100

feature_rows = []
Y = []

for data in raw_data:
    # Skip ids that never received any rows (still the 0 placeholder or an
    # empty list); they have no signal and no label to extract.
    if not data:
        continue

    # Each stored row is [id, gesture, ax, ay, az, pitch, roll]; transpose the
    # five signal columns into one array per signal.
    signal_columns = zip(*(row[2:7] for row in data))
    ax, ay, az, pitch, roll = (np.array(column) for column in signal_columns)

    feature_rows.append(extract_features(ax, ay, az, roll, pitch, SAMPLE_RATE_HZ))
    # The gesture label is the same for every row of a recording; take it from
    # the first row.
    Y.append(data[0][1])

# Build the frame once from all records. Growing it with pd.concat inside the
# loop copies the whole frame on every iteration, and concatenating onto an
# empty frame is deprecated in recent pandas.
X = pd.DataFrame(feature_rows, columns=FEATURE_COLUMNS)

print(Y)
X

['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B']


Unnamed: 0,mean_ax,std_ax,energy_ax,mad_ax,dom_freq_ax,mean_ay,std_ay,energy_ay,mad_ay,dom_freq_ay,...,range_pitch,mad_pitch,mean_mag,std_mag,energy_mag,corr_ax_ay,corr_ay_az,corr_ax_az,corr_roll_pitch,num_samples
0,2.624217,3.270584,1687.990556,2.794825,2.083333,-10.544538,2.10876,11100.878933,1.761517,3.125,...,14.47737,3.497665,12.572289,2.595537,15820.728247,-0.224467,-0.825769,0.17593,0.293864,96
1,3.021879,2.985843,1552.042738,2.624541,2.325581,-10.422028,2.254576,9778.352586,1.831099,2.325581,...,10.19197,2.611739,11.932576,2.534736,12797.767394,-0.267864,-0.819482,0.160877,-0.049971,86
2,3.275029,2.385653,1559.629817,2.097158,2.105263,-10.133152,1.852248,10080.601896,1.51756,2.105263,...,12.32607,2.647815,11.408323,2.358666,12892.747913,-0.33886,-0.853747,0.306315,0.097698,95
3,3.57215,2.018031,1716.935974,1.79559,1.960784,-10.177163,2.125607,11025.469782,1.847573,1.960784,...,9.22315,2.362446,11.471429,2.540747,14081.005489,-0.177014,-0.899021,0.160158,0.291953,102
4,3.911886,2.668137,2130.071777,2.384741,2.105263,-10.286784,1.962749,10418.679557,1.578904,2.105263,...,15.06679,3.104269,11.733839,2.182891,13532.558542,-0.31625,-0.85952,0.355742,0.18494,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4.985386,5.203824,3531.50282,4.478247,1.470588,-3.110794,4.064344,1781.323211,3.495812,1.470588,...,14.88328,4.290215,10.601822,7.515011,11483.433722,-0.227906,-0.867462,0.500293,0.118882,68
116,6.209743,4.507199,4180.17821,3.87068,2.816901,-4.055798,3.190497,1890.6425,2.772639,1.408451,...,13.94862,3.363284,11.631332,4.782573,11229.422778,-0.075672,-0.682704,0.322441,0.355692,71
117,6.693807,5.858264,5301.462546,5.084698,2.985075,-4.207111,3.532083,2021.751812,2.992808,1.492537,...,16.68397,3.699883,12.152679,6.12242,12406.499017,0.040083,-0.71621,0.262761,0.419859,67
118,6.029043,5.820994,5127.033345,5.255977,2.739726,-4.044099,3.864809,2284.27813,3.46898,1.369863,...,16.1573,3.711547,11.476241,6.501973,12700.522831,-0.304035,-0.807622,0.677822,-0.390902,73


In [14]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 20% of the recordings for testing; random_state pins the split.
# NOTE(review): the split is not stratified — consider stratify=Y so both
# gestures keep the same proportion in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Random forest with default hyperparameters. random_state is fixed so that
# the bootstrap samples and feature sub-sampling (and therefore the reported
# scores and feature importances) are reproducible across runs.
rf = RandomForestClassifier(random_state=10)

In [32]:
# Fit the forest on the training split.
rf.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [33]:
# Predict gesture labels for the held-out recordings.
y_pred = rf.predict(X_test)

In [34]:
# Score the fitted forest on the held-out split.
rf.score(X_test, y_test)

1.0

In [39]:
from sklearn.metrics import classification_report
# Recomputes the held-out predictions (repeats the earlier rf.predict call).
y_pred = rf.predict(X_test)

In [42]:
# Per-class precision / recall / F1 and support on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00        15
           B       1.00      1.00      1.00         9

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24



In [43]:
# One row per feature column of X, holding the importance the fitted forest
# reports for it (single unnamed column 0).
features = pd.DataFrame(
    data=rf.feature_importances_,
    index=X.columns,
)
features

Unnamed: 0,0
mean_ax,0.0
std_ax,0.081176
energy_ax,0.0
mad_ax,0.07
dom_freq_ax,0.009886
mean_ay,0.05286
std_ay,0.09
energy_ay,0.11
mad_ay,0.02
dom_freq_ay,0.030416


# Hyperparameters

In [None]:
# Second forest created with default settings. This cell has no execution
# count and rf2 is not fitted or used in the visible cells.
# TODO: set the hyperparameters to explore, or remove this stub.
rf2 = RandomForestClassifier()

In [35]:
import joblib

# Persist the fitted forest to model.pkl in the current working directory.
# NOTE: joblib files are pickle-based, so only load them from trusted sources.
joblib.dump(rf, "model.pkl")

['model.pkl']