# Setup

In [55]:
import os
import copy
import re

# Progress bar
from tqdm import tqdm

# Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [56]:
project_abs_path = "/Users/fujiwaraseita/Desktop/microbit_remote_control_ML/Machine-Learning/"

train_raw = pd.read_csv(project_abs_path + 'train_raw.csv')
train_fft =  pd.read_csv(project_abs_path + 'train_fft.csv')
train_median_fft = pd.read_csv(project_abs_path + 'train_median_fft.csv')

In [57]:
train_raw.head()

Unnamed: 0,Timestamp,Gesture,Id,X,Y,Z,Up,Down,Left,Right,Shuffle,Circle
0,20241103010152562,circle,1,-0.099415,0.278638,-0.139601,0,0,0,0,0,1
1,20241103010152577,circle,1,-0.115789,0.250774,-0.245014,0,0,0,0,0,1
2,20241103010152592,circle,1,-0.120468,0.229102,-0.367521,0,0,0,0,0,1
3,20241103010152622,circle,1,-0.122807,0.226006,-0.310541,0,0,0,0,0,1
4,20241103010152638,circle,1,-0.120468,0.266254,-0.182336,0,0,0,0,0,1


In [44]:
train_fft.head()

Unnamed: 0,Timestamp,Gesture,Id,X_fft,Y_fft,Z_fft,Up,Down,Left,Right,Shuffle,Circle
0,20241103010152562,circle,1,1.582456,1.513932,19.717949,0,0,0,0,0,1
1,20241103010152562,circle,1,5.165393,9.511628,4.713491,0,0,0,0,0,1
2,20241103010152562,circle,1,11.614201,4.659314,11.994529,0,0,0,0,0,1
3,20241103010152562,circle,1,15.675558,10.471815,16.073472,0,0,0,0,0,1
4,20241103010152562,circle,1,7.473096,5.821593,8.57157,0,0,0,0,0,1


In [None]:
# train_median_fft.head()

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# データの集約
agg_funcs = {
    'X': ['mean', 'std'],
    'Y': ['mean', 'std'],
    'Z': ['mean', 'std']
}
train_agg = train_raw.groupby(['Id', 'Gesture']).agg(agg_funcs).reset_index()
train_agg.columns = ['Id', 'Gesture'] + [f'{axis}_{stat}' for axis in ['X', 'Y', 'Z'] for stat in ['mean', 'std']]

In [63]:
train_agg

Unnamed: 0,Id,Gesture,X_mean,X_std,Y_mean,Y_std,Z_mean,Z_std
0,1,circle,-0.021099,0.456943,0.020186,0.365891,-0.262906,0.454169
1,1,down,0.183738,0.349831,0.446610,0.434055,-0.164373,0.392457
2,1,left,-0.089775,0.493994,0.226242,0.489834,0.043228,0.388637
3,1,right,-0.001169,0.385533,0.022840,0.414084,0.213518,0.269150
4,1,shuffle,0.035126,0.741736,-0.068123,0.756695,-0.077475,0.450909
...,...,...,...,...,...,...,...,...
195,35,circle,0.181174,0.410598,0.206438,0.429039,-0.116005,0.488156
196,35,down,-0.270360,0.318114,0.142090,0.650610,-0.130385,0.322980
197,35,left,-0.081195,0.473189,0.431459,0.420488,0.129426,0.353115
198,35,right,0.128802,0.420944,0.262456,0.499406,0.136765,0.272278


In [65]:
# 特徴量とターゲットの分割
X = train_agg.drop(columns=['Id', 'Gesture'])
y = train_agg['Gesture']

# ハイパーパラメータの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# グリッドサーチの設定
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X, y)

# 最適なハイパーパラメータと最良モデルの取得
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# クロスバリデーションで精度評価
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')  # 5分割クロスバリデーション
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation accuracy: {cv_scores.mean():.2f}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Cross-validation scores: [0.6   0.575 0.675 0.7   0.725]
Average cross-validation accuracy: 0.66


In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# データの集約
agg_funcs = {
    'X_fft': ['mean', 'std', 'max', 'min'],
    'Y_fft': ['mean', 'std', 'max', 'min'],
    'Z_fft': ['mean', 'std', 'max', 'min']
}
train_agg = train_median_fft.groupby(['Id', 'Gesture']).agg(agg_funcs).reset_index()
train_agg.columns = ['Id', 'Gesture'] + [f'{axis}_{stat}' for axis in ['X', 'Y', 'Z'] for stat in ['mean', 'std', 'max', 'min']]

# 特徴量とターゲットの分割
X = train_agg.drop(columns=['Id', 'Gesture'])
y = train_agg['Gesture']

# ハイパーパラメータの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# グリッドサーチの設定
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X, y)

# 最適なハイパーパラメータと最良モデルの取得
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# クロスバリデーションで精度評価
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')  # 5分割クロスバリデーション
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation accuracy: {cv_scores.mean():.2f}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# データの集約
agg_funcs = {
    'X_fft': ['mean', 'std', 'max', 'min'],
    'Y_fft': ['mean', 'std', 'max', 'min'],
    'Z_fft': ['mean', 'std', 'max', 'min']
}
train_agg = train_fft.groupby(['Id', 'Gesture']).agg(agg_funcs).reset_index()
train_agg.columns = ['Id', 'Gesture'] + [f'{axis}_{stat}' for axis in ['X', 'Y', 'Z'] for stat in ['mean', 'std', 'max', 'min']]

# 特徴量とターゲットの分割
X = train_agg.drop(columns=['Id', 'Gesture'])
y = train_agg['Gesture']

# ハイパーパラメータの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# グリッドサーチの設定
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X, y)

# 最適なハイパーパラメータと最良モデルの取得
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# クロスバリデーションで精度評価
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')  # 5分割クロスバリデーション
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation accuracy: {cv_scores.mean():.2f}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Cross-validation scores: [0.525 0.65  0.6   0.45  0.575]
Average cross-validation accuracy: 0.56


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# データの集約
agg_funcs = {
    'X': ['mean', 'std', 'max', 'min'],
    'Y': ['mean', 'std', 'max', 'min'],
    'Z': ['mean', 'std', 'max', 'min']
}
train_agg = train_raw.groupby(['Id', 'Gesture']).agg(agg_funcs).reset_index()
train_agg.columns = ['Id', 'Gesture'] + [f'{axis}_{stat}' for axis in ['X', 'Y', 'Z'] for stat in ['mean', 'std', 'max', 'min']]

# 特徴量とターゲットの分割
X = train_agg.drop(columns=['Id', 'Gesture'])
y = train_agg['Gesture']

# ハイパーパラメータの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# グリッドサーチの設定
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X, y)

# 最適なハイパーパラメータと最良モデルの取得
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# クロスバリデーションで精度評価
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')  # 5分割クロスバリデーション
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation accuracy: {cv_scores.mean():.2f}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Cross-validation scores: [0.65  0.6   0.725 0.7   0.65 ]
Average cross-validation accuracy: 0.66


# ~~Split data into train & test~~

Since I would like to have a few data for test from each labels, I split manually

In [3]:
# def split_data_into_train_and_test(df: pd.DataFrame, test_percentage:int=0.2)->(pd.DataFrame, pd.DataFrame):

#     train_df = df.copy()
#     test_df = pd.DataFrame()

#     # Get test data from each gesture
#     for gesture, group in df.groupby('Gesture'):
   
#         # Get unique Ids for the current gesture
#         unique_ids = group['Id'].unique()
        
#         # Calculate the number of Ids for test data
#         num_test_ids = int(len(unique_ids) * test_percentage)

#         # DEBUG:
#         # print(f"{gesture}:")
#         # print(f"Unique Id count: {unique_ids}")
#         # print(f"{num_test_ids}")
        
#         # Randomly select Ids for test data
#         test_ids = pd.Series(unique_ids).sample(n=num_test_ids, random_state=42).values
        
#         # Select rows for test_df where Id is in test_ids and Gesture matches
#         test_data = group[group['Id'].isin(test_ids)]
        
#         # Append selected test data to test_df
#         test_df = pd.concat([test_df, test_data], ignore_index=True)
        
#         # Drop the test data rows from train_df
#         train_df = train_df.drop(test_data.index)
        
            
#     return train_df, test_df

In [4]:
# train_fft_df, test_fft_df = split_data_into_train_and_test(train_fft)

# Avoid using train_test_split() here, as I'd like to ensure that 20% of the data is taken from each gesture.

# Reshaping 

In [5]:
def reshape_df(df: pd.DataFrame, max_rows: int, features_list: list=['X_fft', 'Y_fft', 'Z_fft'],
               label_col: str = 'Gesture') -> (np.ndarray, np.ndarray):
    """
    Reshape the dataframe into a 3D array with shape (total_samples, rows, features)
    and apply one-hot encoding to the labels.
    """
    # Perform one-hot encoding on the labels
    y_one_hot = pd.get_dummies(df[label_col]).values
    
    # Extract only the 3 desired features (default: X_fft, Y_fft, Z_fft)
    df_features = df[features_list]

    # Reshape the features into 3D (total_samples, rows, features)
    num_samples = len(df_features) // max_rows
    x_train = df_features.values[:num_samples * max_rows].reshape(num_samples, max_rows, -1)
    
    # Take the first label in each chunk of `rows` for the one-hot encoded labels
    y_labels = y_one_hot[::max_rows][:num_samples]
    y_labels = y_labels.astype(int)

    return x_train, y_labels

In [11]:
x_train, y_train_labels = reshape_df(train_fft_df, max_rows=50, label_col='Gesture')
x_test, y_test_labels = reshape_df(test_fft_df, max_rows=50, label_col='Gesture')

print("x_train shape:", x_train.shape)  # Ex: (all samples, data rows, features)
print("y_labels shape:", y_train_labels.shape)  # Ex: (all samples, class nums)

x_train shape: (158, 50, 3)
y_labels shape: (158, 6)


In [10]:
x_train[1]

array([[ 0.63838384,  2.54729012, 11.24875622],
       [ 1.23459413,  7.15121059,  0.63949973],
       [ 8.43683041,  4.77514875,  5.84300863],
       [14.64343271,  3.88728297,  9.97245426],
       [10.68222587,  5.13296409,  8.10417424],
       [ 4.35929634,  4.18627645,  4.18589874],
       [ 4.24279256,  3.93346787,  1.81924112],
       [ 2.50221128,  2.87008585,  2.1545214 ],
       [ 2.19112449,  1.412906  ,  1.30752663],
       [ 2.22878933,  2.64104898,  0.64709503],
       [ 2.63973587,  3.68167986,  2.14911574],
       [ 1.81286264,  3.52139255,  2.66425399],
       [ 0.80022991,  2.46465393,  0.76408093],
       [ 0.20611801,  1.27298195,  2.51047533],
       [ 0.53474318,  1.31164491,  1.43701894],
       [ 0.49243299,  2.02790708,  2.27560711],
       [ 1.49885951,  2.13980402,  2.54953372],
       [ 0.93668305,  0.28058807,  0.50402648],
       [ 0.79941771,  0.36863189,  1.74352285],
       [ 0.96093512,  0.99583692,  1.84161509],
       [ 0.76057072,  1.15053106,  1.705

In [30]:
# test
# 1.reshape
x_train, y_train = reshape_df(train_raw, max_rows=50, features_list=["X", "Y", "Z"], label_col='Gesture')

x_train = x_train.reshape(x_train.shape[0], -1)

# データをトレーニングセットとテストセットに分割
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Random Forest

In [12]:
# convert x_train to 2D (Ex: (158, 50, 3) -> (158, 150))
x_train_flat = x_train.reshape(x_train.shape[0], -1)
x_test_flat = x_test.reshape(x_test.shape[0], -1)

In [None]:
x_train_flat

In [20]:
# ランダムフォレストモデルを初期化
clf = RandomForestClassifier(n_estimators=50, random_state=42)

# モデルをトレーニング
clf.fit(x_train_flat, y_train_labels)

# テストデータで予測
y_pred = clf.predict(x_test_flat)

# 精度を評価
accuracy = accuracy_score(y_test_labels, y_pred)
report = classification_report(y_test_labels, y_pred)

print("テストセットでの精度:", accuracy)
print("分類レポート:\n", report)

テストセットでの精度: 0.1794871794871795
分類レポート:
               precision    recall  f1-score   support

           0       0.35      1.00      0.52         7
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00         6

   micro avg       0.28      0.18      0.22        39
   macro avg       0.06      0.17      0.09        39
weighted avg       0.06      0.18      0.09        39
 samples avg       0.18      0.18      0.18        39



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
