In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
import numpy as np
import pandas as pd
import os

In [2]:
def getFiles(file_list, which, datapath="./../2022"):
    """Append the paths of matching CSV files found under ``datapath`` to ``file_list``.

    A file matches when its name ends with ``.csv``, does not contain the
    text ``"Movement"``, and starts with ``which``.

    Directories and file names are visited in sorted order. ``os.walk`` makes
    no promise about listing order, and the lists built here are later paired
    index-by-index, so a reproducible order matters.

    Args:
        file_list: List that is extended in place with the matching paths.
        which: Prefix the file name must start with (e.g. ``'T'`` or ``'R'``).
        datapath: Root directory that is searched recursively.

    Returns:
        None. The result is delivered by mutating ``file_list``.
    """
    for root, dirs, files in os.walk(datapath):
        # Sorting ``dirs`` in place controls the order os.walk descends in.
        dirs.sort()
        for file in sorted(files):
            is_csv = file.endswith('.csv')
            if is_csv and "Movement" not in file and file.startswith(which):
                file_list.append(os.path.join(root, file))
                
# Collect the CSV paths per sensor: file names starting with 'T' go into the
# Thingy list, file names starting with 'R' into the Respect list.
Thingy_list, Respect_list = [], []

getFiles(Thingy_list, which='T')
getFiles(Respect_list, which='R')

# The two lists are paired index-by-index when the CSVs are loaded, so their
# lengths are printed side by side as a quick sanity check.
print(len(Thingy_list), len(Respect_list))

1014 1014


In [3]:
def _read_prefixed_csv(path, prefix):
    """Read one sensor CSV and prefix the column names both sensors share."""
    shared_columns = ('accel_x', 'accel_y', 'accel_z',
                      'gyro_x', 'gyro_y', 'gyro_z',
                      'activity_code')
    renames = {name: prefix + '_' + name for name in shared_columns}
    return pd.read_csv(path).rename(columns=renames)


def all_csv_to_dataframe(dataframe, thingy_list, respect_list):
    """Load paired Thingy/Respect CSV files and stack them below ``dataframe``.

    The two lists are walked in parallel. A pair is used only when
    characters 25-42 of the Thingy path equal characters 26-43 of the
    Respect path; other pairs are skipped silently.
    NOTE(review): these fixed offsets depend on the exact path layout;
    presumably the compared segment identifies the recording -- confirm.

    For every accepted pair the shared column names are prefixed with ``T_``
    or ``R_`` and the two files are placed side by side (``axis=1``). That
    concat aligns on the row index, so if one file has more rows than the
    other, the shorter side is padded with NaN.

    Args:
        dataframe: Frame the loaded rows are appended to.
        thingy_list: Paths of the Thingy CSV files.
        respect_list: Paths of the Respect CSV files, in matching order.

    Returns:
        A frame holding ``dataframe`` followed by all accepted pairs, with
        each file's own row index kept. ``dataframe`` itself is returned
        when no pair is accepted.
    """
    frames = []
    # zip() stops at the shorter list instead of raising IndexError.
    for thingy_file, respect_file in zip(thingy_list, respect_list):
        if thingy_file[25:43] != respect_file[26:44]:
            continue
        new_thingy = _read_prefixed_csv(thingy_file, 'T')
        new_respect = _read_prefixed_csv(respect_file, 'R')
        frames.append(pd.concat([new_thingy, new_respect], axis=1))

    if not frames:
        return dataframe

    # A completely empty (0 x 0) starting frame contributes nothing, so leave
    # it out rather than let it influence the resulting dtypes.
    if dataframe.shape != (0, 0):
        frames.insert(0, dataframe)

    # One concat at the end avoids re-copying the growing frame per file.
    return pd.concat(frames)

# Start from an empty frame and append every matched Thingy/Respect file pair.
dataframe = all_csv_to_dataframe(pd.DataFrame(), Thingy_list, Respect_list)

In [4]:
# Some rows have no activity code (6873 for Thingy and 664 for Respect when
# this was last run). NOTE(review): presumably these come from the
# column-wise concat of each file pair, which aligns on the row index and
# pads the shorter recording with NaN -- confirm.
print(dataframe['T_activity_code'].isna().sum(), dataframe['R_activity_code'].isna().sum())
# Remove every row that has a NaN in any column, then re-check the counts.
dataframe = dataframe.dropna(axis=0, how='any')
print(dataframe['T_activity_code'].isna().sum(), dataframe['R_activity_code'].isna().sum())

# Activity codes that remain for each sensor.
print(dataframe['T_activity_code'].unique())
print(dataframe['R_activity_code'].unique())

6873 664
0 0
[ 12.  13.  31.   7.   2.   8.   6.  11.   5.   4.   0. 100.   1.]
[ 12.  13.  31.   7.   2.   8.   6.  11.   5.   4.   0. 100.   1.]


In [5]:
# Check that the Thingy activity code equals the Respect activity code on
# every row: an empty index array below means no row disagrees.
not_same = ~(dataframe['T_activity_code'] == dataframe['R_activity_code'])
np.where(not_same)

(array([], dtype=int64),)

In [6]:
def create_dataset(dataframe, data_columns, label_column, time_steps=1, step=1):
    """Cut ``dataframe`` into fixed-length sliding windows with one label each.

    Windows are taken over consecutive rows of ``dataframe`` in their stored
    order, starting every ``step`` rows. Each window's label is the most
    frequent value of ``label_column`` inside it; on a tie the smallest value
    wins. The window that ends exactly on the last row is included.

    NOTE(review): rows are windowed purely by position, so if the frame holds
    several recordings stacked one after another, a window can span two of
    them.

    Args:
        dataframe: Source frame containing the feature and label columns.
        data_columns: Column name(s) used as window features.
        label_column: Label column, as a name or a one-element list.
        time_steps: Number of rows per window (>= 1).
        step: Number of rows between the starts of consecutive windows (>= 1).

    Returns:
        ``(X, Y)``. With a list of feature columns, ``X`` has shape
        ``(n_windows, time_steps, n_features)``. ``Y`` has shape
        ``(n_windows, 1)``. ``n_windows`` is 0 when the frame has fewer than
        ``time_steps`` rows.

    Raises:
        ValueError: If ``time_steps`` or ``step`` is smaller than 1.
    """
    if time_steps < 1 or step < 1:
        raise ValueError("time_steps and step must be positive integers")

    # Convert once up front; slicing NumPy arrays in the loop is much cheaper
    # than calling .iloc on the frame for every window.
    features = dataframe[data_columns].to_numpy()
    labels = dataframe[label_column].to_numpy()

    XX, YY = [], []
    # "+ 1" so the final full window (ending on the last row) is not dropped.
    for start in range(0, len(features) - time_steps + 1, step):
        stop = start + time_steps
        XX.append(features[start:stop])
        # Mode via value counts: np.unique returns sorted values and argmax
        # picks the first maximum, so ties resolve to the smallest label. This
        # avoids indexing into scipy.stats.mode's result, whose shape differs
        # between SciPy versions.
        values, counts = np.unique(labels[start:stop], return_counts=True)
        YY.append(values[np.argmax(counts)])

    if XX:
        X = np.array(XX)
    else:
        # Keep the window dimensions even when no window fits.
        X = np.empty((0, time_steps) + features.shape[1:], dtype=features.dtype)
    Y = np.array(YY).reshape(-1, 1)
    return X, Y

# Feature columns and label column used for each sensor.
Thingy_columns = [
    'T_accel_x', 'T_accel_y', 'T_accel_z',
    'T_gyro_x', 'T_gyro_y', 'T_gyro_z',
    # The mag_* columns keep their CSV names; the loader does not prefix them.
    'mag_x', 'mag_y', 'mag_z',
]
Thingy_label = ['T_activity_code']

Respect_columns = [
    'R_accel_x', 'R_accel_y', 'R_accel_z',
    'R_gyro_x', 'R_gyro_y', 'R_gyro_z',
]
Respect_label = ['R_activity_code']

In [7]:
# Thingy data only
X_Thingy, y_Thingy = create_dataset(dataframe, Thingy_columns, Thingy_label, 50, 10)
X_train_Thingy, X_test_Thingy, y_train_Thingy, y_test_Thingy = train_test_split(X_Thingy, y_Thingy, random_state=111)

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoder = encoder.fit(y_train_Thingy)
y_train_Thingy = encoder.transform(y_train_Thingy)
y_test_Thingy  = encoder.transform(y_test_Thingy)
print(X_train_Thingy.shape, X_test_Thingy.shape, y_train_Thingy.shape, y_test_Thingy.shape)

(13747, 50, 9) (4583, 50, 9) (13747, 13) (4583, 13)


In [8]:
# Respect data only
X_Respect, y_Respect = create_dataset(dataframe, Respect_columns, Respect_label, 50, 10)
X_train_Respect, X_test_Respect, y_train_Respect, y_test_Respect = train_test_split(X_Respect, y_Respect, random_state=111)

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoder = encoder.fit(y_train_Respect)
y_train_Respect = encoder.transform(y_train_Respect)
y_test_Respect  = encoder.transform(y_test_Respect)
print(X_train_Respect.shape, X_test_Respect.shape, y_train_Respect.shape, y_test_Respect.shape)

(13747, 50, 6) (4583, 50, 6) (13747, 13) (4583, 13)


In [9]:
# All data
X, y = create_dataset(dataframe, Thingy_columns + Respect_columns, Thingy_label, 50, 10)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111)

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoder = encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_test  = encoder.transform(y_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(13747, 50, 15) (4583, 50, 15) (13747, 13) (4583, 13)
