In [1]:
import pandas as pd
import os
from tqdm.notebook import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
file = open('MobiAct_Dataset_v2.0/Readme.txt', 'r', encoding='latin1')
strings = file.readlines()
file.close()
for s in strings:
    print(s)



The NEW version of the MobiAct dataset includes:

 	Four different types of falls performed by 66 participants

 	Eleven different types of ADLs performed by 19 participants and nine types of ADLs performed by 59 participants (plus one activity "LYI" which results from the inactivity period after a fall by 66 participants)

 	Five sub-scenarios which construct one scenario of daily living, which consists of a sequence of 50 activities and performed by 19 participants.



The new released version of the MobiAct dataset includes:

  The raw recorded data in txt format, separated by each activity

  The annotated data in csv format, separated by each activity





Filename format:

<ADL OR FALL OR SCENARIO_CODE>_<SENSOR_CODE>_<SUBJECT_ID>_<TRIAL_NO>.txt



examples:

1 -->	WAL_acc_5_1.txt

2 -->	STD_ori_9_5.txt

3 -->	FKL_gyro_3_2.txt

4 -->	SRH_acc_1_1.txt





Subjects:

+------+---------+-----------+-------+----------+----------+----------+

|  ID  |  Name   |  Surname  |  Age  

In [3]:
person_list = []
for s in strings:
    if 'sub' in s and '|' in s:
        temp = s.split('|')
        temp = [x.strip() for x in temp]
        if len(temp) == 9:
            person_list.append(temp[3:-1])

activity_list = []
for s in strings:
    if '|' in s:
        temp = s.split('|')
        temp = [x.strip() for x in temp]
        if len(temp) == 8:
            activity_list.append(temp[1:-1])
activity_list

[['No.', 'Label', 'Activity', 'Trials', 'Duration', 'Description'],
 ['1', 'STD', 'Standing', '1', '5min', 'Standing with subtle movements'],
 ['2', 'WAL', 'Walking', '1', '5min', 'Normal walking'],
 ['3', 'JOG', 'Jogging', '3', '30s', 'Jogging'],
 ['4', 'JUM', 'Jumping', '3', '30s', 'Continuous jumping'],
 ['5', 'STU', 'Stairs up', '6', '10s', 'Stairs up (10 stairs)'],
 ['6', 'STN', 'Stairs down', '6', '10s', 'Stairs down (10 stairs)'],
 ['7',
  'SCH',
  'Stand to sit(sit on chair)',
  '6',
  '6s',
  'Transition from standing to sitting'],
 ['8',
  'SIT',
  'Sitting on chair',
  '1',
  '1min',
  'Sitting on a chair with subtle movements'],
 ['9',
  'CHU',
  'Sit to stand(chair up)',
  '6',
  '6s',
  'Transition from sitting to standing'],
 ['10', 'CSI', 'Car-step in', '6', '6s', 'Step in a car'],
 ['11', 'CSO', 'Car-step out', '6', '6s', 'Step out a car'],
 ['12',
  'LYI',
  'Lying',
  '12',
  '-',
  'Activity taken from the lying period after a fall'],
 ['No.', 'Label', 'Activity', '

In [4]:
falls = ['FOL', 'FKL', 'BSC', 'SDL']

columns = ['name', 'age', 'height', 'weight', 'gender']
person_info = pd.DataFrame(person_list, columns=columns)

activity_info = pd.DataFrame(activity_list)
activity_info.columns = activity_info.iloc[0]
activity_info = activity_info.drop(0)
activity_info = activity_info.drop(13)
activity_info = activity_info.reset_index(drop=True)
# print(activity_info.columns)
index = activity_info['No.']
activity_info = activity_info.drop(['No.'], axis=1)
activity_info.index = index
activity_info['label_encoded'] = list(range(len(activity_info)))


activity_info

Unnamed: 0_level_0,Label,Activity,Trials,Duration,Description,label_encoded
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,STD,Standing,1,5min,Standing with subtle movements,0
2,WAL,Walking,1,5min,Normal walking,1
3,JOG,Jogging,3,30s,Jogging,2
4,JUM,Jumping,3,30s,Continuous jumping,3
5,STU,Stairs up,6,10s,Stairs up (10 stairs),4
6,STN,Stairs down,6,10s,Stairs down (10 stairs),5
7,SCH,Stand to sit(sit on chair),6,6s,Transition from standing to sitting,6
8,SIT,Sitting on chair,1,1min,Sitting on a chair with subtle movements,7
9,CHU,Sit to stand(chair up),6,6s,Transition from sitting to standing,8
10,CSI,Car-step in,6,6s,Step in a car,9


In [5]:
activity_info.to_csv('mobiact_preprocessed/activity_info.csv')

In [6]:
data_dir = 'MobiAct_Dataset_v2.0/Annotated Data/'
act_list = os.listdir(data_dir)
print(act_list)

['CHU', 'FOL', 'CSI', '.DS_Store', 'JOG', 'CSO', 'SBE', 'WAL', 'SBW', 'SLW', 'SIT', 'SCH', 'FKL', 'SRH', 'STD', 'BSC', 'JUM', 'SDL', 'SLH', 'STN', 'STU']


In [7]:
valid_labels = set(activity_info['Label'].unique())
valid_labels

{'BSC',
 'CHU',
 'CSI',
 'CSO',
 'FKL',
 'FOL',
 'JOG',
 'JUM',
 'LYI',
 'SCH',
 'SDL',
 'SIT',
 'STD',
 'STN',
 'STU',
 'WAL'}

In [8]:
train = []
valid = []
test = []

for act in tqdm(act_list):
    act_path = os.path.join(data_dir, act)
    if os.path.isdir(act_path):
        file_list = os.listdir(data_dir + act + '/')
        for file in file_list:
            file_names = file.split('_')
            label = file_names[0]

            data = pd.read_csv(data_dir + act + '/' + file)
            person_num = int(file_names[1])

            # Create a dictionary for this row of data
            row_dict = data.to_dict('records')[0]  # Assuming you want the first row of the CSV
            row_dict['label'] = label  # Add label to the dictionary
            row_dict['person_id'] = person_num  # Add person_id to the dictionary

            # Assign the dictionary to the appropriate list based on person_num
            if person_num <= 50:
                train.append(row_dict)
            elif 50 < person_num <= 53:
                valid.append(row_dict)
            else:
                test.append(row_dict)

  0%|          | 0/21 [00:00<?, ?it/s]

In [12]:
# Convert lists of dictionaries to DataFrames
train_df = pd.DataFrame(train)
valid_df = pd.DataFrame(valid)
test_df = pd.DataFrame(test)

# Save DataFrames to CSV
train_df.to_csv('mobiact_preprocessed/train.csv', index=False)
valid_df.to_csv('mobiact_preprocessed/valid.csv', index=False)
test_df.to_csv('mobiact_preprocessed/test.csv', index=False)

train_df

Unnamed: 0,timestamp,rel_time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,person_id
0,4122885402000,0.0,3.238091,8.132329,4.245767,0.049480,-0.007636,-0.013744,273.736700,-67.593300,16.014387,CHU,6
1,12925658495000,0.0,5.644234,2.868331,7.323824,-0.029932,0.016188,0.014661,242.944400,-18.781696,17.408410,CHU,45
2,1339017530000,0.0,-2.509551,-2.413142,8.886849,-0.035125,0.003665,0.009163,113.365820,16.452131,-17.527372,CHU,1
3,12818065468000,0.0,6.633194,2.808050,6.496608,-0.032681,0.008552,0.011606,289.385380,-53.364044,23.002990,CHU,45
4,3819654863000,0.0,4.071569,6.147542,6.234699,0.028711,0.029322,0.067501,305.671360,-38.619170,29.405220,CHU,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2308,132077803519000,0.0,2.767958,9.498256,-0.876147,-0.036041,-0.051007,-0.004887,139.363980,-79.700560,9.542451,STU,23
2309,4711383709000,0.0,0.691561,9.836599,-0.390619,-0.680809,0.287107,0.085521,32.921436,-69.683150,18.517849,STU,7
2310,120363151803000,0.0,6.169228,7.615893,-1.150099,0.036652,-0.073915,0.015882,299.075160,-5.150481,21.856638,STU,24
2311,14123434217000,0.0,-0.732315,9.832258,-0.218083,-0.046120,-0.051007,0.015272,145.627820,-96.355840,-0.153220,STU,47


In [10]:
train

[{'timestamp': 4122885402000,
  'rel_time': 0.0,
  'acc_x': 3.23809065585065,
  'acc_y': 8.13232949215994,
  'acc_z': 4.24576737406899,
  'gyro_x': 0.049480084,
  'gyro_y': -0.0076358155,
  'gyro_z': -0.013744468,
  'azimuth': 273.7367,
  'pitch': -67.5933,
  'roll': 16.014387,
  'label': 'CHU',
  'person_id': 6},
 {'timestamp': 12925658495000,
  'rel_time': 0.0,
  'acc_x': 5.64423377684771,
  'acc_y': 2.86833107852183,
  'acc_z': 7.32382413614308,
  'gyro_x': -0.029932396,
  'gyro_y': 0.016187929,
  'gyro_z': 0.014660766,
  'azimuth': 242.9444,
  'pitch': -18.781696,
  'roll': 17.40841,
  'label': 'CHU',
  'person_id': 45},
 {'timestamp': 1339017530000,
  'rel_time': 0.0,
  'acc_x': -2.50955109935825,
  'acc_y': -2.41314150143763,
  'acc_z': 8.88684919394123,
  'gyro_x': -0.035124753,
  'gyro_y': 0.0036651916,
  'gyro_z': 0.009162978,
  'azimuth': 113.36582,
  'pitch': 16.452131,
  'roll': -17.527372,
  'label': 'CHU',
  'person_id': 1},
 {'timestamp': 12818065468000,
  'rel_time': 0.