In [2]:
import pandas as pd
import os

In [3]:
observables = ['acceleration16', 'acceleration6', 'gyroscope', 'magnetometer']
sensor = ['temperature'] + [f'{obs}_{axis}' for obs in observables for axis in ['x', 'y', 'z']] + \
    [f'orientation_{n}' for n in range(4)]
columns_names = ['timestamp', 'activityID', 'heart rate'] + \
    [f'{part}_{item}' for part in ['hand', 'chest', 'ankle'] for item in sensor]

In [4]:
# we will drop all the orientation variables, check out dataset_readme for further explainations
# timestamp does not have informational value as well
cols_to_drop = [col for col in columns_names if 'orientation' in col] + ['timestamp']

In [5]:
data_dir = '/media/arrteom/3923b309-fef1-47f0-a74c-cd259de5b45b/PAMAP2_Dataset/PAMAP2_Dataset/Protocol'
os.path.exists(data_dir)

True

In [6]:
df_init = None
for file_name in os.listdir(data_dir):
    df = pd.read_csv(os.path.join(data_dir, file_name), sep=' ', header=None, names=columns_names)
    df.drop(columns=cols_to_drop, inplace=True)

    # zeros should be ignored as they are not a distinguishable actions group (dataset_readme.pdf)
    df = df[df['activityID'] != 0]
    
    print(f'file {file_name}, {df.shape = }')

    if df_init is None:
        df_init = df
    else:
        df_init = pd.concat((df_init, df))

print(f'{df_init.shape = }')

file subject106.dat, df.shape = (250096, 41)
file subject104.dat, df.shape = (231421, 41)
file subject102.dat, df.shape = (263349, 41)
file subject105.dat, df.shape = (272442, 41)
file subject103.dat, df.shape = (174338, 41)
file subject107.dat, df.shape = (232776, 41)
file subject109.dat, df.shape = (6391, 41)
file subject101.dat, df.shape = (249957, 41)
file subject108.dat, df.shape = (262102, 41)
df_init.shape = (1942872, 41)


In [8]:
df_init.activityID.value_counts()

activityID
4     238761
17    238690
1     192523
3     189931
7     188107
2     185188
16    175353
6     164600
12    117216
13    104944
5      98199
24     49360
Name: count, dtype: int64

In [9]:
df_init.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1942872 entries, 9588 to 388252
Data columns (total 41 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   activityID              int64  
 1   heart rate              float64
 2   hand_temperature        float64
 3   hand_acceleration16_x   float64
 4   hand_acceleration16_y   float64
 5   hand_acceleration16_z   float64
 6   hand_acceleration6_x    float64
 7   hand_acceleration6_y    float64
 8   hand_acceleration6_z    float64
 9   hand_gyroscope_x        float64
 10  hand_gyroscope_y        float64
 11  hand_gyroscope_z        float64
 12  hand_magnetometer_x     float64
 13  hand_magnetometer_y     float64
 14  hand_magnetometer_z     float64
 15  chest_temperature       float64
 16  chest_acceleration16_x  float64
 17  chest_acceleration16_y  float64
 18  chest_acceleration16_z  float64
 19  chest_acceleration6_x   float64
 20  chest_acceleration6_y   float64
 21  chest_acceleration6_z   float64
 2

In [10]:
os.listdir('/media/arrteom/3923b309-fef1-47f0-a74c-cd259de5b45b/PAMAP2_Dataset/PAMAP2_Dataset/')

['PerformedActivitiesSummary.pdf',
 'readme.pdf',
 'subjectInformation.pdf',
 'Optional',
 'DataCollectionProtocol.pdf',
 'DescriptionOfActivities.pdf',
 'Protocol']

In [11]:
other_dir = '/media/arrteom/3923b309-fef1-47f0-a74c-cd259de5b45b/PAMAP2_Dataset/PAMAP2_Dataset/Optional'

In [13]:
df_opt = None
for file_name in os.listdir(other_dir):
    df = pd.read_csv(os.path.join(other_dir, file_name), sep=' ', header=None, names=columns_names)
    df.drop(columns=cols_to_drop, inplace=True)

    # zeros should be ignored as they are not a distinguishable actions group (dataset_readme.pdf)
    df = df[df['activityID'] != 0]
    
    print(f'file {file_name}, {df.shape = }')

    if df_opt is None:
        df_opt = df
    else:
        df_opt = pd.concat((df_opt, df))

print(f'{df_opt.shape = }')

file subject106.dat, df.shape = (112277, 41)
file subject105.dat, df.shape = (139371, 41)
file subject109.dat, df.shape = (158873, 41)
file subject101.dat, df.shape = (219368, 41)
file subject108.dat, df.shape = (152192, 41)
df_opt.shape = (782081, 41)


In [14]:
df_opt.activityID.value_counts()

activityID
10    309935
19    187188
18     99878
9      83646
11     54519
20     46915
Name: count, dtype: int64

In [15]:
df_opt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 782081 entries, 4268 to 179651
Data columns (total 41 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   activityID              782081 non-null  int64  
 1   heart rate              71419 non-null   float64
 2   hand_temperature        781337 non-null  float64
 3   hand_acceleration16_x   781337 non-null  float64
 4   hand_acceleration16_y   781337 non-null  float64
 5   hand_acceleration16_z   781337 non-null  float64
 6   hand_acceleration6_x    781337 non-null  float64
 7   hand_acceleration6_y    781337 non-null  float64
 8   hand_acceleration6_z    781337 non-null  float64
 9   hand_gyroscope_x        781337 non-null  float64
 10  hand_gyroscope_y        781337 non-null  float64
 11  hand_gyroscope_z        781337 non-null  float64
 12  hand_magnetometer_x     781337 non-null  float64
 13  hand_magnetometer_y     781337 non-null  float64
 14  hand_magnetometer_z   

In [16]:
df_full = pd.concat((df_init, df_opt))
df_full.shape

(2724953, 41)

In [4]:
out_dir = '/media/arrteom/3923b309-fef1-47f0-a74c-cd259de5b45b/PAMAP2_Dataset/PAMAP2_Dataset/'
out_name = 'pamap_data_filtered.csv'

In [19]:
df_full.to_csv(os.path.join(out_dir, out_name), index=False)

In [5]:
df_full = pd.read_csv(os.path.join(out_dir, out_name))

In [7]:
df_small = df_full[df_full['heart rate'].notna()]

In [14]:
counts = pd.DataFrame({'small': df_small.activityID.value_counts(), 'full': df_full.activityID.value_counts()})
counts['diff'] = counts.small / counts.full
counts  # видно, что довольно равномерно насемплили 9% в каждый класс

Unnamed: 0_level_0,small,full,diff
activityID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,28290,309935,0.091277
4,21786,238761,0.091246
17,21770,238690,0.091206
1,17600,192523,0.091418
3,17350,189931,0.091349
7,17198,188107,0.091427
19,17084,187188,0.091267
2,16929,185188,0.091415
16,16028,175353,0.091404
6,15044,164600,0.091397


In [10]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248827 entries, 6 to 2724946
Data columns (total 41 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   activityID              248827 non-null  int64  
 1   heart rate              248827 non-null  float64
 2   hand_temperature        247765 non-null  float64
 3   hand_acceleration16_x   247765 non-null  float64
 4   hand_acceleration16_y   247765 non-null  float64
 5   hand_acceleration16_z   247765 non-null  float64
 6   hand_acceleration6_x    247765 non-null  float64
 7   hand_acceleration6_y    247765 non-null  float64
 8   hand_acceleration6_z    247765 non-null  float64
 9   hand_gyroscope_x        247765 non-null  float64
 10  hand_gyroscope_y        247765 non-null  float64
 11  hand_gyroscope_z        247765 non-null  float64
 12  hand_magnetometer_x     247765 non-null  float64
 13  hand_magnetometer_y     247765 non-null  float64
 14  hand_magnetometer_z     

In [11]:
df_small.to_csv(os.path.join(out_dir, 'pamap_small.csv'), index=False)