In [1]:
import pandas as pd
import os

In [3]:
walking_paths = [
    'archive/Walking-2023-09-16_18-14-40',
    'archive/Walking-2023-09-14_21-51-59'
]

sitting_paths = [
    'archive/Sitting-2023-10-18_09-05-37',
    'archive/Sitting-2023-09-14_09-11-15',
    'archive/Sitting-2023-09-14_08-37-45'
]

cycling_paths = [
    'archive/Cycling-2023-10-18_06-51-26',
    'archive/Cycling-2023-10-18_06-36-17',
    'archive/Cycling-2023-09-16_09-25-09',
    'archive/Cycling-2023-09-16_07-43-07',
    'archive/Cycling-2023-09-14_06-47-00',
    'archive/Cycling-2023-09-14_06-33-47',
    'archive/Cycling-2023-09-14_06-22-31'
]

In [8]:
def merge_data_by_filename(file_name, paths):
    frames = []
    for path in paths:
        data_frames = []
        for root, _, files in os.walk(path):
            for file in files:
                if file == file_name:
                    df = pd.read_csv(os.path.join(root, file))
                    data_frames.append(df)
        if data_frames:
            frames.append(pd.concat(data_frames))
    if frames:
        return pd.concat(frames)
    else:
        return None

In [9]:
paths_combined=walking_paths + sitting_paths + cycling_paths
orientation = merge_data_by_filename('Orientation.csv', paths_combined)
location=merge_data_by_filename('Location.csv', paths_combined)
total_acceleration=merge_data_by_filename('TotalAcceleration.csv', paths_combined)
magnetometer=merge_data_by_filename('Magnetometer.csv',paths_combined)
accelerometer=merge_data_by_filename('Accelerometer.csv',paths_combined)
location_gps=merge_data_by_filename('LocationGps.csv',paths_combined)
gyroscope=merge_data_by_filename('Gyroscope.csv',paths_combined)
pedometer=merge_data_by_filename('Pedometer.csv',paths_combined)
location_network=merge_data_by_filename('LocationNetwork.csv',paths_combined)
gravity=merge_data_by_filename('Gravity.csv',paths_combined)

In [10]:
gravity.head()

Unnamed: 0,time,seconds_elapsed,z,y,x
0,1694888080184831500,0.129832,9.015759,3.805449,0.636441
1,1694888080187351000,0.132351,9.012895,3.81193,0.638207
2,1694888080189870300,0.13487,9.009681,3.819072,0.640881
3,1694888080192389600,0.13739,9.00612,3.826871,0.644398
4,1694888080194909200,0.139909,9.002189,3.835406,0.648565


In [11]:
data_frames = [orientation, location, total_acceleration, magnetometer,
               accelerometer, location_gps, gyroscope, pedometer, location_network, gravity]

parameter_names = ['orientation', 'location', 'total_acceleration', 'magnetometer',
                   'accelerometer', 'location_gps', 'gyroscope', 'pedometer', 'location_network', 'gravity']

data_dict = {name: df for name, df in zip(parameter_names, data_frames)}

In [12]:
# Info about data and searching for missing values
for parameter, parameter_name in zip(data_frames, parameter_names):
    print("Info for parameter:", parameter_name)
    print(parameter.info())
    print("\nMissing values:")
    print(parameter.isnull().sum())
    print("\n")

Info for parameter: orientation
<class 'pandas.core.frame.DataFrame'>
Index: 2775510 entries, 0 to 71460
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   time             int64  
 1   seconds_elapsed  float64
 2   qz               float64
 3   qy               float64
 4   qx               float64
 5   qw               float64
 6   roll             float64
 7   pitch            float64
 8   yaw              float64
dtypes: float64(8), int64(1)
memory usage: 211.8 MB
None

Missing values:
time               0
seconds_elapsed    0
qz                 0
qy                 0
qx                 0
qw                 0
roll               0
pitch              2
yaw                0
dtype: int64


Info for parameter: location
<class 'pandas.core.frame.DataFrame'>
Index: 8980 entries, 0 to 229
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   time                89

In [13]:
for parameter, parameter_name in zip(data_frames, parameter_names):
    print(parameter_name)
    print(parameter.head())

orientation
                  time  seconds_elapsed        qz        qy        qx  \
0  1694888080184831500         0.129832 -0.495947 -0.129112  0.153800   
1  1694888080187351000         0.132351 -0.496316 -0.129446  0.153994   
2  1694888080189870300         0.134870 -0.496670 -0.129838  0.154196   
3  1694888080192389600         0.137390 -0.497005 -0.130284  0.154409   
4  1694888080194909200         0.139909 -0.497314 -0.130776  0.154643   

         qw      roll     pitch       yaw  
0  0.844815 -0.071233 -0.398385  1.047297  
1  0.844512 -0.071450 -0.399103  1.048190  
2  0.844206 -0.071771 -0.399894  1.049035  
3  0.843902 -0.072187 -0.400758  1.049822  
4  0.843601 -0.072679 -0.401704  1.050542  
location
                  time  seconds_elapsed  bearingAccuracy  speedAccuracy  \
0  1694888080756000000            0.701       179.899994            4.5   
1  1694888081256000000            1.201       126.300003            8.4   
2  1694888081358000000            1.303         0.0

In [28]:
all_times=pd.concat([df['time'] for df in data_frames]).unique()
merged_df=pd.DataFrame({'time':all_times})

In [29]:
for name, df in data_dict.items():
    suffixed_columns={col: f'{col}_{name}' for col in df.columns if col!='time'}
    df_renamed=df.rename(columns=suffixed_columns)
    merged_df=pd.merge(merged_df, df_renamed, on='time',how='left')
merged_df.head() 

Unnamed: 0,time,seconds_elapsed_orientation,qz_orientation,qy_orientation,qx_orientation,qw_orientation,roll_orientation,pitch_orientation,yaw_orientation,seconds_elapsed_location,...,horizontalAccuracy_location_network,speed_location_network,bearing_location_network,altitude_location_network,longitude_location_network,latitude_location_network,seconds_elapsed_gravity,z_gravity,y_gravity,x_gravity
0,1694888080184831500,0.129832,-0.495947,-0.129112,0.1538,0.844815,-0.071233,-0.398385,1.047297,,...,,,,,,,0.129832,9.015759,3.805449,0.636441
1,1694888080187351000,0.132351,-0.496316,-0.129446,0.153994,0.844512,-0.07145,-0.399103,1.04819,,...,,,,,,,0.132351,9.012895,3.81193,0.638207
2,1694888080189870300,0.13487,-0.49667,-0.129838,0.154196,0.844206,-0.071771,-0.399894,1.049035,,...,,,,,,,0.13487,9.009681,3.819072,0.640881
3,1694888080192389600,0.13739,-0.497005,-0.130284,0.154409,0.843902,-0.072187,-0.400758,1.049822,,...,,,,,,,0.13739,9.00612,3.826871,0.644398
4,1694888080194909200,0.139909,-0.497314,-0.130776,0.154643,0.843601,-0.072679,-0.401704,1.050542,,...,,,,,,,0.139909,9.002189,3.835406,0.648565


In [30]:
merged_df_sorted = merged_df.sort_values(by='time')
merged_df_sorted.reset_index(drop=True, inplace=True)

In [31]:
merged_df_filled = merged_df_sorted.fillna(method='ffill')
merged_df_filled.head()

  merged_df_filled = merged_df_sorted.fillna(method='ffill')


Unnamed: 0,time,seconds_elapsed_orientation,qz_orientation,qy_orientation,qx_orientation,qw_orientation,roll_orientation,pitch_orientation,yaw_orientation,seconds_elapsed_location,...,horizontalAccuracy_location_network,speed_location_network,bearing_location_network,altitude_location_network,longitude_location_network,latitude_location_network,seconds_elapsed_gravity,z_gravity,y_gravity,x_gravity
0,1694672543371911400,,,,,,,,,,...,,,,,,,,,,
1,1694672551540493300,,,,,,,,,,...,,,,,,,,,,
2,1694672551553178000,,,,,,,,,,...,,,,,,,,,,
3,1694672551555605000,,,,,,,,,,...,,,,,,,,,,
4,1694672551570763500,,,,,,,,,,...,,,,,,,,,,


In [32]:
for column in merged_df_filled.columns:
    first_valid_index = merged_df_filled[column].first_valid_index()
    print(f"The first non-empty record index for column '{column}': {first_valid_index}")

The first non-empty record index for column 'time': 0
The first non-empty record index for column 'seconds_elapsed_orientation': 6
The first non-empty record index for column 'qz_orientation': 6
The first non-empty record index for column 'qy_orientation': 6
The first non-empty record index for column 'qx_orientation': 6
The first non-empty record index for column 'qw_orientation': 6
The first non-empty record index for column 'roll_orientation': 6
The first non-empty record index for column 'pitch_orientation': 6
The first non-empty record index for column 'yaw_orientation': 6
The first non-empty record index for column 'seconds_elapsed_location': 598
The first non-empty record index for column 'bearingAccuracy_location': 598
The first non-empty record index for column 'speedAccuracy_location': 598
The first non-empty record index for column 'verticalAccuracy_location': 598
The first non-empty record index for column 'horizontalAccuracy_location': 598
The first non-empty record index 