In [2]:
import pandas as pd

In [4]:
# Read dataset
df = pd.read_csv('../data/raw/combined_dataset.csv')


In [5]:
df

Unnamed: 0,timestamp,x,y,z,subject,category,activity,sensor,trial
0,2318482693000,0.890643,-9.576807,-0.019154,sub1,FALLS,BSC,acc,1
1,2318492108000,0.919373,-9.595961,-0.057461,sub1,FALLS,BSC,acc,1
2,2318503365000,0.871489,-9.595961,-0.057461,sub1,FALLS,BSC,acc,1
3,2318512831000,0.852336,-9.643845,-0.038307,sub1,FALLS,BSC,acc,1
4,2318523222000,0.871489,-9.634268,-0.057461,sub1,FALLS,BSC,acc,1
...,...,...,...,...,...,...,...,...,...
5804821,759949519000,258.757500,6.500066,-47.713290,sub9,FALLS,SDL,ori,3
5804822,759954457000,258.748470,6.498722,-47.701267,sub9,FALLS,SDL,ori,3
5804823,759959490000,258.736420,6.492827,-47.690780,sub9,FALLS,SDL,ori,3
5804824,759964438000,258.726380,6.492289,-47.682583,sub9,FALLS,SDL,ori,3


In [None]:
# Select only accelerometer and gyrometer
new_df = df[df['sensor'] != 'ori']

df['subject'].nunique()

In [None]:
# Split into Acceleration and Gyroscope data
acc_df = df[df['sensor'] == 'acc'].drop(columns=['sensor'])
gyro_df = df[df['sensor'] == 'gyro'].drop(columns=['sensor'])

In [None]:
# Ensure the timestamp is in datetime format
acc_df['timestamp'] = pd.to_datetime(acc_df['timestamp'], unit = 'ns')

gyro_df['timestamp'] = pd.to_datetime(gyro_df['timestamp'],  unit = 'ns')

In [None]:
# Index the timestamp
acc_df.index = acc_df['timestamp'] 
gyro_df.index = gyro_df['timestamp'] 

In [None]:
# Delete the timestamp columns
del acc_df['timestamp']
del gyro_df['timestamp']

In [None]:
# Merge: Keep all gyro timestamps, add acc data
df_merged = gyro_df.merge(acc_df.iloc[:,:3], how="left", left_index=True, right_index=True)
df_merged.interpolate(method="linear", inplace=True)

Here we have a lot of missing accelerometer data beccause gyrometer is 
measured at a higher rate than accelerometer and the chances of it measuring 
at the same exact ms is low. We pick the gyro data to sync accelerometer because it measures with the highest frequency. The empty accelerometer data will be linearly interpolated

In [None]:
df_merged = df_merged.dropna() # Drop NA

In [None]:
# Rename columns
df_merged.columns = ['gyr_x', 'gyr_y', 'gyr_z', 'subject', 'category', 'activity', 'trial', 'acc_x', 'acc_y', 'acc_z']

# 2.3 million row remain after llinear interppolation


In [None]:
# Creating the subject DataFrame from Mobifall description
subject_data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 29, 30, 31],
    "age": [32, 26, 26, 32, 36, 22, 25, 22, 30, 26, 26, 29, 24, 24, 25, 27, 25, 25, 25, 26, 25, 27, 47, 27],
    "height": [180, 169, 164, 186, 160, 172, 189, 183, 177, 170, 168, 178, 177, 178, 173, 172, 173, 176, 161, 178, 180, 186, 172, 170],
    "weight": [85, 64, 55, 93, 50, 62, 80, 93, 102, 90, 80, 83, 62, 85, 82, 56, 67, 73, 63, 71, 70, 103, 90, 75],
    "gender": ["M", "M", "F", "M", "F", "F", "M", "M", "M", "F", "F", "M", "M", "M", "M", "F", "M", "M", "F", "M", "M", "M", "M", "M"]
}

df = pd.DataFrame(subject_data)

# Modifying the id column to add "sub_" prefix
df["id"] = "sub" + df["id"].astype(str)

df.rename(columns={'id': 'subject'}, inplace = True)

In [None]:
# Add subjects detail
df_merged = df_merged.merge(df, on = 'subject')


In [None]:
# Select only relevant columns
mobifall_df = df_merged[['acc_x','acc_y','acc_z','gyr_x','gyr_y','gyr_z','age','height','weight','gender', 'category']]


In [None]:
# Export to csv
mobifall_df.to_csv('../../data/interim/mobifall_df.csv', index = False)