# Data Prep

In [1]:
# !pip install pandas
import pandas as pd
import numpy as np

## Function for data adjustments

In [4]:
def add_time(dataframe, step=3):
    """Add an integer ``timestamp`` column that counts up in fixed steps.

    The row at position ``k`` receives ``k * step``, so the first row is
    always 0. The frame is modified in place and also returned. An existing
    ``timestamp`` column is overwritten.

    Args:
        dataframe: pandas DataFrame to annotate. May be empty.
        step: Increment between consecutive rows. Defaults to 3; the unit is
            not stated anywhere in this file.

    Returns:
        The same DataFrame object with ``timestamp`` stored as integers.
    """
    # One vectorised assignment instead of a Python-level loop over the rows:
    # it is positional (independent of the index labels) and also creates the
    # column when the frame has no rows.
    dataframe["timestamp"] = (np.arange(len(dataframe)) * step).astype(int)
    return dataframe




def _prep_ecg_series(core, metric):
    """Load one single-column ECG metric file, timestamp it, tag it and export it.

    Reads ``<core>/<core>_ECG_<metric>_rolling_aggregated_smooth.csv``, names
    its only column ``metric.upper()``, adds the ``timestamp`` column via
    ``add_time``, tags the rows with ``type = metric`` and the session name,
    and writes ``final_stream_v2/<core>_ecg_<metric>_final.csv``.
    """
    file_in = core + "/" + core + "_ECG_" + metric + "_rolling_aggregated_smooth.csv"
    frame = pd.read_csv(file_in, header=None)

    frame.columns = [metric.upper()]
    frame = add_time(frame)
    frame.insert(2, 'type', metric)
    frame["Session"] = core

    file_out = "final_stream_v2/" + core + "_ecg_" + metric + "_final.csv"
    frame.to_csv(file_out, header=True)
    return frame


def data_prep(core):
    """Prepare all sensor streams of one session and export them as CSV files.

    Input files are read from the folder ``<core>/`` (all without a header
    row) and the results are written to ``final_stream_v2/``, which must
    already exist. Every exported frame gets a ``type`` label and a
    ``Session`` column holding ``core``.

    Exports, all prefixed ``final_stream_v2/<core>``:
        * ``_imu_final.csv``: accelerometer axes plus their product.
        * ``_gps_final.csv``: raw GPS plus ``timestamp`` = ``timestamp_ms / 1000``.
        * ``_ecg_bpm_final.csv``, ``_ecg_br_final.csv``, ``_ecg_ibi_final.csv``:
          single-metric ECG series with a generated ``timestamp``.
        * ``_ecg_hrv_bpm_final.csv``: HRV and BPM side by side plus a centred
          10-row rolling mean of HRV.
        * ``_gps_merged_final.csv``: derived GPS series (time difference,
          speed, gradients, distance, altitude change) with a cumulative
          ``timestamp``, a rolling speed mean and a ``not_moving`` flag.

    Args:
        core: Session name; used as the input folder, as the prefix of every
            file name and as the value of the ``Session`` column.

    Returns:
        Tuple ``(data_imu, data_gps, data_gps_merged, data_ecg_bpm,
        data_ecg_br, data_ecg_hrv, data_ecg_ibi)``. ``data_ecg_hrv`` is the
        HRV file exactly as read; the merged HRV/BPM frame is only written
        to disk.

    Raises:
        IndexError: if the raw GPS data never has a non-zero latitude and
            longitude, or the distance series is zero everywhere.
    """
    src = core + "/" + core           # common prefix of all input files
    dst = "final_stream_v2/" + core   # common prefix of all output files

    ## IMU DATASET
    data_imu = pd.read_csv(src + "_IMU.csv", header=None)

    data_imu.columns = ['timestamp', 'AccelerometerX', 'AccelerometerY', 'AccelerometerZ']
    data_imu['multiplied'] = (data_imu['AccelerometerX']
                              * data_imu['AccelerometerY']
                              * data_imu['AccelerometerZ'])
    data_imu.insert(5, 'type', 'imu')
    data_imu["Session"] = core

    data_imu.to_csv(dst + "_imu_final.csv", header=True)

    ## GPS RAW DATASET
    data_gps = pd.read_csv(src + "_GPS.csv", header=None)

    data_gps.columns = ['timestamp_ms', 'Latitude', 'Longitude', 'Altitude']
    data_gps.insert(1, 'timestamp', data_gps['timestamp_ms'] / 1000)
    data_gps.insert(5, 'type', 'gps_raw')
    data_gps["Session"] = core

    # Timestamp of the first row whose latitude and longitude are both
    # non-zero; used below as the starting offset of the processed GPS series.
    has_fix = (data_gps['Latitude'] != 0.0) & (data_gps['Longitude'] != 0.0)
    timestamp_of_first_gps_signal = data_gps.loc[has_fix, 'timestamp'].iloc[0]

    data_gps.to_csv(dst + "_gps_final.csv", header=True)

    ## ECG_BPM DATASET
    data_ecg_bpm = _prep_ecg_series(core, 'bpm')

    ## ECG_HRV DATASET (merged with BPM)
    data_ecg_hrv = pd.read_csv(src + "_ECG_hrv_rolling_aggregated_smooth.csv", header=None)

    # Pair HRV with the BPM values loaded above. Previously the HRV file was
    # read a second time here, so the exported "BPM" column duplicated HRV
    # and the prepared BPM frame in the return value was overwritten.
    data_ecg_hrv_bpm = pd.concat([data_ecg_hrv, data_ecg_bpm[['BPM']]], axis=1)
    data_ecg_hrv_bpm.columns = ['HRV', 'BPM']
    data_ecg_hrv_bpm = add_time(data_ecg_hrv_bpm)
    data_ecg_hrv_bpm["hrv_rolling_mean"] = data_ecg_hrv_bpm["HRV"].rolling(10, center=True).mean()

    data_ecg_hrv_bpm.insert(4, 'type', 'hrv_bpm')
    data_ecg_hrv_bpm["Session"] = core

    data_ecg_hrv_bpm.to_csv(dst + "_ecg_hrv_bpm_final.csv", header=True)

    ## ECG_BR DATASET
    data_ecg_br = _prep_ecg_series(core, 'br')

    ## ECG_IBI DATASET
    data_ecg_ibi = _prep_ecg_series(core, 'ibi')

    ## GPS DATASETS TO MERGE
    data_altitude = pd.read_csv(src + "_GPS_altitude_change.csv", header=None)
    data_distance = pd.read_csv(src + "_GPS_distance.csv", header=None)
    data_gradients = pd.read_csv(src + "_GPS_gradients.csv", header=None)
    data_speed = pd.read_csv(src + "_GPS_smooth_speed.csv", header=None)
    data_time = pd.read_csv(src + "_GPS_time_accumulation.csv", header=None)

    data_gps_merged = pd.concat(
        [data_time, data_speed, data_gradients, data_distance, data_altitude], axis=1)
    data_gps_merged.columns = ['time difference', 'speed', 'gradients',
                               'distance', 'altitude_change']

    data_gps_merged['distance'] = pd.to_numeric(data_gps_merged['distance'])
    # Working copies: the underscore columns are adjusted to build the
    # timestamp while the original columns are exported unchanged.
    data_gps_merged['_time difference'] = data_gps_merged['time difference']
    data_gps_merged['_distance'] = data_gps_merged['distance']

    first_index_distance_not_zero = data_gps_merged[data_gps_merged['distance'] != 0].index[0]

    # The row just before the first movement carries the time of the first GPS
    # signal, so the cumulative sum below starts at that offset. Its _distance
    # is set to a non-zero marker (99) so the zeroing step keeps it.
    # NOTE(review): if the very first row already has a non-zero distance this
    # addresses label -1, which .loc appends as a new row - confirm that
    # cannot happen for real sessions.
    data_gps_merged.loc[first_index_distance_not_zero - 1, '_time difference'] = timestamp_of_first_gps_signal
    data_gps_merged.loc[first_index_distance_not_zero - 1, '_distance'] = 99

    # Mark the last row the same way. Written as one positional assignment on
    # the frame itself: `df.iloc[-1]._distance = 99` assigns to a temporary
    # row object and is not guaranteed to reach the DataFrame.
    data_gps_merged.iloc[-1, data_gps_merged.columns.get_loc('_distance')] = 99

    # Rows without movement contribute no time to the cumulative timestamp.
    data_gps_merged.loc[data_gps_merged['_distance'] == 0, '_time difference'] = 0

    data_gps_merged.insert(1, 'timestamp', data_gps_merged['_time difference'].cumsum())

    data_gps_merged.insert(8, 'type', 'gps_processed')
    data_gps_merged["Session"] = core

    # Centred 10-row rolling mean of the speed; the window edges yield NaN
    # and are replaced by 0.
    data_gps_merged["speed_rolling_mean"] = (
        data_gps_merged["speed"].rolling(10, center=True).mean().fillna(0))

    # 1 where the smoothed speed is below 1.5, otherwise 0.
    data_gps_merged["not_moving"] = np.where(data_gps_merged["speed_rolling_mean"] < 1.5, 1, 0)

    # Sanity output: the accumulated time and the last calculated timestamp
    # can be compared with the last timestamp of the raw GPS data.
    sum_ac = data_gps_merged.loc[data_gps_merged['_distance'] != 0, '_time difference'].sum()
    print("sum of timedistance: ", sum_ac)

    print("last timestamp gps calculated: ", data_gps_merged["timestamp"].max())
    print("last timestamp gps original: ", data_gps["timestamp"].max())

    data_gps_merged.to_csv(dst + "_gps_merged_final.csv", header=True)

    print(core +" done")

    return data_imu, data_gps, data_gps_merged, data_ecg_bpm, data_ecg_br, data_ecg_hrv, data_ecg_ibi

## Run for all cores

In [5]:
# Biathlon sessions to process (Core_315_1573, Core_315_1884, Core_315_284,
# Core_315_999), listed in the order they are run.
cores = [
    'Core_315_284',
    'Core_315_999',
    'Core_315_1573',
    'Core_315_1884',
]

# Prepare and export the streams of every session, one after another.
for core in cores:
    data_prep(core)

sum of timedistance:  7307.695
last timestamp gps calculated:  7307.695000000056
last timestamp gps original:  7307.695
Core_315_284 done
sum of timedistance:  8172.865
last timestamp gps calculated:  8172.8649999998515
last timestamp gps original:  8172.865
Core_315_999 done
sum of timedistance:  1840.9449999999997
last timestamp gps calculated:  1840.9450000000336
last timestamp gps original:  1840.945
Core_315_1573 done
sum of timedistance:  3570.8149999999996
last timestamp gps calculated:  3570.814999999978
last timestamp gps original:  3570.815
Core_315_1884 done


In [9]:
# Process one further session on its own. The call is the last expression of
# the cell, so the returned tuple of DataFrames is displayed.
core = "Core_315_185"
data_prep(core)


sum of timedistance:  7114.105
last timestamp gps calculated:  7114.105000000038
last timestamp gps original:  7114.105
Core_315_185 done


(         timestamp  AccelerometerX  AccelerometerY  AccelerometerZ  \
 0          10177.0       -0.519043       -0.139648        0.359375   
 1          10177.0       -0.833008       -0.007812        0.459473   
 2          10177.0       -0.966797        0.061035        0.463867   
 3          10178.0       -1.077148        0.116699        0.436523   
 4          10178.0       -1.034668        0.119629        0.439941   
 ...            ...             ...             ...             ...   
 1617716  7114265.0       -0.747559        0.056641        0.630371   
 1617717  7114270.0       -0.771973        0.087891        0.630859   
 1617718  7114274.0       -0.792969        0.092773        0.652344   
 1617719  7114278.0       -0.780762        0.096191        0.626465   
 1617720  7114283.0       -0.794922        0.122070        0.638672   
 
          multiplied type       Session  
 0          0.026049  imu  Core_315_185  
 1          0.002990  imu  Core_315_185  
 2         -0.027372