In [1]:
import pandas as pd
import os
import random
from tqdm import tqdm
import pickle
from sklearn.preprocessing import MinMaxScaler



In [2]:
regenerate_context_data = True
sequence_augmentation = True
whole_session_context = False
model_test_run = False
data_autoencoder = False
pad_to_window_size = True

PATH_TO_LOAD = '../data/04_Merged'
combined_context_path = '../data/05_Interaction_Sequences/context.csv'

window = 100 #seconds

base_path = '../datasets/sequential/'
augmentation_folder = 'aug/' if sequence_augmentation else 'non_aug/'
if model_test_run:
    augmentation_folder = 'test/aug/' if sequence_augmentation else 'test/non_aug/'

sequence_context_path = f'{base_path}{augmentation_folder}parameters/sequence_context.csv'
parameter_path = f'{base_path}{augmentation_folder}parameters'
train_session_path = f'{base_path}{augmentation_folder}parameters/train_sessions.pkl'
test_session_path = f'{base_path}{augmentation_folder}parameters/test_sessions.pkl'
train_dynamic_context_path = f'{base_path}{augmentation_folder}dynamic_context/train.csv'
test_dynamic_context_path = f'{base_path}{augmentation_folder}dynamic_context/test.csv'
train_static_context_path = f'{base_path}{augmentation_folder}static_context/train.csv'
test_static_context_path = f'{base_path}{augmentation_folder}static_context/test.csv'

In [3]:
all_columns = ['index', 'avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
       'month', 'odometer', 'light_sensor_rear', 'light_sensor_front',
       'temperature_in', 'KBI_speed', 'soc', 'ESP_speed', 'latitude',
       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l', 'seatbelt_rear_m',
       'seatbelt_rear_r', 'CHA_ESP_drive_mode', 'CHA_MO_drive_mode',
       'rain_sensor', 'street_category', 'kickdown', 'altitude',
       'driving_program', 'datetime', 'session', 'Label', 'ID',
       'FunctionValue', 'domain', 'BeginTime', 'time_second',
       'distance_driven', 'ts_normalized', 'weekday']

selected = [ 'avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
       'month', 'light_sensor_rear', 'light_sensor_front',
       'temperature_in', 'KBI_speed', 'soc', 'latitude',
       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l',
       'seatbelt_rear_r', 'street_category', 'altitude',
       'datetime', 'session', 'time_second',
       'distance_driven', 'weekday'
]

bad_quality = ['CHA_ESP_drive_mode', 
             'CHA_MO_drive_mode',
             'rain_sensor',
             'kickdown',
             'ESP_speed',
             'seatbelt_rear_m',
            'driving_program',
            'ts_normalized'
             ]

dynamic_context_var = ['avg_irradiation', 'steering_speed', 'temperature_out', 
                       'light_sensor_rear', 'light_sensor_front', 
                       'temperature_in', 'KBI_speed', 'soc', 'latitude',
                       'longitude',  'street_category', 'altitude','time_second',
                       'distance_driven']
static_context_var = ['car_id', 'month', 'weekday', 'hour', 'seatbelt_codriver', 'seatbelt_rear_l',
                       'seatbelt_rear_r',]
#Todo take average of these value over a window
constant_context_var = ['avg_irradiation','temperature_out'] #to be filled

In [4]:
def load_context(vehicle):
    context_lists = dynamic_context_var + static_context_var + ['session', 'datetime', 'Label']
    context_lists.remove('car_id')
    df = pd.read_csv(os.path.join(PATH_TO_LOAD, vehicle + "_merged.csv"), parse_dates=['datetime'], usecols=context_lists, low_memory=False)
    df = df.sort_values(by=['session','datetime'])
    return df

vehicles = ['SEB880','SEB882','SEB883','SEB885','SEB888','SEB889']
context_data_list = []
if regenerate_context_data == True:
    for vehicle in tqdm(vehicles):
        context_curr = load_context(vehicle)
        context_curr['car_id'] = vehicle
        context_data_list.append(context_curr)
    context_data = pd.concat(context_data_list, axis=0)
    context_data.to_csv(combined_context_path)
else:
    context_data = pd.read_csv(combined_context_path, parse_dates=['datetime'], index_col=0)

100%|██████████| 6/6 [00:53<00:00,  8.87s/it]


In [5]:
context_data['Label'] = context_data['Label'].replace('car/driveMode/0', 'car/driveMode/0.0')
context_data['Label'] = context_data['Label'].replace('car/driveMode/2', 'car/driveMode/2.0')
context_data['Label'] = context_data['Label'].replace('car/driveMode/3', 'car/driveMode/3.0')

# context_data['Label'] = context_data['Label'].replace('car/charismaLevel/Abgesenkt', 'car/charismaLevel/change')
# context_data['Label'] = context_data['Label'].replace('car/charismaLevel/Lift', 'car/charismaLevel/change')
# context_data['Label'] = context_data['Label'].replace('car/charismaLevel/Mittel', 'car/charismaLevel/change')
# context_data['Label'] = context_data['Label'].replace('car/charismaLevel/Tief', 'car/charismaLevel/change')

In [6]:
context_data.columns

Index(['avg_irradiation', 'steering_speed', 'temperature_out', 'hour', 'month',
       'light_sensor_rear', 'light_sensor_front', 'temperature_in',
       'KBI_speed', 'soc', 'latitude', 'longitude', 'seatbelt_codriver',
       'seatbelt_rear_l', 'seatbelt_rear_r', 'street_category', 'altitude',
       'datetime', 'session', 'Label', 'time_second', 'distance_driven',
       'weekday', 'car_id'],
      dtype='object')

In [7]:
cleaned_list = [x for x in context_data.Label.unique().tolist() if not pd.isna(x)]
sorted(cleaned_list)

['car/ESS/on',
 'car/Start/ParkAssistant',
 'car/charismaLevel/Abgesenkt',
 'car/charismaLevel/Lift',
 'car/charismaLevel/Mittel',
 'car/charismaLevel/Tief',
 'car/driveMode/0.0',
 'car/driveMode/1.0',
 'car/driveMode/2.0',
 'car/driveMode/3.0',
 'clima/AC/ECO',
 'clima/AC/off',
 'clima/AC/on',
 'media/selectedSource/Bluetooth',
 'media/selectedSource/CarPlay',
 'media/selectedSource/Favorite',
 'media/selectedSource/Radio',
 'navi/Start/Address',
 'navi/Start/Favorite',
 'phone/Call/Favorite',
 'phone/Call/PersonX',
 'phone/Connect/NewDevice',
 'phone/Start/AndroidAuto',
 'phone/Start/CarPlay',
 'phone/goTo/Favorite']

In [None]:
#TODO ess status - Untill the point ess is swtiched on, add a status that ess is off now - 1 on/ 0 off
# current drive mode - 0, 1, 2, 3
# current AC mode - Eco, on, off
# current selected source - bluetooth, carplay, favourite, radio, no source
# navigation guidance activation status - 0 - inactive/ 1 - active
# phone call status - 
# os of the phone connected
# new bluetooth device around status - 1 - yes and 0- no
# phone connected or not -  (needed to make a phone call)


In [9]:
context_data[context_data['Label']=='car/ESS/on']

Unnamed: 0,avg_irradiation,steering_speed,temperature_out,hour,month,light_sensor_rear,light_sensor_front,temperature_in,KBI_speed,soc,...,seatbelt_rear_r,street_category,altitude,datetime,session,Label,time_second,distance_driven,weekday,car_id
146814,0.0,0.0,12.0,20,12,0.0,0.0,21.5,22.037324,71.9,...,1.0,0.0,194.0,2022-12-29 20:23:44,205.0,car/ESS/on,870.0,14728.0,3,SEB880
147029,0.0,0.0,12.0,20,12,0.0,0.0,22.0,0.0,71.6,...,1.0,7.0,198.0,2022-12-29 20:27:19,205.0,car/ESS/on,1085.0,14729.0,3,SEB880
1690173,6.419186,0.0,9.5,10,12,4800.0,1350.0,26.0,0.0,91.35,...,0.0,7.0,474.0,2022-12-22 10:31:00,1659.0,car/ESS/on,9614.0,28353.0,3,SEB883
2279768,0.0,0.0,-3.0,9,2,0.0,72.0,22.5,0.0,69.5,...,0.0,7.0,338.0,2023-02-07 09:05:58,1908.0,car/ESS/on,7219.0,34029.0,1,SEB883
2519286,0.266636,0.0,0.0,13,2,14398.797565,6041.855708,23.181415,0.0,68.087081,...,0.0,0.0,468.0,2023-02-28 13:49:30,1997.0,car/ESS/on,22299.0,36521.0,1,SEB883
348153,60.0,5.029246,14.5,10,10,17735.744925,5208.488347,28.0,51.2,81.5,...,0.0,7.0,694.0,2022-10-21 10:56:37,2587.0,car/ESS/on,18495.0,10472.0,4,SEB885
348164,50.0,0.0,14.5,10,10,18000.0,5890.053078,28.0,58.24,81.5,...,0.0,4.0,688.0,2022-10-21 10:56:48,2587.0,car/ESS/on,18506.0,10472.0,4,SEB885
351581,70.0,0.0,13.0,11,10,22000.0,4650.0,28.0,83.84,56.4,...,0.0,5.0,931.706949,2022-10-21 11:53:44,2587.0,car/ESS/on,21922.0,10565.0,4,SEB885
1165732,10.0,0.0,2.0,17,2,5200.0,2153.365494,6.5,67.203988,94.4,...,0.0,2.0,263.814125,2023-02-26 17:30:43,5256.0,car/ESS/on,270.0,4.0,6,SEB889


In [45]:
## adding the current status of ess as a context variable.
ess_info = []
for session in tqdm(context_data.session.unique().tolist()):
    df = context_data[context_data['session']== session].copy()
    df['ess_status'] = 0
    ess_index = df.index[df['Label'] == 'car/ESS/on']
    for idx in ess_index:
        df.loc[:idx, 'ess_status'] = 0  # Set 'ess_status' to 0 for rows before the 'car/ESS/on' row
        df.loc[idx:, 'ess_status'] = 1
    ess_info.append(df)
data = pd.concat(ess_info, axis=0)

100%|██████████| 1704/1704 [00:19<00:00, 86.12it/s] 


In [46]:
data

Unnamed: 0,avg_irradiation,steering_speed,temperature_out,hour,month,light_sensor_rear,light_sensor_front,temperature_in,KBI_speed,soc,...,street_category,altitude,datetime,session,Label,time_second,distance_driven,weekday,car_id,ess_status
0,0.000000,0.000000,0.0,20,9,0.0,0.000000,0.0,0.00,0.00,...,0.0,0.000000,2022-09-07 20:08:33,7.0,,0.0,0.0,2,SEB880,0
1,0.000000,0.000000,0.0,20,9,0.0,0.000000,0.0,0.00,48.00,...,7.0,0.000000,2022-09-07 20:08:34,7.0,,1.0,10845.0,2,SEB880,0
2,0.000000,0.000000,25.0,20,9,0.0,0.000000,0.0,0.00,48.00,...,7.0,0.000000,2022-09-07 20:08:35,7.0,,2.0,10845.0,2,SEB880,0
3,0.000000,0.000000,25.0,20,9,0.0,0.000000,27.0,0.00,48.00,...,7.0,0.000000,2022-09-07 20:08:36,7.0,,3.0,10845.0,2,SEB880,0
4,0.000000,0.000000,25.0,20,9,0.0,0.000000,27.0,0.00,48.00,...,7.0,0.000000,2022-09-07 20:08:36,7.0,,3.0,10845.0,2,SEB880,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360504,53.755324,0.000000,13.5,17,3,17600.0,6126.000000,28.5,92.48,22.25,...,4.0,474.000000,2023-03-31 17:09:13,5399.0,,5360.0,146.0,4,SEB889,0
1360505,50.000000,0.000000,13.5,17,3,17200.0,5920.781157,28.5,93.12,22.20,...,7.0,474.000000,2023-03-31 17:09:14,5399.0,,5361.0,146.0,4,SEB889,0
1360506,56.226313,15.000000,13.5,17,3,18000.0,5699.908827,28.5,93.44,22.20,...,4.0,474.000000,2023-03-31 17:09:15,5399.0,,5362.0,146.0,4,SEB889,0
1360507,60.000000,0.000000,13.5,17,3,17600.0,5437.218229,28.5,93.76,22.20,...,4.0,472.468149,2023-03-31 17:09:16,5399.0,,5363.0,146.0,4,SEB889,0


In [40]:
data[data['session']==205][['Label', 'session']].dropna()

Unnamed: 0,Label,session
146069,media/selectedSource/Radio,205.0
146188,car/driveMode/1.0,205.0
146604,car/charismaLevel/Lift,205.0
146617,car/charismaLevel/Abgesenkt,205.0
146814,car/ESS/on,205.0
147029,car/ESS/on,205.0
147105,navi/Start/Address,205.0
147337,car/driveMode/0.0,205.0
147375,media/selectedSource/Radio,205.0
147549,media/selectedSource/Bluetooth,205.0


In [36]:
data.session.unique()

array([   7.,   16.,   20., ..., 5388., 5389., 5399.])

In [47]:
test_session = [5399]

drive_mode_info = []
# for session in tqdm(data.session.unique().tolist()):
for session in tqdm(test_session):
    df = context_data[context_data['session']== session].copy()
    df = df.sort_values(by=['datetime'])
    df['current_drive_mode'] = 'car/driveMode/0.0'
    drive_modes = {'car/driveMode/0.0', 'car/driveMode/1.0', 'car/driveMode/2.0', 'car/driveMode/3.0'}

    # Iterate over DataFrame
    for i, row in df.iterrows():
        label = row['Label']
        if label in drive_modes:
            df.at[i, 'current_drive_mode'] = label
        #     # Update subsequent rows until a new value appears
        #     j = i + 1
        #     while j < len(df) and df.at[j, 'Label'] in drive_modes:
        #         df.at[j, 'current_drive_mode'] = label
        #         j += 1
        # else:
        #     # For rows before this row, set 'car/driveMode/1.0' if the value in the current row is none of these
        #     if row['Label']
        #         df.loc[:i, 'current_drive_mode'] = 'car/driveMode/0.0'

    drive_mode_info.append(df)
data = pd.concat(drive_mode_info, axis=0)
data[['Label', 'session', 'current_drive_mode']].to_csv('check.csv')

100%|██████████| 1/1 [00:00<00:00,  7.74it/s]


In [None]:
vehicle_list = context_data.car_id.unique().tolist()
random.shuffle(vehicle_list)
vehicle_dict = {}
used_values = set()
for vehicle in vehicle_list:
    value = random.randint(1, len(vehicle_list))
    while value in used_values:
        value = random.randint(1, len(vehicle_list))
    vehicle_dict[vehicle] = value
    used_values.add(value)

context_data['car_id'] = context_data['car_id'].map(vehicle_dict)
context_data = context_data.sort_values(by=['session','datetime'])

context_data['session'] = context_data['session'].astype(int)
context_data = context_data[dynamic_context_var + static_context_var + ['session', 'datetime']]
context_data = context_data.groupby(['session', 'datetime'])[dynamic_context_var + static_context_var].mean().reset_index()

with open(train_session_path, 'rb') as pickle_file:
    train_sessions = pickle.load(pickle_file)
with open(test_session_path, 'rb') as pickle_file:
    test_sessions = pickle.load(pickle_file)
context_data = context_data[context_data['session'].isin(train_sessions + test_sessions)]

selected_sequence = pd.read_csv(sequence_context_path, parse_dates=['datetime'], index_col=0).reset_index()
selected_sequence['session'] = selected_sequence['session'].astype(int)
min_datetime_indices = selected_sequence.groupby('session')['datetime'].idxmin()
selected_sequence = selected_sequence.drop(min_datetime_indices)
selected_sequence.reset_index(drop=True, inplace=True)

selected_dfs = []
for session in tqdm(selected_sequence['session'].unique().tolist()):
    selected_sequence_curr = selected_sequence[selected_sequence['session']==session]
    context_data_curr = context_data[context_data['session']==session]
    context_data_curr = context_data_curr[context_data_curr['datetime']<=selected_sequence_curr['datetime'].max()]
    selected_dfs.append(context_data_curr)
training_sequence_context = pd.concat(selected_dfs, axis=0)

In [None]:
# testing_sessions = [16, 25]
# selected_sequence = selected_sequence[selected_sequence['session'].isin(testing_sessions)]
# training_sequence_context = training_sequence_context[training_sequence_context['session'].isin(testing_sessions)]

window_id = 0
if sequence_augmentation == True:
    grouped_selected_sequence = selected_sequence.groupby('session')
    augmented_frames = []
    for session, selected_sequence_curr in tqdm(grouped_selected_sequence):
        for i, row in selected_sequence_curr.iloc[::-1].iterrows():
            context_filt_curr = training_sequence_context[
                (training_sequence_context['session'] == session) &
                (training_sequence_context['datetime'] < row['datetime'])
            ].copy()
            if not whole_session_context:
                context_filt_curr = context_filt_curr.tail(window)
            context_filt_curr['window_id'] = window_id
            # context_filt_curr['session'] = session
            augmented_frames.append(context_filt_curr)
            window_id += 1
    training_sequence_context_augmented = pd.concat(augmented_frames, axis=0)
    context_data = training_sequence_context_augmented.reset_index(drop=True)
else:
    # if sequence_augmentation is set to false
    if not whole_session_context:
        context_data = training_sequence_context.groupby('session').tail(window)
    context_data = training_sequence_context.reset_index(drop=True)
    context_data['window_id'] = context_data.groupby('session').ngroup()

In [None]:
dynamic_context = context_data[dynamic_context_var + ['window_id', 'session', 'datetime']]
print('number of dynamic context session', len(dynamic_context.window_id.unique().tolist()))

# function to pad first value to fit the window size
if pad_to_window_size:
    df = dynamic_context.copy()
    session_counts = df.groupby('window_id').size()
    less_than_100 = session_counts[session_counts < window].index.tolist()
    print(f'Number of window with window length less than {window}: ', len(less_than_100))
    window100_dfs = df[~df['window_id'].isin(less_than_100)]
    empty_df = []
    for window_id in tqdm(less_than_100):
        sub_df = df[df['window_id'] == window_id]
        rows_to_pad = window - len(sub_df)
        min_datetime_row = sub_df.loc[sub_df['datetime'].idxmin()]
        pad_df = pd.DataFrame(min_datetime_row, df.columns).transpose()
        pad_df = pd.concat([pad_df] * int(rows_to_pad), ignore_index=True, axis=0)

        padded_df = pd.concat([pad_df, sub_df], axis=0).reset_index(drop=True)
        padded_df['window_id'] = window_id
        empty_df.append(padded_df)
    if empty_df:
        df = pd.concat(empty_df, axis=0).reset_index(drop=True)
        df = pd.concat([df, window100_dfs], axis=0).sort_values(by=['window_id']).reset_index(drop=True)
        session_counts = df.groupby('window_id').size()
        less_than_100 = session_counts[session_counts < window].index.tolist()
        print(f'Number of window with window length less than {window}: ', len(less_than_100))
        dynamic_context = df

static_context = context_data[static_context_var + ['window_id', 'session', 'datetime']]
print('number of windows', len(dynamic_context.window_id.unique().tolist()), len(static_context.window_id.unique().tolist()))
print('number of session', len(dynamic_context.session.unique().tolist()), len(static_context.session.unique().tolist()))

In [None]:
static_context_list = []
grouped_static_context = static_context.groupby('window_id')

for window, static_context_curr in tqdm(grouped_static_context):
    unique_curr = static_context_curr.drop_duplicates(subset=static_context_curr.columns.difference(['datetime']))
    if len(unique_curr) > 1:
        most_repeated_values = static_context_curr.mode().iloc[0]
        result_df = pd.DataFrame(most_repeated_values).transpose()
        result_df = result_df.reset_index(drop=True)
        static_context_list.append(result_df)
    else:
        static_context_list.append(unique_curr)

if static_context_list:
    static_context = pd.concat(static_context_list, axis=0).reset_index(drop=True)

train_dynamic_context = dynamic_context[dynamic_context['session'].isin(train_sessions)]
test_dynamic_context = dynamic_context[dynamic_context['session'].isin(test_sessions)]
train_static_context = static_context[static_context['session'].isin(train_sessions)]
test_static_context = static_context[static_context['session'].isin(test_sessions)]

train_dynamic_context['window_id'] = train_dynamic_context.groupby('window_id').ngroup()
test_dynamic_context['window_id'] = test_dynamic_context.groupby('window_id').ngroup()
train_static_context['window_id'] = train_static_context.groupby('window_id').ngroup()
test_static_context['window_id'] = test_static_context.groupby('window_id').ngroup()
# if data_autoencoder:
#     train_dynamic_context.to_csv('../pretrain/time-series-autoencoder/data/dynamic_context_train.csv', index=False)
#     test_dynamic_context.to_csv('../pretrain/time-series-autoencoder/data/dynamic_context_test.csv', index=False)

print('number of session', len(train_dynamic_context.window_id.unique().tolist()), len(test_dynamic_context.window_id.unique().tolist()),
       len(train_static_context.window_id.unique().tolist()), len(test_static_context.window_id.unique().tolist()))

train_dynamic_context.to_csv(f'{base_path}{augmentation_folder}dynamic_context/unnormal/train.csv', index=False)
test_dynamic_context.to_csv(f'{base_path}{augmentation_folder}dynamic_context/unnormal/test.csv', index=False)

In [None]:
#Normalisation
dynamic_context_to_normalize = [col for col in train_dynamic_context.columns if col not in ['window_id', 'session_ids', 'datetime', 'session_id', 'session']]
scaler_dynamic_context = MinMaxScaler()
scaler_dynamic_context.fit(train_dynamic_context[dynamic_context_to_normalize])
train_dynamic_context[dynamic_context_to_normalize] = scaler_dynamic_context.transform(train_dynamic_context[dynamic_context_to_normalize])
test_dynamic_context[dynamic_context_to_normalize] = scaler_dynamic_context.transform(test_dynamic_context[dynamic_context_to_normalize])

train_dynamic_context.to_csv(train_dynamic_context_path, index=False)
test_dynamic_context.to_csv(test_dynamic_context_path, index=False)

train_static_context.to_csv(train_static_context_path, index=False)
test_static_context.to_csv(test_static_context_path, index=False)

In [None]:
test_static_context

In [None]:
test_dynamic_context

In [None]:
train_static_context

In [None]:
train_dynamic_context

In [None]:
def session_window_mapping(df):
    session_window_dict = {}

    for index, row in df.iterrows():
        session = int(row['session'])
        window_id = int(row['window_id'])
        
        # If the session is not already in the dictionary, initialize an empty set
        if session not in session_window_dict:
            session_window_dict[session] = set()
        
        # Add the window_id to the set corresponding to the session
        session_window_dict[session].add(window_id)

    # Convert sets to lists in the resulting dictionary
    session_window_dict = {session: list(window_ids) for session, window_ids in session_window_dict.items()}
    return session_window_dict


train_session_win_id_mapping_dc = session_window_mapping(train_dynamic_context)
test_session_win_id_mapping_dc = session_window_mapping(test_dynamic_context)
train_session_win_id_mapping_sc = session_window_mapping(train_static_context)
test_session_win_id_mapping_sc = session_window_mapping(test_static_context)

with open(os.path.join(parameter_path, 'session_win_id_mapping.pkl'), 'rb') as pickle_file:
    train_session_win_id_mapping = pickle.load(pickle_file)
    test_session_win_id_mapping = pickle.load(pickle_file)

In [None]:
train_session_win_id_mapping_sc

In [None]:
train_session_win_id_mapping_dc

In [None]:
if train_session_win_id_mapping_dc == train_session_win_id_mapping_sc == train_session_win_id_mapping:
    print("All training data mapping are exactly identical.")
if test_session_win_id_mapping_dc == test_session_win_id_mapping_sc == test_session_win_id_mapping:
    print("All testing data mapping are exactly identical.")

In [None]:
print(len(train_static_context.session.unique().tolist()), len(test_static_context.session.unique().tolist()))
print(len(train_dynamic_context.window_id.unique().tolist()), len(test_dynamic_context.window_id.unique().tolist()))