In [1]:
import pandas as pd
import os
import random
from tqdm import tqdm
import pickle
from sklearn.preprocessing import MinMaxScaler



In [2]:
regenerate_context_data = True
sequence_augmentation = True
whole_session_context = False
model_test_run = False
data_autoencoder = False
pad_to_window_size = True

PATH_TO_LOAD = '../data/04_Merged'
combined_context_path = '../data/06_context_feat_engg/data_featue_engineering.csv'

window = 100 #seconds

base_path = '../datasets/sequential/'
augmentation_folder = 'featengg/' if sequence_augmentation else 'non_aug/'
if model_test_run:
    augmentation_folder = 'test/aug/' if sequence_augmentation else 'test/non_aug/'

sequence_context_path = f'{base_path}{augmentation_folder}parameters/sequence_context.csv'
parameter_path = f'{base_path}{augmentation_folder}parameters'
train_session_path = f'{base_path}{augmentation_folder}parameters/train_sessions.pkl'
test_session_path = f'{base_path}{augmentation_folder}parameters/test_sessions.pkl'
train_dynamic_context_path = f'{base_path}{augmentation_folder}dynamic_context/train.csv'
test_dynamic_context_path = f'{base_path}{augmentation_folder}dynamic_context/test.csv'
train_static_context_path = f'{base_path}{augmentation_folder}static_context/train.csv'
test_static_context_path = f'{base_path}{augmentation_folder}static_context/test.csv'
train_sequence_path = f'{base_path}{augmentation_folder}seq/train.tsv'
test_sequence_path = f'{base_path}{augmentation_folder}seq/test.tsv'

In [3]:
all_columns = ['index', 'avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
       'month', 'odometer', 'light_sensor_rear', 'light_sensor_front',
       'temperature_in', 'KBI_speed', 'soc', 'ESP_speed', 'latitude',
       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l', 'seatbelt_rear_m',
       'seatbelt_rear_r', 'CHA_ESP_drive_mode', 'CHA_MO_drive_mode',
       'rain_sensor', 'street_category', 'kickdown', 'altitude',
       'driving_program', 'datetime', 'session', 'Label', 'ID',
       'FunctionValue', 'domain', 'BeginTime', 'time_second',
       'distance_driven', 'ts_normalized', 'weekday']

selected = [ 'avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
       'month', 'light_sensor_rear', 'light_sensor_front',
       'temperature_in', 'KBI_speed', 'soc', 'latitude',
       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l',
       'seatbelt_rear_r', 'street_category', 'altitude',
       'datetime', 'session', 'time_second',
       'distance_driven', 'weekday'
]

bad_quality = ['CHA_ESP_drive_mode', 
             'CHA_MO_drive_mode',
             'rain_sensor',
             'kickdown',
             'ESP_speed',
             'seatbelt_rear_m',
            'driving_program',
            'ts_normalized'
             ]

dynamic_context_var = ['avg_irradiation', 'steering_speed', 'temperature_out', 
                       'light_sensor_rear', 'light_sensor_front', 
                       'temperature_in', 'KBI_speed', 
                       'latitude','longitude', 'altitude'] # todo remove these features in the future
cat_static_context_var = ['car_id', 'month', 'weekday', 'hour', 'season', 'seatbelt_codriver', 'seatbelt_rear_l', # categorical static context
                     'seatbelt_rear_r',  'street_category']
dense_static_context_var =  ['distance_driven_benchmark', 'soc', 'time_second']  # dense static context
status_static_context_var = ['ess_status', 'current_drive_mode', 'current_clima_mode', 'current_media_source', # status static context
                     'nav_guidance_status', 'proximity_to_parking_spot', 'phone_status',
                     'bluetooth_connected', 'phone_os',
                     'new_bluetooth_device_to_pair']
#todo i feel street category is higly fluctuating. might be better to ignore
vehicles = ['SEB880','SEB882','SEB883','SEB885','SEB888','SEB889']

In [4]:
context_data = pd.read_csv(combined_context_path, parse_dates=['datetime'], index_col=0, low_memory=False)
context_data_filtered = context_data[context_data['distance_driven'] != 0]
context_data_filtered['distance_driven_benchmark'] = context_data_filtered.groupby('session')['distance_driven'].transform(lambda x: x - x.min())
context_data['distance_driven_benchmark'] = context_data_filtered['distance_driven_benchmark']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  context_data_filtered['distance_driven_benchmark'] = context_data_filtered.groupby('session')['distance_driven'].transform(lambda x: x - x.min())


In [5]:
with open(train_session_path, 'rb') as pickle_file:
    train_sessions = pickle.load(pickle_file)
with open(test_session_path, 'rb') as pickle_file:
    test_sessions = pickle.load(pickle_file)
context_data = context_data[context_data['session'].isin(train_sessions + test_sessions)]

train_sequence = pd.read_csv(train_sequence_path, sep='\t', low_memory=False)
test_sequence = pd.read_csv(test_sequence_path, sep='\t', low_memory=False)
selected_sequence = pd.concat([train_sequence, test_sequence], axis=0).sort_values(['session', 'window_id'])

# selected_sequence = pd.read_csv(sequence_context_path, parse_dates=['datetime'], index_col=0)
# selected_sequence['session'] = selected_sequence['session'].astype(int)
# min_datetime_indices = selected_sequence.groupby('session')['datetime'].idxmin()
# selected_sequence = selected_sequence.drop(min_datetime_indices)
# selected_sequence.reset_index(drop=True, inplace=True)

# selected_dfs = []
# for session in tqdm(selected_sequence['session'].unique().tolist()):
#     selected_sequence_curr = selected_sequence[selected_sequence['session']==session]
#     context_data_curr = context_data[context_data['session']==session]
#     context_data_curr = context_data_curr[context_data_curr['datetime']<=selected_sequence_curr['timestamp_target_interaction'].max()]
#     selected_dfs.append(context_data_curr)
# training_sequence_context = pd.concat(selected_dfs, axis=0)
training_sequence_context = context_data

In [6]:
print(len(context_data.session.unique().tolist()), 
      len(training_sequence_context.session.unique().tolist()))

1634 1634


In [7]:
if sequence_augmentation == True:
    augmented_frames = []
    for index, row in tqdm(selected_sequence.iterrows(), total=len(selected_sequence)):
        session = row['session']
        window_id = row['window_id']
        timestamp_target_interaction = row['timestamp_target_interaction']
        training_sequence_context_curr = training_sequence_context[(training_sequence_context['session'] == session) &
                                                    (training_sequence_context['datetime'] <= timestamp_target_interaction)].copy()
        if training_sequence_context_curr.empty:
            print(session, window_id)
        if not whole_session_context and window < len(training_sequence_context_curr):
                training_sequence_context_curr = training_sequence_context_curr.tail(window)
        training_sequence_context_curr['window_id'] = window_id
        augmented_frames.append(training_sequence_context_curr)
        # print(session, window_id, timestamp_target_interaction)
        # break
    training_sequence_context_augmented = pd.concat(augmented_frames, axis=0)
    context_data = training_sequence_context_augmented.reset_index(drop=True)
    context_data['wind_id'] = context_data.groupby(['session', 'window_id']).ngroup()
else:
    # if sequence_augmentation is set to false
    if not whole_session_context:
        context_data = training_sequence_context.groupby('session').tail(window)
    context_data = training_sequence_context.reset_index(drop=True)
    context_data['window_id'] = context_data.groupby('session').ngroup()

100%|██████████| 5883/5883 [03:54<00:00, 25.13it/s]


In [8]:
print('total number of sequence data sessions: ', len(selected_sequence.session.unique().tolist()))
print('total number of Sequence data windows: ', len(train_sequence.window_id.unique().tolist()) + len(test_sequence.window_id.unique().tolist()))
print('total number of context data sessions: ', len(context_data.session.unique().tolist()))
print('total number of context data windows: ', len(context_data.wind_id.unique().tolist()))
#dont be the bothered about the total number of windows.

total number of sequence data sessions:  1634
total number of Sequence data windows:  5883
total number of context data sessions:  1634
total number of context data windows:  5883


In [9]:
# testing_sessions = [16, 25]
# selected_sequence = selected_sequence[selected_sequence['session'].isin(testing_sessions)]
# training_sequence_context = training_sequence_context[training_sequence_context['session'].isin(testing_sessions)]

# window_id = 0
# if sequence_augmentation == True:
#     grouped_selected_sequence = selected_sequence.groupby('session')
#     augmented_frames = []
#     for session, selected_sequence_curr in tqdm(grouped_selected_sequence):
#         for i, row in selected_sequence_curr.iloc[::-1].iterrows():
#             context_filt_curr = training_sequence_context[
#                 (training_sequence_context['session'] == session) &
#                 (training_sequence_context['datetime'] < row['datetime'])].copy()
#             if not whole_session_context:
#                 context_filt_curr = context_filt_curr.tail(window)
#             context_filt_curr['window_id'] = window_id
#             # context_filt_curr['session'] = session
#             augmented_frames.append(context_filt_curr)
#             window_id += 1
#     training_sequence_context_augmented = pd.concat(augmented_frames, axis=0)
#     context_data = training_sequence_context_augmented.reset_index(drop=True)
# else:
#     # if sequence_augmentation is set to false
#     if not whole_session_context:
#         context_data = training_sequence_context.groupby('session').tail(window)
#     context_data = training_sequence_context.reset_index(drop=True)
#     context_data['window_id'] = context_data.groupby('session').ngroup()

In [10]:
dynamic_context = context_data[dynamic_context_var + ['window_id', 'session', 'datetime', 'wind_id']]
print('number of dynamic context session', len(dynamic_context[['window_id', 'session']].drop_duplicates()))

# function to pad first value to fit the window size
if pad_to_window_size:
    df = dynamic_context.copy()
    session_counts = df.groupby('wind_id').size()
    less_than_100 = session_counts[session_counts < window].index.tolist()
    print(f'Number of window with window length less than {window}: ', len(less_than_100))
    window100_dfs = df[~df['wind_id'].isin(less_than_100)]
    empty_df = []
    for window_id in tqdm(less_than_100):
        sub_df = df[df['wind_id'] == window_id]
        rows_to_pad = window - len(sub_df)
        min_datetime_row = sub_df.loc[sub_df['datetime'].idxmin()]
        pad_df = pd.DataFrame(min_datetime_row, df.columns).transpose()
        pad_df = pd.concat([pad_df] * int(rows_to_pad), ignore_index=True, axis=0)

        padded_df = pd.concat([pad_df, sub_df], axis=0).reset_index(drop=True)
        # padded_df['wind_id'] = window_id
        # padded_df['window_id'] = window_id
        # padded_df['wind_id'] = window_id
        empty_df.append(padded_df)
    if empty_df:
        df = pd.concat(empty_df, axis=0).reset_index(drop=True)
        df = pd.concat([df, window100_dfs], axis=0).sort_values(by=['window_id']).reset_index(drop=True)
        session_counts = df.groupby('window_id').size()
        less_than_100 = session_counts[session_counts < window].index.tolist()
        print(f'Number of window with window length less than {window}: ', len(less_than_100))
        dynamic_context = df

number of dynamic context session 5883
Number of window with window length less than 100:  208


100%|██████████| 208/208 [00:01<00:00, 156.25it/s]


Number of window with window length less than 100:  0


In [11]:
context_data

Unnamed: 0,session,datetime,Label,avg_irradiation,steering_speed,temperature_out,light_sensor_rear,light_sensor_front,temperature_in,KBI_speed,...,nav_guidance_status,proximity_to_parking_spot,phone_status,bluetooth_connected,season,phone_os,new_bluetooth_device_to_pair,distance_driven_benchmark,window_id,wind_id
0,7.0,2022-09-07 20:59:13,,0.000000,14.890339,20.0,0.000000,0.000000,26.000000,20.442914,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_connected,Autumn,unknown,no bluetooth device to pair,1.0,0,0
1,7.0,2022-09-07 20:59:14,,0.000000,12.610906,20.0,0.000000,0.000000,26.000000,18.537040,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_connected,Autumn,unknown,no bluetooth device to pair,1.0,0,0
2,7.0,2022-09-07 20:59:14,,0.000000,12.610906,20.0,0.000000,0.000000,26.000000,18.537040,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_connected,Autumn,unknown,no bluetooth device to pair,1.0,0,0
3,7.0,2022-09-07 20:59:16,,0.000000,24.653542,20.0,0.000000,0.000000,26.000000,16.320000,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_connected,Autumn,unknown,no bluetooth device to pair,1.0,0,0
4,7.0,2022-09-07 20:59:17,,0.000000,99.878832,20.0,0.000000,0.000000,26.160953,13.760000,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_connected,Autumn,unknown,no bluetooth device to pair,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579667,5399.0,2023-03-31 16:08:14,,30.000000,0.000000,9.0,5600.000000,876.000000,27.500000,39.040000,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_unconnected,Spring,unconnected,no bluetooth device to pair,31.0,4623,5882
579668,5399.0,2023-03-31 16:08:15,,30.000000,0.000000,9.0,5629.085099,1026.544147,27.500000,40.804305,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_unconnected,Spring,unconnected,no bluetooth device to pair,31.0,4623,5882
579669,5399.0,2023-03-31 16:08:16,,22.610694,102.026156,9.0,6370.064358,1773.267177,27.500000,42.984007,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_unconnected,Spring,unconnected,no bluetooth device to pair,31.0,4623,5882
579670,5399.0,2023-03-31 16:08:17,,12.616554,40.000000,9.0,6800.000000,2124.000000,27.500000,49.132555,...,navi_inactive,no_parking_spot_closeby,unconnected,bluetooth_unconnected,Spring,unconnected,no bluetooth device to pair,31.0,4623,5882


In [12]:
dense_status_static_context = context_data[dense_static_context_var + status_static_context_var + ['window_id', 'session', 'datetime', 'wind_id']]
dense_status_static_context = dense_status_static_context.sort_values(by=['wind_id','datetime'], ascending=False)
dense_status_static_context = dense_status_static_context.groupby('wind_id').first()
dense_status_static_context.reset_index(inplace=True)
for col in status_static_context_var:
    dense_status_static_context[col], _ = pd.factorize(dense_status_static_context[col])
dense_status_static_context = dense_status_static_context.sort_values(by='wind_id')

In [13]:
for col in status_static_context_var:
    print(col)
    print(sorted(dense_status_static_context[col].unique().tolist()))

ess_status
[0, 1]
current_drive_mode
[0, 1, 2, 3]
current_clima_mode
[0, 1, 2, 3]
current_media_source
[0, 1, 2, 3, 4]
nav_guidance_status
[0, 1]
proximity_to_parking_spot
[0, 1]
phone_status
[0, 1]
bluetooth_connected
[0, 1]
phone_os
[0, 1, 2, 3]
new_bluetooth_device_to_pair
[0, 1]


In [14]:
cat_static_context = context_data[cat_static_context_var + ['window_id', 'session', 'datetime', 'wind_id']]
cat_static_context = cat_static_context.groupby('wind_id').apply(lambda x: x.mode().iloc[0]).reset_index(drop=True)
cat_static_context = cat_static_context.sort_values(by='wind_id')
static_context = pd.merge(cat_static_context, dense_status_static_context, on=['wind_id', 'window_id', 'session'], how='inner')
print('number of windows', len(dynamic_context.wind_id.unique().tolist()), len(static_context.wind_id.unique().tolist()))
print('number of session', len(dynamic_context.session.unique().tolist()), len(static_context.session.unique().tolist()))

# static_context_list = []
# grouped_static_context = static_context.groupby('wind_id')
# for window, static_context_curr in tqdm(grouped_static_context):
#     unique_curr = static_context_curr.drop_duplicates(subset=static_context_curr.columns.difference(['datetime']))
#     if len(unique_curr) > 1:
#         most_repeated_values = static_context_curr.mode().iloc[0]
#         result_df = pd.DataFrame(most_repeated_values).transpose()
#         result_df = result_df.reset_index(drop=True)
#         static_context_list.append(result_df)
#     else:
#         static_context_list.append(unique_curr)
#     static_context = pd.concat(static_context_list, axis=0).reset_index(drop=True)
# static_context

number of windows 5883 5883
number of session 1634 1634


In [15]:
static_context

Unnamed: 0,car_id,month,weekday,hour,season,seatbelt_codriver,seatbelt_rear_l,seatbelt_rear_r,street_category,window_id,...,current_drive_mode,current_clima_mode,current_media_source,nav_guidance_status,proximity_to_parking_spot,phone_status,bluetooth_connected,phone_os,new_bluetooth_device_to_pair,datetime_y
0,1.0,9.0,2.0,21.0,Autumn,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,2022-09-07 21:00:51
1,1.0,9.0,4.0,18.0,Autumn,0.0,1.0,0.0,7.0,0.0,...,0,0,1,0,0,0,0,0,0,2022-09-09 18:43:10
2,1.0,9.0,4.0,18.0,Autumn,0.0,1.0,0.0,4.0,1.0,...,0,0,2,0,0,0,0,0,0,2022-09-09 18:24:03
3,1.0,9.0,4.0,18.0,Autumn,0.0,1.0,0.0,5.0,2.0,...,0,0,1,0,0,0,0,0,0,2022-09-09 18:02:19
4,1.0,9.0,5.0,17.0,Autumn,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,1,1,0,2022-09-10 17:15:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5878,6.0,3.0,1.0,12.0,Spring,0.0,0.0,0.0,7.0,1258.0,...,0,0,0,1,0,0,1,0,0,2023-03-28 12:54:10
5879,6.0,3.0,1.0,14.0,Spring,0.0,0.0,0.0,5.0,4620.0,...,0,0,0,0,0,0,1,1,0,2023-03-28 14:16:00
5880,6.0,3.0,1.0,18.0,Spring,0.0,0.0,0.0,0.0,4621.0,...,0,0,1,0,0,0,1,0,0,2023-03-28 18:38:29
5881,6.0,3.0,4.0,16.0,Spring,1.0,1.0,1.0,3.0,4622.0,...,0,0,0,0,0,0,1,1,0,2023-03-31 16:19:44


In [16]:
train_dynamic_context = dynamic_context[dynamic_context['session'].isin(train_sessions)]
test_dynamic_context = dynamic_context[dynamic_context['session'].isin(test_sessions)]
train_static_context = static_context[static_context['session'].isin(train_sessions)]
test_static_context = static_context[static_context['session'].isin(test_sessions)]

# train_dynamic_context['window_id'] = train_dynamic_context.groupby('window_id').ngroup()
# test_dynamic_context['window_id'] = test_dynamic_context.groupby('window_id').ngroup()
# train_static_context['window_id'] = train_static_context.groupby('window_id').ngroup()
# test_static_context['window_id'] = test_static_context.groupby('window_id').ngroup()
# if data_autoencoder:
#     train_dynamic_context.to_csv('../pretrain/time-series-autoencoder/data/dynamic_context_train.csv', index=False)
#     test_dynamic_context.to_csv('../pretrain/time-series-autoencoder/data/dynamic_context_test.csv', index=False)

print('number of session', len(train_dynamic_context.window_id.unique().tolist()), len(test_dynamic_context.window_id.unique().tolist()),
       len(train_static_context.window_id.unique().tolist()), len(test_static_context.window_id.unique().tolist()))

train_dynamic_context.to_csv(f'{base_path}{augmentation_folder}dynamic_context/unnormal/train.csv', index=False)
test_dynamic_context.to_csv(f'{base_path}{augmentation_folder}dynamic_context/unnormal/test.csv', index=False)

number of session 4624 1259 4624 1259


In [17]:
#Normalisation
dynamic_context_to_normalize = [col for col in train_dynamic_context.columns if col not in ['window_id', 'session_ids', 'datetime', 'session_id', 'session']]
scaler_dynamic_context = MinMaxScaler()
scaler_dynamic_context.fit(train_dynamic_context[dynamic_context_to_normalize])
train_dynamic_context[dynamic_context_to_normalize] = scaler_dynamic_context.transform(train_dynamic_context[dynamic_context_to_normalize])
test_dynamic_context[dynamic_context_to_normalize] = scaler_dynamic_context.transform(test_dynamic_context[dynamic_context_to_normalize])


scaler_dense_static_context = MinMaxScaler()
scaler_dense_static_context.fit(train_static_context[dense_static_context_var])
train_static_context[dense_static_context_var] = scaler_dense_static_context.transform(train_static_context[dense_static_context_var])
test_static_context[dense_static_context_var] = scaler_dense_static_context.transform(test_static_context[dense_static_context_var])

train_dynamic_context.to_csv(train_dynamic_context_path, index=False)
test_dynamic_context.to_csv(test_dynamic_context_path, index=False)

train_static_context.to_csv(train_static_context_path, index=False)
test_static_context.to_csv(test_static_context_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dynamic_context[dynamic_context_to_normalize] = scaler_dynamic_context.transform(train_dynamic_context[dynamic_context_to_normalize])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dynamic_context[dynamic_context_to_normalize] = scaler_dynamic_context.transform(test_dynamic_context[dynamic_context_to_normalize])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

In [18]:
test_static_context

Unnamed: 0,car_id,month,weekday,hour,season,seatbelt_codriver,seatbelt_rear_l,seatbelt_rear_r,street_category,window_id,...,current_drive_mode,current_clima_mode,current_media_source,nav_guidance_status,proximity_to_parking_spot,phone_status,bluetooth_connected,phone_os,new_bluetooth_device_to_pair,datetime_y
1,1.0,9.0,4.0,18.0,Autumn,0.0,1.0,0.0,7.0,0.0,...,0,0,1,0,0,0,0,0,0,2022-09-09 18:43:10
2,1.0,9.0,4.0,18.0,Autumn,0.0,1.0,0.0,4.0,1.0,...,0,0,2,0,0,0,0,0,0,2022-09-09 18:24:03
3,1.0,9.0,4.0,18.0,Autumn,0.0,1.0,0.0,5.0,2.0,...,0,0,1,0,0,0,0,0,0,2022-09-09 18:02:19
10,1.0,9.0,3.0,9.0,Autumn,0.0,0.0,0.0,7.0,3.0,...,0,0,0,0,0,0,1,1,0,2022-09-15 09:43:45
18,1.0,9.0,6.0,11.0,Autumn,1.0,1.0,1.0,2.0,4.0,...,1,0,0,0,0,0,1,1,0,2022-09-25 12:00:20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5852,6.0,3.0,3.0,9.0,Spring,0.0,0.0,0.0,5.0,1254.0,...,0,0,0,1,0,0,1,1,0,2023-03-16 09:25:17
5853,6.0,3.0,3.0,9.0,Spring,0.0,0.0,0.0,5.0,1255.0,...,0,0,0,1,0,0,1,1,0,2023-03-16 09:24:28
5854,6.0,3.0,3.0,9.0,Spring,0.0,0.0,0.0,7.0,1256.0,...,1,0,0,1,0,0,1,1,0,2023-03-16 09:01:33
5855,6.0,3.0,3.0,8.0,Spring,0.0,0.0,0.0,7.0,1257.0,...,0,0,0,1,0,0,1,1,0,2023-03-16 08:57:04


In [19]:
test_dynamic_context

Unnamed: 0,avg_irradiation,steering_speed,temperature_out,light_sensor_rear,light_sensor_front,temperature_in,KBI_speed,latitude,longitude,altitude,window_id,session,datetime,wind_id
1,0.000000,0.000000,0.652174,0.031621,0.125367,0.591837,0.000000,0.930604,0.675379,0.392227,0,16.0,2022-09-09 18:41:58,0.00017
2,0.000000,0.000000,0.652174,0.031621,0.126347,0.591837,0.000000,0.930604,0.675379,0.392227,0,16.0,2022-09-09 18:41:59,0.00017
3,0.000000,0.000000,0.652174,0.031621,0.126347,0.591837,0.000000,0.930604,0.675379,0.392227,0,16.0,2022-09-09 18:42:00,0.00017
4,0.000000,0.000000,0.652174,0.031621,0.127326,0.591837,0.000000,0.930604,0.675379,0.392227,0,16.0,2022-09-09 18:42:01,0.00017
5,0.000000,0.000000,0.652174,0.031621,0.127326,0.591837,0.000000,0.930604,0.675379,0.392227,0,16.0,2022-09-09 18:42:02,0.00017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251795,0.185320,0.000000,0.402174,0.382348,1.000000,0.591837,0.555980,0.931458,0.665268,0.371892,1258,5387.0,2023-03-28 12:53:39,0.99932
251796,0.215053,0.000000,0.402174,0.363287,1.000000,0.591837,0.555980,0.931462,0.665298,0.371609,1258,5387.0,2023-03-28 12:53:40,0.99932
251797,0.220000,0.003620,0.402174,0.347481,1.000000,0.589357,0.555980,0.931466,0.665328,0.370479,1258,5387.0,2023-03-28 12:53:41,0.99932
251798,0.126818,0.000000,0.402174,0.458498,1.000000,0.591837,0.557252,0.931441,0.665107,0.376128,1258,5387.0,2023-03-28 12:53:34,0.99932


In [20]:
train_static_context

Unnamed: 0,car_id,month,weekday,hour,season,seatbelt_codriver,seatbelt_rear_l,seatbelt_rear_r,street_category,window_id,...,current_drive_mode,current_clima_mode,current_media_source,nav_guidance_status,proximity_to_parking_spot,phone_status,bluetooth_connected,phone_os,new_bluetooth_device_to_pair,datetime_y
0,1.0,9.0,2.0,21.0,Autumn,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,2022-09-07 21:00:51
4,1.0,9.0,5.0,17.0,Autumn,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,1,1,0,2022-09-10 17:15:06
5,1.0,9.0,6.0,10.0,Autumn,1.0,1.0,1.0,1.0,2.0,...,0,0,3,0,0,0,1,1,0,2022-09-11 10:28:01
6,1.0,9.0,6.0,12.0,Autumn,1.0,1.0,1.0,7.0,3.0,...,0,0,0,0,0,0,0,0,0,2022-09-11 12:26:02
7,1.0,9.0,0.0,17.0,Autumn,0.0,0.0,0.0,0.0,4.0,...,0,0,0,0,0,0,1,1,0,2022-09-12 17:09:43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5877,6.0,3.0,1.0,10.0,Spring,0.0,0.0,0.0,7.0,4619.0,...,0,0,0,0,0,0,1,1,0,2023-03-28 10:10:12
5879,6.0,3.0,1.0,14.0,Spring,0.0,0.0,0.0,5.0,4620.0,...,0,0,0,0,0,0,1,1,0,2023-03-28 14:16:00
5880,6.0,3.0,1.0,18.0,Spring,0.0,0.0,0.0,0.0,4621.0,...,0,0,1,0,0,0,1,0,0,2023-03-28 18:38:29
5881,6.0,3.0,4.0,16.0,Spring,1.0,1.0,1.0,3.0,4622.0,...,0,0,0,0,0,0,1,1,0,2023-03-31 16:19:44


In [21]:
train_dynamic_context

Unnamed: 0,avg_irradiation,steering_speed,temperature_out,light_sensor_rear,light_sensor_front,temperature_in,KBI_speed,latitude,longitude,altitude,window_id,session,datetime,wind_id
0,0.000000,0.018372,0.641304,0.000000,0.000000,0.581633,0.081277,0.932435,0.646192,0.434025,0,7.0,2022-09-07 20:59:13,0.0
71,0.000000,0.015560,0.641304,0.000000,0.000000,0.581633,0.073700,0.932434,0.646185,0.434025,0,7.0,2022-09-07 20:59:14,0.0
101,0.000000,0.054422,0.619565,0.000000,0.000000,0.591837,0.080153,0.932491,0.646249,0.424205,0,7.0,2022-09-07 21:00:51,0.0
102,0.000000,0.024572,0.630435,0.000000,0.000000,0.591837,0.064042,0.932450,0.646111,0.427421,0,7.0,2022-09-07 20:59:40,0.0
103,0.000000,0.000643,0.630435,0.000000,0.000000,0.591837,0.075064,0.932451,0.646112,0.427247,0,7.0,2022-09-07 20:59:42,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588295,0.037789,0.017697,0.423913,0.094862,0.396200,0.612245,0.288852,0.926798,0.703405,0.598739,4623,5399.0,2023-03-31 16:07:06,1.0
588296,0.030000,0.007759,0.423913,0.083270,0.418086,0.612245,0.315599,0.926799,0.703386,0.597609,4623,5399.0,2023-03-31 16:07:05,1.0
588297,0.040000,0.041608,0.423913,0.083004,0.413427,0.612245,0.329823,0.926800,0.703344,0.595349,4623,5399.0,2023-03-31 16:07:03,1.0
588298,0.030000,0.000000,0.413043,0.103075,0.618479,0.612245,0.239711,0.926782,0.703476,0.604387,4623,5399.0,2023-03-31 16:07:14,1.0


In [22]:
def session_window_mapping(df):
    session_window_dict = {}

    for index, row in df.iterrows():
        session = int(row['session'])
        window_id = int(row['window_id'])
        
        # If the session is not already in the dictionary, initialize an empty set
        if session not in session_window_dict:
            session_window_dict[session] = set()
        
        # Add the window_id to the set corresponding to the session
        session_window_dict[session].add(window_id)

    # Convert sets to lists in the resulting dictionary
    session_window_dict = {session: list(window_ids) for session, window_ids in session_window_dict.items()}
    return session_window_dict


train_session_win_id_mapping_dc = session_window_mapping(train_dynamic_context)
test_session_win_id_mapping_dc = session_window_mapping(test_dynamic_context)
train_session_win_id_mapping_sc = session_window_mapping(train_static_context)
test_session_win_id_mapping_sc = session_window_mapping(test_static_context)

with open(os.path.join(parameter_path, 'session_win_id_mapping.pkl'), 'rb') as pickle_file:
    train_session_win_id_mapping = pickle.load(pickle_file)
    test_session_win_id_mapping = pickle.load(pickle_file)

In [23]:
len(train_session_win_id_mapping_sc) + len(test_session_win_id_mapping_sc)

1634

In [24]:
train_session_win_id_mapping_sc

{7: [0],
 20: [1],
 25: [2],
 27: [3],
 33: [4],
 35: [5],
 40: [6],
 45: [7],
 46: [8],
 50: [9, 10, 11],
 52: [12, 13],
 150: [14, 15, 16, 17, 18],
 151: [19],
 163: [20, 21],
 164: [24, 22, 23],
 172: [25, 26],
 178: [27],
 183: [28, 29, 30],
 186: [32, 33, 34, 31],
 188: [35],
 191: [36],
 192: [37],
 195: [38],
 196: [39],
 197: [40],
 199: [41, 42, 43],
 202: [44],
 205: [45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
 206: [56, 57, 58, 59],
 210: [60],
 211: [61],
 252: [62],
 253: [63],
 255: [64, 65, 66, 67],
 256: [68, 69],
 258: [70, 71],
 270: [72],
 274: [73],
 276: [74],
 277: [75, 76, 77, 78],
 357: [79],
 385: [80],
 387: [81, 82, 83, 84, 85, 86, 87],
 397: [88, 89, 90],
 401: [91],
 404: [92, 93],
 405: [94, 95],
 406: [96],
 408: [97, 98, 99],
 411: [100, 101],
 413: [102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127],
 416: [128, 129],
 418: 

In [25]:
train_session_win_id_mapping_dc

{7: [0],
 20: [1],
 25: [2],
 27: [3],
 33: [4],
 35: [5],
 40: [6],
 45: [7],
 46: [8],
 50: [9, 10, 11],
 52: [12, 13],
 150: [14, 15, 16, 17, 18],
 151: [19],
 163: [20, 21],
 164: [24, 22, 23],
 172: [25, 26],
 178: [27],
 183: [28, 29, 30],
 186: [32, 33, 34, 31],
 188: [35],
 191: [36],
 192: [37],
 195: [38],
 196: [39],
 197: [40],
 199: [41, 42, 43],
 202: [44],
 205: [45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
 206: [56, 57, 58, 59],
 210: [60],
 211: [61],
 252: [62],
 253: [63],
 255: [64, 65, 66, 67],
 256: [68, 69],
 258: [70, 71],
 270: [72],
 274: [73],
 276: [74],
 277: [75, 76, 77, 78],
 357: [79],
 385: [80],
 387: [81, 82, 83, 84, 85, 86, 87],
 397: [88, 89, 90],
 401: [91],
 404: [92, 93],
 405: [94, 95],
 406: [96],
 408: [97, 98, 99],
 411: [100, 101],
 413: [102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127],
 416: [128, 129],
 418: 

In [26]:
print(len(train_session_win_id_mapping_dc), len(train_session_win_id_mapping_sc), len(train_session_win_id_mapping))

1308 1308 1308


In [27]:
if train_session_win_id_mapping_dc == train_session_win_id_mapping_sc == train_session_win_id_mapping:
    print("All training data mapping are exactly identical.")
if test_session_win_id_mapping_dc == test_session_win_id_mapping_sc == test_session_win_id_mapping:
    print("All testing data mapping are exactly identical.")

All training data mapping are exactly identical.
All testing data mapping are exactly identical.


In [28]:
print(len(train_dynamic_context.session.unique().tolist()), len(test_dynamic_context.session.unique().tolist()))
print(len(train_dynamic_context.window_id.unique().tolist()), len(test_dynamic_context.window_id.unique().tolist()))
print(len(train_static_context.session.unique().tolist()), len(test_static_context.session.unique().tolist()))
print(len(train_static_context.window_id.unique().tolist()), len(test_static_context.window_id.unique().tolist()))

1308 326
4624 1259
1308 326
4624 1259
