In [1]:
import pandas as pd
import os
import random
from tqdm import tqdm
import pickle
from sklearn.preprocessing import MinMaxScaler



In [2]:
PATH_TO_LOAD = '../data/04_Merged'

merge_context_data = True
regenerate_context_data = True
sequence_augmentation = False

In [3]:
all_columns = ['index', 'avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
       'month', 'odometer', 'light_sensor_rear', 'light_sensor_front',
       'temperature_in', 'KBI_speed', 'soc', 'ESP_speed', 'latitude',
       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l', 'seatbelt_rear_m',
       'seatbelt_rear_r', 'CHA_ESP_drive_mode', 'CHA_MO_drive_mode',
       'rain_sensor', 'street_category', 'kickdown', 'altitude',
       'driving_program', 'datetime', 'session', 'Label', 'ID',
       'FunctionValue', 'domain', 'BeginTime', 'time_second',
       'distance_driven', 'ts_normalized', 'weekday']

selected = [ 'avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
       'month', 'light_sensor_rear', 'light_sensor_front',
       'temperature_in', 'KBI_speed', 'soc', 'latitude',
       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l',
       'seatbelt_rear_r', 'street_category', 'altitude',
       'datetime', 'session', 'time_second',
       'distance_driven', 'weekday'
]

bad_quality = ['CHA_ESP_drive_mode', 
             'CHA_MO_drive_mode',
             'rain_sensor',
             'kickdown',
             'ESP_speed',
             'seatbelt_rear_m',
            'driving_program',
            'ts_normalized'
             ]

dynamic_context_var = ['avg_irradiation', 'steering_speed', 'temperature_out', 'hour',
                       'light_sensor_rear', 'light_sensor_front', 
                       'temperature_in', 'KBI_speed', 'soc', 'latitude',
                       'longitude', 'seatbelt_codriver', 'seatbelt_rear_l',
                       'seatbelt_rear_r', 'street_category', 'altitude','time_second',
                       'distance_driven']
static_context_var = ['car_id', 'month', 'weekday']

In [4]:
# var = 'hour'
# test_min = {}
# test_max = {}
# for session in df.session.unique().tolist():
#     df_curr = df[df['session']==session]
#     test_min[session] = df_curr[var].min()
#     test_max[session] = df_curr[var].max()

In [5]:
## create context data

def load_context(vehicle):
    df = pd.read_csv(os.path.join(PATH_TO_LOAD, vehicle + "_merged.csv"), parse_dates=['datetime'], low_memory=False)
    context_lists = dynamic_context_var + static_context_var + ['session', 'datetime']
    context_lists.remove('car_id')
    df_filt = df[context_lists]
    df_filt = df_filt.dropna(subset=['KBI_speed'])
    df_filt_sort = df_filt.sort_values(by=['session','datetime'])
    return df_filt_sort

vehicles = ['SEB880','SEB882','SEB883','SEB885','SEB888','SEB889']
context_data = pd.DataFrame()

if merge_context_data == True:
    for vehicle in tqdm(vehicles):
        context_curr = load_context(vehicle)
        context_curr['car_id'] = vehicle
        context_data = pd.concat([context_data, context_curr], axis=0)
    context_data.to_csv('../data/05_Interaction_Sequences/context.csv')

if regenerate_context_data == True:
    # selected_sessions = merged_df['session'].unique().tolist()

    context_data = pd.read_csv('../data/05_Interaction_Sequences/context.csv', parse_dates=['datetime'], index_col=0)
    
    vehicle_list = context_data.car_id.unique().tolist()
    vehicle_dict = {vehicle: random.randint(1, 50) for vehicle in vehicle_list}
    context_data['car_id'] = context_data['car_id'].map(vehicle_dict)
    context_data = context_data.sort_values(by=['session','datetime'])
    context_data['session'] = context_data['session'].astype(int)
    static_context_var.append('session')
    static_context = context_data[static_context_var].drop_duplicates(subset=['car_id', 'session'])
    # context_data = context_data[dynamic_context]
    # selected_context = ['KBI_speed', 'car_id']
    dynamic_context_var.extend(['session', 'datetime'])
    dynamic_context = context_data[dynamic_context_var]
    dynamic_context_var = [item for item in dynamic_context_var if item not in ['session', 'datetime']]
    dynamic_context = dynamic_context.groupby(['session', 'datetime'])[dynamic_context_var].mean().reset_index()

  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [01:11<00:00, 11.92s/it]


In [6]:
selected_sequence = pd.read_csv('../data/05_Interaction_Sequences/sequence_context.csv', parse_dates=['datetime'], index_col=0).reset_index()
selected_sequence['session'] = selected_sequence['session'].astype(int)

temp_df = selected_sequence.groupby('session').apply(lambda group: group.iloc[:-1]).reset_index(drop=True)
dynamic_context = dynamic_context[dynamic_context['session'].isin(temp_df.session.unique().tolist())]

training_sequence_context = pd.DataFrame()
for session in tqdm(selected_sequence['session'].unique().tolist()):
    selected_sequence_curr = selected_sequence[selected_sequence['session']==session]
    context_data_curr = dynamic_context[dynamic_context['session']==session]
    context_data_curr = context_data_curr[context_data_curr['datetime']<=selected_sequence_curr['datetime'].max()]
    training_sequence_context = pd.concat([training_sequence_context,context_data_curr], axis=0)


100%|██████████| 954/954 [03:52<00:00,  4.10it/s]


In [7]:
# testing_sessions = [16, 25]
# selected_sequence = selected_sequence[selected_sequence['session'].isin(testing_sessions)]
# training_sequence_context = training_sequence_context[training_sequence_context['session'].isin(testing_sessions)]

training_sequence_context_augmented = pd.DataFrame()
session_id = 0
if sequence_augmentation == True:
    for session in tqdm(selected_sequence['session'].unique().tolist()):
        selected_sequence_curr = selected_sequence[selected_sequence['session']==session].reset_index()
        context_curr = training_sequence_context[training_sequence_context['session']==session].reset_index()
        for i in range(len(selected_sequence_curr)-1, -1, -1):
            context_filt_curr = training_sequence_context[
                (training_sequence_context['datetime'] <= selected_sequence_curr.loc[i, 'datetime'])].copy()
            # context_filt_curr.loc[context_filt_curr.index, 'session_id'] = session_id
            context_filt_curr['session_id'] = session_id
            training_sequence_context_augmented = pd.concat([training_sequence_context_augmented, context_filt_curr], axis=0)
            session_id += 1
    dynamic_context = training_sequence_context_augmented
else:
    # if sequence_augmentation is set to false
    dynamic_context = training_sequence_context


with open('../data/05_Interaction_Sequences/train_sessions.pkl', 'rb') as pickle_file:
    train_sessions = pickle.load(pickle_file)

with open('../data/05_Interaction_Sequences/test_sessions.pkl', 'rb') as pickle_file:
    test_sessions = pickle.load(pickle_file)

train_context_data = dynamic_context[dynamic_context['session'].isin(train_sessions)]
test_context_data = dynamic_context[dynamic_context['session'].isin(test_sessions)]

train_static_context = static_context[static_context['session'].isin(train_sessions)]
test_static_context = static_context[static_context['session'].isin(test_sessions)]

train_static_context['session'] = range(len(train_static_context))
test_static_context['session'] = range(len(test_static_context))

train_context_data['session'] = train_context_data.groupby('session').ngroup()
test_context_data['session'] = test_context_data.groupby('session').ngroup()

train_context_data = train_context_data.rename(columns={'session': 'session_id'})
test_context_data = test_context_data.rename(columns={'session': 'session_id',})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_static_context['session'] = range(len(train_static_context))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_static_context['session'] = range(len(test_static_context))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_context_data['session'] = train_context_data.groupby('session').ngr

In [8]:
# Normalization of data
# train_static_context = train_static_context.round(2)
# test_static_context = test_static_context.round(2)
# train_context_data = train_context_data.round(2)
# test_context_data = test_context_data.round(2)

dynamic_context_to_normalize = [col for col in train_context_data.columns if col not in ['session_id', 'datetime']]
scaler_dynamic_context = MinMaxScaler()
scaler_dynamic_context.fit(train_context_data[dynamic_context_to_normalize])
train_context_data[dynamic_context_to_normalize] = scaler_dynamic_context.transform(train_context_data[dynamic_context_to_normalize])
test_context_data[dynamic_context_to_normalize] = scaler_dynamic_context.transform(test_context_data[dynamic_context_to_normalize])


static_context_to_normalize = [col for col in train_static_context.columns if col not in ['session']]
scaler_static_context = MinMaxScaler()
scaler_static_context.fit(train_static_context[static_context_to_normalize])
train_static_context[static_context_to_normalize] = scaler_static_context.transform(train_static_context[static_context_to_normalize])
test_static_context[static_context_to_normalize] = scaler_static_context.transform(test_static_context[static_context_to_normalize])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_static_context[static_context_to_normalize] = scaler_static_context.transform(train_static_context[static_context_to_normalize])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_static_context[static_context_to_normalize] = scaler_static_context.transform(test_static_context[static_context_to_normalize])


In [11]:
if sequence_augmentation == True:
    train_context_data.to_csv('../datasets/sequential/aug/dynamic_context/train.csv', index=False)
    test_context_data.to_csv('../datasets/sequential/aug/dynamic_context/test.csv', index=False)

    train_static_context.to_csv('../datasets/sequential/aug/static_context/train.csv', index=False)
    test_static_context.to_csv('../datasets/sequential/aug/static_context/test.csv', index=False)
    
else:
    train_context_data.to_csv('../datasets/sequential/non_aug/dynamic_context/train.csv', index=False)
    test_context_data.to_csv('../datasets/sequential/non_aug/dynamic_context/test.csv', index=False)

    train_static_context.to_csv('../datasets/sequential/non_aug/static_context/train.csv', index=False)
    test_static_context.to_csv('../datasets/sequential/non_aug/static_context/test.csv', index=False)

In [12]:
test_context_data

Unnamed: 0,session_id,datetime,avg_irradiation,steering_speed,temperature_out,hour,light_sensor_rear,light_sensor_front,temperature_in,KBI_speed,soc,latitude,longitude,seatbelt_codriver,seatbelt_rear_l,seatbelt_rear_r,street_category,altitude,time_second,distance_driven
125634,0,2022-12-29 10:10:53,0.00,0.000000,0.20,0.434783,0.009881,0.028893,0.129032,0.000000,0.102423,0.000000,0.000000,1.0,0.0,0.0,0.500000,0.183234,0.000000,0.000000
125635,0,2022-12-29 10:10:55,0.00,0.000000,0.42,0.434783,0.019763,0.057786,0.129032,0.000000,0.201826,0.000000,0.000000,1.0,0.0,0.0,1.000000,0.183234,0.000042,0.340280
125636,0,2022-12-29 10:10:56,0.00,0.000000,0.42,0.434783,0.019763,0.057786,0.306452,0.000000,0.202832,0.000000,0.000000,1.0,0.0,0.0,1.000000,0.183234,0.000063,0.340280
125637,0,2022-12-29 10:10:57,0.00,0.000000,0.42,0.434783,0.019763,0.057786,0.306452,0.000000,0.202832,0.000000,0.000000,1.0,0.0,0.0,1.000000,0.183234,0.000084,0.340280
125638,0,2022-12-29 10:10:58,0.00,0.000000,0.42,0.434783,0.019763,0.057786,0.306452,0.000000,0.201826,0.000000,0.000000,1.0,0.0,0.0,1.000000,0.183234,0.000105,0.340280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8649520,190,2023-03-23 08:36:51,0.01,0.000000,0.45,0.347826,0.043478,0.160813,0.491935,0.000000,0.836998,0.932121,0.653673,0.0,0.0,0.0,0.000000,0.449841,0.047466,0.521696
8649521,190,2023-03-23 08:36:52,0.01,0.000000,0.45,0.347826,0.043478,0.166307,0.491935,0.000000,0.836998,0.932121,0.653673,0.0,0.0,0.0,0.142857,0.449841,0.047487,0.521696
8649522,190,2023-03-23 08:36:53,0.01,0.000000,0.45,0.347826,0.043478,0.182314,0.491935,0.000000,0.836998,0.932121,0.653673,0.0,0.0,0.0,1.000000,0.449841,0.047508,0.521696
8649523,190,2023-03-23 08:36:54,0.01,0.015824,0.45,0.347826,0.043478,0.196866,0.491935,0.003306,0.836998,0.932121,0.653673,0.0,0.0,0.0,0.142857,0.449841,0.047529,0.521696
