In [1]:
import pandas as pd
import os
import sys
import re
import json
import concurrent.futures
from tqdm import tqdm
import random
import pickle
from sklearn.model_selection import train_test_split



In [2]:
# Directory holding the per-vehicle "<vehicle>_merged.csv" files read by load_df.
PATH_TO_LOAD = '../data/04_Merged'
# When True, each session is expanded into all of its prefixes and the
# train/test split is done per session; when False, one row per session is kept.
sequence_augmentation = False
# Offset added to the enumeration index when labels and vehicles are mapped to
# integer ids, so the first assigned id is 2.
# NOTE(review): ids 0 and 1 are presumably reserved (e.g. padding/unknown) - confirm.
text_num_mapping_start = 2

In [3]:
def load_df(vehicle):
    """Read one vehicle's merged CSV and prepare it for label encoding.

    Args:
        vehicle: Vehicle id; the file ``<vehicle>_merged.csv`` is read from
            ``PATH_TO_LOAD`` with its first column as the index and
            ``datetime`` parsed as dates.

    Returns:
        DataFrame without rows whose ``Label`` is missing, ordered by
        ``session`` then ``datetime``, with an extra ``full_label`` column
        holding ``Label`` and ``FunctionValue`` joined by a single space.
    """
    csv_path = os.path.join(PATH_TO_LOAD, vehicle + "_merged.csv")
    merged = pd.read_csv(csv_path, parse_dates=['datetime'], low_memory=False, index_col=0)
    return (
        merged
        .dropna(subset=['Label'])
        .sort_values(by=['session', 'datetime'])
        .assign(full_label=lambda frame: frame['Label'] + ' ' + frame['FunctionValue'])
    )

# Placeholder only; full_df is rebound to the concatenated per-vehicle data further down.
full_df = pd.DataFrame()

# Vehicle ids; each one is passed to load_df, which reads "<id>_merged.csv".
vehicles = ['SEB880','SEB882','SEB883','SEB885','SEB888','SEB889']
# Earlier sequential loader, kept for reference (the threaded version below replaces it):
# for vehicle in tqdm(vehicles):
#     df_curr = load_df(vehicle)
#     df_curr['vehicle'] = vehicle
#     df_curr = df_curr.dropna(subset=['full_label'])
#     full_df = pd.concat([full_df, df_curr], ignore_index=True)

def process_vehicle(vehicle):
    """Load one vehicle's interactions and tag them with the vehicle id.

    Args:
        vehicle: Vehicle id forwarded to ``load_df``.

    Returns:
        The frame from ``load_df`` with a constant ``vehicle`` column added and
        rows with a missing ``full_label`` removed.
    """
    return (
        load_df(vehicle)
        .assign(vehicle=vehicle)
        .dropna(subset=['full_label'])
    )

# Load every vehicle concurrently; executor.map yields the frames in the order of `vehicles`.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_vehicle, vehicles), total=len(vehicles)))

full_df = pd.concat(results, ignore_index=True)

# Integer id per distinct label, numbered in order of first appearance and
# starting at text_num_mapping_start.
mapping = {
    category: label_id
    for label_id, category in enumerate(full_df['full_label'].unique(), start=text_num_mapping_start)
}
# Series.map is a direct dictionary lookup and yields an integer column.
# Series.replace(dict) produced the same values but goes through the generic
# replacement machinery, which is slow on object columns and version-dependent
# in the dtype it returns.
full_df['full_label_num'] = full_df['full_label'].map(mapping)

# Same encoding scheme for the vehicle ids.
mapping_vehicle = {
    category: vehicle_id
    for vehicle_id, category in enumerate(full_df['vehicle'].unique(), start=text_num_mapping_start)
}
full_df['vehicle_num'] = full_df['vehicle'].map(mapping_vehicle)

# Keep only the columns needed downstream, in chronological order per session,
# and collapse exact duplicate rows.
filt_df = full_df[['session', 'full_label_num', 'vehicle_num', 'datetime']].sort_values(by=['session', 'datetime'])
filt_df = filt_df.drop_duplicates()

# Minutes elapsed since the previous interaction of the same session.
# The value is rounded to 0.1 min first and the final int cast then truncates
# toward zero; the first interaction of each session has no predecessor (NaN)
# and is set to 0.
filt_df['interaction_time_delta'] = (filt_df.groupby('session')['datetime'].diff().dt.total_seconds() / 60).round(1)
filt_df['interaction_time_delta'] = filt_df['interaction_time_delta'].fillna(0)
filt_df['interaction_time_delta'] = filt_df['interaction_time_delta'].astype(int)

100%|██████████| 6/6 [00:32<00:00,  5.47s/it]


In [4]:
# TODO: include sessions that have just one interaction. That single interaction
# can be the target, with a "no click" input sequence plus the context.
# Find the sessions with exactly one interaction.
session_counts = filt_df['session'].value_counts()
single_interaction_mask = session_counts.eq(1)
session_with_one_interactions = session_counts.loc[single_interaction_mask].index.tolist()
one_interactions = filt_df.loc[filt_df['session'].isin(session_with_one_interactions)]
one_interactions

Unnamed: 0,session,full_label_num,vehicle_num,datetime,interaction_time_delta
0,7.0,2,2,2022-09-07 21:00:51,0
6,20.0,3,2,2022-09-10 17:15:06,0
10,27.0,2,2,2022-09-11 12:26:02,0
19,45.0,5,2,2022-09-16 07:13:03,0
20,46.0,9,2,2022-09-16 11:36:04,0
...,...,...,...,...,...
6128,5373.0,3,7,2023-03-25 15:20:25,0
6129,5375.0,3,7,2023-03-26 08:29:12,0
6131,5384.0,3,7,2023-03-28 08:10:54,0
6132,5385.0,3,7,2023-03-28 09:06:58,0


In [5]:
# Drop the single-interaction sessions found above; the variable name (including
# its spelling) is kept so any other cell referring to it keeps working.
relevanat_df = filt_df[~filt_df['session'].isin(session_with_one_interactions)]
# Persist session/datetime pairs of the remaining interactions as context data.
relevanat_df[['session', 'datetime']].to_csv('../data/05_Interaction_Sequences/sequence_context.csv')
# Number of remaining sessions. Placed last so the notebook actually displays it
# (as a middle statement its value was silently discarded). dropna=False counts
# a missing session id as one value, matching len(unique()).
relevanat_df['session'].nunique(dropna=False)

In [6]:
## Generating augmented data
def explode_both(row):
    """Expand one session row into one output row per contained sequence.

    Args:
        row: Mapping-like object with ``session`` (scalar), ``sequence`` (list of
            item sequences) and ``time_delta`` (list of matching delta lists).

    Returns:
        DataFrame with columns ``session``, ``sequence`` and ``time_delta``,
        one row per entry of ``row['sequence']``, the session value repeated.
    """
    n_rows = len(row['sequence'])
    exploded = {
        'session': [row['session'] for _ in range(n_rows)],
        'sequence': row['sequence'],
        'time_delta': row['time_delta'],
    }
    return pd.DataFrame(exploded)

def sequence_generation(df, sequence_augmentation):
    """Build one item sequence (plus matching time deltas) per session.

    Rows are taken in the order in which they appear in ``df``; sessions are
    emitted in order of first appearance.

    Args:
        df: DataFrame with the columns ``session``, ``full_label_num`` and
            ``interaction_time_delta``.
        sequence_augmentation: If truthy, every session contributes all of its
            prefixes of length >= 2, longest first. Otherwise each session
            contributes its full sequence only.

    Returns:
        DataFrame with the columns ``session``, ``sequence`` (list of item ids)
        and ``time_delta`` (list of deltas, same length as ``sequence``).
        Sessions with fewer than two interactions produce no rows, and rows
        whose session id is missing are ignored. An input without any usable
        session yields an empty frame with the same three columns.
    """
    records = []
    # One pass over the groups instead of re-filtering the whole frame for each
    # session. sort=False keeps the first-appearance order of the sessions.
    for session, session_df in df.groupby('session', sort=False):
        sequence = session_df['full_label_num'].tolist()
        time_delta = session_df['interaction_time_delta'].tolist()

        # A sequence needs at least one input item and one target. Guarding
        # here (rather than looping "until length is 1") also rules out the
        # endless loop the countdown ran into when a session had no rows.
        if len(sequence) < 2:
            continue

        if sequence_augmentation:
            # Full sequence first, then each shorter prefix down to length 2.
            for end in range(len(sequence), 1, -1):
                records.append({
                    'session': session,
                    'sequence': sequence[:end],
                    'time_delta': time_delta[:end],
                })
        else:
            records.append({
                'session': session,
                'sequence': sequence,
                'time_delta': time_delta,
            })

    # Explicit columns so the result has the expected schema even when empty.
    return pd.DataFrame(records, columns=['session', 'sequence', 'time_delta'])

# Only session, item id and time delta are needed to build the sequences.
# NOTE: `test_df` is rebound to the held-out split later in the notebook; at
# this point it is just the trimmed interaction table.
test_df = filt_df.drop(columns=['datetime', 'vehicle_num'])
df_exploded = sequence_generation(test_df, sequence_augmentation)

# Drop the leading delta (the 0 filled in for a session's first interaction) so
# the deltas line up one-to-one with the input items below.
df_exploded['time_delta_list'] = df_exploded['time_delta'].apply(lambda x: x[1:] if isinstance(x, list) and len(x) > 1 else x)
df_exploded['interaction_time_delta_train'] = df_exploded['time_delta_list'].apply(lambda x: ' '.join(map(str, x)) if isinstance(x, list) else x)
# Input = every item except the last one, space-separated; target = the last item.
df_exploded['item_id_seq_train'] = df_exploded['sequence'].apply(lambda x: ' '.join(map(str, x[:-1])) if isinstance(x, list) and len(x) > 1 else None)
df_exploded['item_id_target'] = df_exploded['sequence'].apply(lambda x: x[-1] if isinstance(x, list) and len(x) > 0 else None)
# .copy() gives an independent frame, so the column assignment below cannot
# trigger a chained-assignment warning on the filtered result.
df_exploded = df_exploded.dropna(subset=['item_id_target']).copy()
df_exploded['item_id_target'] = df_exploded['item_id_target'].astype(int)
df_exploded = df_exploded.drop(columns=['sequence', 'time_delta', 'time_delta_list'])

In [7]:
# Preview the generated rows: time deltas, input item sequence and target item per session.
df_exploded

Unnamed: 0,session,interaction_time_delta_train,item_id_seq_train,item_id_target
0,16.0,10 21 19,3 2 3,3
1,25.0,0,4,3
2,33.0,0,5,6
3,35.0,0,5,7
4,40.0,0,5,8
...,...,...,...,...
949,5341.0,4 7,3 3,9
950,5346.0,37 0 23,470 475 470,3
951,5364.0,23 0 12,3 5 7,3
952,5387.0,0,470,9


In [11]:
def _finalize_split(split_df):
    """Return a split in its on-disk layout.

    The original ``session`` column is dropped, a consecutive ``session_id``
    (0..n-1, in row order) is added, and the columns are put in export order.
    """
    finalized = split_df.drop(columns=['session'])
    finalized['session_id'] = range(len(finalized))
    finalized['session_id'] = finalized['session_id'].astype(int)
    return finalized[['session_id', 'item_id_seq_train', 'item_id_target', 'interaction_time_delta_train']]


if sequence_augmentation:
    # Augmented rows of one session are prefixes of each other, so the split is
    # done on session ids to keep each session entirely on one side.
    total_sessions = df_exploded.session.unique().tolist()
    # NOTE: train_test_split returns (train part, test part). With test_size=0.8
    # the 80% "test part" is deliberately unpacked into `train_sessions` and the
    # 20% "train part" into `test_sessions`, i.e. an 80/20 train/test split.
    # Left unchanged so the partition produced with random_state=42 stays the same.
    test_sessions, train_sessions = train_test_split(total_sessions, test_size=0.8, shuffle=True, random_state=42)
    train_df = df_exploded[df_exploded['session'].isin(train_sessions)].sort_index()
    test_df = df_exploded[df_exploded['session'].isin(test_sessions)].sort_index()
else:
    # One row per session, so a row-wise 80/20 split is already a session split.
    train_df, test_df = train_test_split(df_exploded, test_size=0.2, shuffle=True, random_state=42)
    train_df = train_df.sort_index()
    test_df = test_df.sort_index()
    train_sessions = train_df['session'].unique().tolist()
    test_sessions = test_df['session'].unique().tolist()

# Persist which original sessions ended up in which split.
sessions_dir = '../data/05_Interaction_Sequences'
os.makedirs(sessions_dir, exist_ok=True)  # do not fail on a fresh checkout
for file_name, sessions in (('train_sessions.pkl', train_sessions), ('test_sessions.pkl', test_sessions)):
    with open(os.path.join(sessions_dir, file_name), 'wb') as pickle_file:
        pickle.dump(sessions, pickle_file)

train_df = _finalize_split(train_df)
test_df = _finalize_split(test_df)

# The augmented variant is written to an "aug" sub-directory of the dataset folder.
dataset_dir = '../datasets/sequential/carsii_timedelta_rand_seq'
output_dir = os.path.join(dataset_dir, 'aug') if sequence_augmentation else dataset_dir
os.makedirs(output_dir, exist_ok=True)
test_df.to_csv(os.path.join(output_dir, 'test.tsv'), sep='\t', index=False)
train_df.to_csv(os.path.join(output_dir, 'train.tsv'), sep='\t', index=False)