In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import torch
import pickle

# Where the per-driver cross-validation folds are written.
cross_validation_path = '../datasets/sequential/cross_validation'
# Where the feature-engineered input tables are read from.
path = '../datasets/sequential/featengg'

# Context tables, each stored as <path>/<name>/unnormal/{train,test}.csv.
input_type = [
    'dense_static_context',
    'dynamic_context',
    'static_context',
]
# Dense-static-context columns that are rescaled during normalisation.
dense_static_context_var = [
    'distance_driven_benchmark',
    'soc',
    'time_second',
]
# Every table in load order: the context tables followed by the sequences.
data_type = input_type + ['seq']

# Accumulators filled while loading; position k corresponds to data_type[k].
input_data_train, input_data_test, column_names = [], [], []

# Load the un-normalised train/test CSV of every context table, remembering
# the column layout of each test table for the export step.
for context_name in input_type:
    data_train = pd.read_csv(f'{path}/{context_name}/unnormal/train.csv')
    data_test = pd.read_csv(f'{path}/{context_name}/unnormal/test.csv')
    input_data_train.append(data_train)
    input_data_test.append(data_test)
    column_names.append(data_test.columns.tolist())

# The sequence table is tab-separated and lives directly under <path>/seq.
data_train = pd.read_csv(f'{path}/seq/train.tsv', sep='\t')
data_test = pd.read_csv(f'{path}/seq/test.tsv', sep='\t')
input_data_train.append(data_train)
input_data_test.append(data_test)
column_names.append(data_test.columns.tolist())

# for data in input_data_test:
#     print(data.columns)

# Unique (window_id, car_id) pairs taken from the static-context table
# (position 2); they are used to tag the other tables with a car_id.
window_car_id_train = input_data_train[2][['window_id', 'car_id']].drop_duplicates()
window_car_id_test = input_data_test[2][['window_id', 'car_id']].drop_duplicates()
# Largest window id of the first training table; test ids are shifted past it.
max_window_id = input_data_train[0].window_id.max()

# The static-context table is kept as is; every other table gets its car_id
# through an inner join on window_id.
dataset_train = [
    train if position == 2
    else pd.merge(train, window_car_id_train, on='window_id', how='inner')
    for position, train in enumerate(input_data_train)
]
dataset_test = [
    test if position == 2
    else pd.merge(test, window_car_id_test, on='window_id', how='inner')
    for position, test in enumerate(input_data_test)
]

# Shift the test window ids (in place) so they cannot collide with train ids
# once both splits are stacked.
window_id_offset = int(max_window_id) + 1
for test in dataset_test:
    test['window_id'] += window_id_offset

# Stack train on top of test for every table.
dataset = [
    pd.concat([train, test], axis=0)
    for train, test in zip(dataset_train, dataset_test)
]

# One cross-validation fold is built per distinct car_id.
drivers = dataset[0].car_id.unique().tolist()

def _renumber_windows(frame, sort_keys):
    """Sort ``frame`` by ``sort_keys`` and rebuild its ``window_id`` column.

    The new ``window_id`` is the group number that
    ``groupby(['session', 'window_id']).ngroup()`` assigns to each row; it
    replaces the old column and ends up as the last column of the result.
    """
    ordered = frame.sort_values(by=sort_keys)
    ordered['wind_id'] = ordered.groupby(['session', 'window_id']).ngroup()
    ordered = ordered.drop(columns='window_id')
    return ordered.rename(columns={"wind_id": "window_id"})


# Leave-one-driver-out folds: for each driver, that driver's rows form the
# 'test' split and all remaining rows form the 'train' split.
cross_val_data = {}
for driver in tqdm(drivers):
    data_driver = {}
    for typ, data in zip(data_type, dataset):
        # Dynamic-context rows are additionally ordered by timestamp.
        sort_keys = ['session', 'window_id']
        if typ == 'dynamic_context':
            sort_keys = sort_keys + ['datetime']

        data_test = _renumber_windows(data[data['car_id'] == driver], sort_keys)
        data_train = _renumber_windows(data[data['car_id'] != driver], sort_keys)

        data_driver[typ] = {'train': data_train, 'test': data_test}
    cross_val_data[int(driver)] = data_driver

#Normalisation
def normalize_data(data, columns, scaler):
    """Rescale ``columns`` of ``data`` in place with an already-fitted scaler.

    Args:
        data: DataFrame holding the columns to rescale; it is modified in place.
        columns: List of column labels passed to ``scaler.transform``.
        scaler: Fitted object exposing ``transform(frame)`` and returning one
            scaled column per input column.

    Returns:
        The same ``data`` object, with ``columns`` replaced by the scaled values.
    """
    # Nothing to rescale: avoid calling the scaler on a zero-column frame.
    if len(columns) == 0:
        return data

    # Replace the columns outright instead of writing through ``.loc``.
    # ``data.loc[:, columns] = floats`` sets values into the existing columns,
    # which for integer columns is an incompatible-dtype assignment that
    # recent pandas versions warn about or reject.
    data[columns] = scaler.transform(data[columns])
    return data

# Columns of the dynamic context that are never rescaled.
dynamic_context_excluded = {
    'window_id', 'wind_id', 'session_ids', 'datetime',
    'session_id', 'session', 'car_id',
}

# For every fold, fit a RobustScaler on the training split only and apply it
# to both the training and the test split of that fold.
for driver in tqdm(drivers):
    fold = cross_val_data[driver]
    for context in ['dynamic_context', 'dense_static_context']:
        splits = fold[context]
        train_context = splits['train']
        test_context = splits['test']

        if context == 'dense_static_context':
            columns_to_normalize = dense_static_context_var
        else:
            columns_to_normalize = [
                col for col in train_context.columns
                if col not in dynamic_context_excluded
            ]

        scaler = RobustScaler()
        scaler.fit(train_context[columns_to_normalize])
        splits['train'] = normalize_data(train_context, columns_to_normalize, scaler)
        splits['test'] = normalize_data(test_context, columns_to_normalize, scaler)

def class_weight(data, driver_path, output_root=None):
    """Compute per-class weights from ``item_id_target`` and pickle them.

    Every label observed in ``data['item_id_target']`` gets the weight
    ``len(data) / (n_observed_labels * relative_frequency)``. Labels 0 and 23
    are always forced to weight 0 (presumably padding/special tokens -- TODO
    confirm). The weights are stored as a 1-D tensor indexed by label:
    position ``k`` holds the weight of class ``k``, and labels that do not
    occur in ``data`` get weight 0 instead of shifting the following entries.

    Args:
        data: DataFrame with an ``item_id_target`` column of non-negative
            integer class labels.
        driver_path: Name of the fold sub-directory (e.g. ``"driver_6"``).
        output_root: Directory under which
            ``<driver_path>/parameters/param.pkl`` is written. Defaults to the
            module-level ``cross_validation_path``.

    Returns:
        The weight tensor that was written to disk.

    Raises:
        ValueError: If a label is negative or not an integer value, since it
            could not be used as a position in the weight tensor.
    """
    if output_root is None:
        output_root = cross_validation_path

    class_frequencies = data['item_id_target'].value_counts(normalize=True)
    total_samples = len(data)
    n_observed = len(class_frequencies)

    weights_by_label = {}
    for label, freq in class_frequencies.items():
        index = int(label)
        if index != label or index < 0:
            raise ValueError(
                f"item_id_target label {label!r} is not a non-negative integer"
            )
        weights_by_label[index] = total_samples / (n_observed * freq)
    weights_by_label[0] = 0
    weights_by_label[23] = 0

    # Dense layout: one slot per label id up to the largest one, so a label
    # missing from ``data`` cannot move later weights to the wrong class.
    dense_weights = [0.0] * (max(weights_by_label) + 1)
    for index, weight in weights_by_label.items():
        dense_weights[index] = float(weight)
    class_weights_tensor_list = torch.tensor(dense_weights)

    param_dir = os.path.join(output_root, driver_path, 'parameters')
    os.makedirs(param_dir, exist_ok=True)
    file_path = os.path.join(param_dir, 'param.pkl')
    with open(file_path, 'wb') as f:
        pickle.dump(class_weights_tensor_list, f)
    return class_weights_tensor_list

# Write every fold to <cross_validation_path>/driver_<id>/<table>/<split>.*,
# keeping only the columns the corresponding input test table had.
for driver in tqdm(drivers):
    driver_path = "driver_" + str(int(driver))
    for data in ['train', 'test']:
        for columns, typ in zip(column_names, data_type):
            dir_path = os.path.join(cross_validation_path, driver_path, typ)
            os.makedirs(dir_path, exist_ok=True)
            df = cross_val_data[driver][typ][data][columns]

            # Context tables are comma-separated; nothing else to do for them.
            if typ != 'seq':
                file_path = os.path.join(dir_path, f'{data}.csv')
                df.to_csv(file_path, index=False)
                continue

            # Sequences are tab-separated.
            file_path = os.path.join(dir_path, f'{data}.tsv')
            df.to_csv(file_path, sep='\t', index=False)
            # NOTE(review): the class weights are derived from the held-out
            # ('test') split only -- confirm this is intended rather than the
            # training split.
            if data == 'test':
                class_weight(df, driver_path)

# cross_val_data[6]['static_context']['train']

100%|██████████| 6/6 [00:01<00:00,  3.98it/s]
100%|██████████| 6/6 [00:00<00:00, 20.18it/s]
100%|██████████| 6/6 [00:13<00:00,  2.25s/it]
