# Dataset Preparation for Deep Learning Models
1. RAW data (.csv) to Preprocessed data (.csv)  
2. Preprocessed data (.csv) to extracted time window files (.npz)  
    * 2a. save labelled data
    * 2b. save unlabelled data

In [None]:
import os
import glob
import pandas as pd
from IPython.display import display

import sys
sys.path.append("../") # Set parent directory to sys.path
sys.dont_write_bytecode = True
%load_ext autoreload
%autoreload 2
from src.data_preprocess_logbot import (
    get_raw_date_information,
    read_raw_data_and_refine_timestamp,
    divide_df_if_timestamp_gap_detected,
    run_resampling_and_concat_df,
    preprocess_sensor_data,
    save_preprocessed_data,
    extract_sliding_windows,
    extract_sliding_windows_v2,
    save_labelled_windows_as_npz,
    get_shuffled_list,
    save_blocks_of_windows_as_npz,
)
from src import utils
# Call the shared plotting setup with show_color_palette=False and unpack
# the three values it returns.
plot_parameters, okabe_ito_color_list, tol_bright_color_list = utils.setup_plot(
    show_color_palette=False
)

In [None]:
# Root directory of the logbot datasets; the input and output paths used by
# the cells of this notebook are built from it.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
dataset_dir = "/home/bob/protein/dl-wabc/data/datasets/logbot_data"

In [None]:
# Dry-run switch. While this is True, the processing cells only print a
# notice instead of calling the save_* helpers.
# Set it to False to actually save the data.
debug_test_mode = True

## 1. Raw Data to Preprocessed Files

In [None]:
# Species whose raw CSV files are listed in the next cell.
# Alternative value: "omizunagidori"
species = "umineko"

In [None]:
# Base output directory for the preprocessed data.
output_dir_path = f"{dataset_dir}/preprocessed_data/"
print("output directory:", output_dir_path)
print(" ")

# Collect the raw CSV files of the selected species in sorted order.
# NOTE: glob is called without recursive=True, so `**` matches like a single
# `*` here (files directly inside the species directory).
path_target = f"{dataset_dir}/raw_data/{species}/**.csv"
raw_data_path_list = glob.glob(path_target)
raw_data_path_list.sort()
print(f"N of raw data csv files: {len(raw_data_path_list)}")
for i, raw_data_path in enumerate(raw_data_path_list):
    raw_file_name = os.path.basename(raw_data_path)
    print(f"{i:0=2}: {raw_file_name}")

# Load the animal ID table and show its first five rows.
animal_id_path = f"{dataset_dir}/id_files/v1.0.0/animal_id.csv"
print("animal_id_path:", animal_id_path)
df_animal_id = pd.read_csv(animal_id_path)
display(df_animal_id.head(5))

In [None]:
# Preprocess every raw CSV file found above:
# read -> split at timestamp gaps -> resample and concatenate -> preprocess
# -> save (saving is skipped while debug_test_mode is on).
for raw_data_path in raw_data_path_list:
    # Per-file metadata, obtained from the raw file path and the animal ID file.
    # NOTE: this rebinds the notebook-level `species` on every iteration.
    (
        species,
        year,
        animal_tag,
        animal_id,
        acc_sampling_rate,
        correct_timestamp,
        back_mount,
    ) = get_raw_date_information(raw_data_path, animal_id_path)

    df = read_raw_data_and_refine_timestamp(raw_data_path, correct_timestamp)

    # Note (kept from the original author): this will be 25*5 = 125 (120 + 5)
    # min gap in divide_df_if_timestamp_gap_detected.
    # NOTE(review): how gap_min_limit=5 maps to that figure is not visible
    # from this notebook -- confirm against the helper.
    df_list = divide_df_if_timestamp_gap_detected(
        df,
        acc_sampling_rate,
        gap_min_limit=5,
    )

    df = run_resampling_and_concat_df(
        df_list,
        acc_sampling_rate,
        remove_sec=3,
        check_df=False,
    )

    df = preprocess_sensor_data(
        df,
        clipping=True,
        clipping_threshold=8,
        method="none",
        check_df=False,
    )

    # Test the flag by truthiness instead of comparing it with `== True`.
    if debug_test_mode:
        print("| debug mode -> do not save data |")
    else:
        save_preprocessed_data(df, output_dir_path, species, animal_id)

print("-----------------------------------")
print("raw data preprocessing completed !")
print("-----------------------------------")

## 2. Preprocessed CSV files to NPZ files
Extract sliding windows from the preprocessed CSV files and save them as .npz files.

### 2a. Extract labelled data

In [None]:
# Species whose preprocessed CSV files are used for labelled window
# extraction (section 2a).
# Alternative value: "omizunagidori"
species = "umineko"

In [None]:
# List the preprocessed CSV files of the selected species in sorted order.
# NOTE: glob is called without recursive=True, so `**` matches like a single
# `*` here (files directly inside the species directory).
target_path = f"{dataset_dir}/preprocessed_data/{species}/**.csv"
preprocessed_data_path_list = sorted(glob.glob(target_path))
print("input_dir: ")
# enumerate replaces the hand-maintained counter; the printed lines are unchanged.
for counter, preprocessed_data_path in enumerate(preprocessed_data_path_list):
    print(str(counter).zfill(2), ": ", os.path.basename(preprocessed_data_path))
# The label now names the list that is actually measured.
print("Length of preprocessed_data_path_list", len(preprocessed_data_path_list))

In [None]:
# Extract sliding windows from each preprocessed CSV file and save the
# labelled ones as .npz files, one output directory per animal ID.
labelled_data_base_dir = f"{dataset_dir}/npz_format/labelled/{species}/"

print("Extract sliding windows from preprocessed data (.csv) and save them as .npz files")
for preprocessed_data_path in preprocessed_data_path_list:
    print("-----------------------------------------------------------------------")
    # The file name without its extension serves as the animal ID.
    # splitext strips only the trailing extension, whereas
    # str.replace(".csv", "") would also remove ".csv" in the middle of a name.
    animal_id = os.path.splitext(os.path.basename(preprocessed_data_path))[0]
    print(animal_id, end=": ")

    # Extract windows (window size 50, step 25).
    (
        X_list,
        label_id_list,
        timestamp_list,
        labelled_flag_list,
        labelled_X_list,
        labelled_label_id_list,
        labelled_timestamp_list,
        timestamp_gap_idx_list,
    ) = extract_sliding_windows(
        preprocessed_data_path,
        sliding_window_size=50,
        sliding_window_step_size=25,
    )
    print(f"N of extracted windows: {len(X_list)}")
    print(f"N of labelled windows:  {len(labelled_X_list)}")
    print(f"N of timestamp gaps:    {len(timestamp_gap_idx_list)}")

    # Files that produced no labelled window are skipped.
    # len() is kept (rather than truthiness) because the container type
    # returned by the helper is not visible from this notebook.
    if len(labelled_X_list) > 0:
        npz_file_dir = labelled_data_base_dir + animal_id + "/"
        print("Saving labelled windows as npz ...")

        # Test the flag by truthiness instead of comparing it with `== True`.
        if debug_test_mode:
            print("| debug mode -> do not save data |")
        else:
            save_labelled_windows_as_npz(
                animal_id,
                npz_file_dir,
                labelled_X_list,
                labelled_label_id_list,
                labelled_timestamp_list,
            )

print("----------------------------------------")
print("Labelled window extraction completed !")
print("----------------------------------------")

### 2b. Extract unlabelled data v2

In [None]:
# Species whose preprocessed CSV files are used for the shuffled window
# extraction (section 2b).
# Alternative value: "omizunagidori"
species = "umineko"

In [None]:
# List the preprocessed CSV files of the selected species in sorted order.
# NOTE: glob is called without recursive=True, so `**` matches like a single
# `*` here (files directly inside the species directory).
target_path = f"{dataset_dir}/preprocessed_data/{species}/**.csv"
preprocessed_data_path_list = sorted(glob.glob(target_path))
print("input_dir: ")
# enumerate replaces the hand-maintained counter; the printed lines are unchanged.
for counter, preprocessed_data_path in enumerate(preprocessed_data_path_list):
    print(str(counter).zfill(2), ": ", os.path.basename(preprocessed_data_path))
# The label now names the list that is actually measured.
print("Length of preprocessed_data_path_list", len(preprocessed_data_path_list))

In [None]:
# Extract sliding windows from each preprocessed CSV file, shuffle them with
# a fixed seed, and save all of them as .npz files, one output directory per
# animal ID.
unlabelled_data_base_dir = f"{dataset_dir}/npz_format/shuffled_20_v2/{species}/"
print("Extract sliding windows from preprocessed data (.csv) and save them as .npz files")

# Loop-invariant setting, hoisted out of the loop (1 file = 20 windows).
num_windows_per_npz_file = 20

for preprocessed_data_path in preprocessed_data_path_list:
    print("-----------------------------------------------------------------------")
    # The file name without its extension serves as the animal ID.
    # splitext strips only the trailing extension, whereas
    # str.replace(".csv", "") would also remove ".csv" in the middle of a name.
    animal_id = os.path.splitext(os.path.basename(preprocessed_data_path))[0]
    print(animal_id, end=": ")

    # Extract windows (window size 50, step 25).
    (
        X_list,
        label_id_list,
        timestamp_list,
        labelled_flag_list,
        labelled_X_list,
        labelled_label_id_list,
        labelled_timestamp_list,
        timestamp_gap_idx_list,
    ) = extract_sliding_windows_v2(
        preprocessed_data_path,
        sliding_window_size=50,
        sliding_window_step_size=25,
    )
    print(f"N of extracted windows: {len(X_list)}")
    print(f"N of labelled windows:  {len(labelled_X_list)}")
    print(f"N of timestamp gaps:    {len(timestamp_gap_idx_list)}")

    # Shuffle the extracted windows; the same seed is used for every file.
    (
        index_list_random,
        X_list_random,
        label_id_list_random,
        timestamp_list_random,
        labelled_flag_list_random,
    ) = get_shuffled_list(
        X_list,
        label_id_list,
        timestamp_list,
        labelled_flag_list,
        random_seed=558,
    )

    # Save all windows as npz.
    npz_file_dir = unlabelled_data_base_dir + animal_id + "/"
    print(f"npz_file_dir: {npz_file_dir}")
    print("Saving all windows as npz ...")
    # Test the flag by truthiness instead of comparing it with `== True`.
    if debug_test_mode:
        print("| debug mode -> do not save data |")
    else:
        save_blocks_of_windows_as_npz(
            num_windows_per_npz_file,
            animal_id,
            npz_file_dir,
            index_list_random,
            X_list_random,
            label_id_list_random,
            timestamp_list_random,
            labelled_flag_list_random,
        )

print("----------------------------------------")
print("Unlabelled window extraction completed !")
print("----------------------------------------")