# Raw Data Check
Check the raw data (parsed csv files) and correct recording mistakes.

In [None]:
import glob
from datetime import datetime, timedelta, timezone
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

import sys
sys.path.append("../") # Set parent directory to sys.path
sys.dont_write_bytecode = True
%load_ext autoreload
%autoreload 2
import src.utils as utils

# Sampling rate passed to the src.utils loaders/correctors in the cells below
# (presumably samples per second of the accelerometer -- TODO confirm).
ACC_SAMPLING_RATE = 25

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Okabe-Ito colour-blind-safe palette.
_okabe_ito_hex = [
    '#E69F00',  # orange
    '#56B4E9',  # sky blue
    '#009E73',  # bluish green
    '#F0E442',  # yellow
    '#0072B2',  # blue
    '#D55E00',  # vermillion
    '#CC79A7',  # reddish purple
    '#000000',  # black
]
palette = sns.color_palette(_okabe_ito_hex)
sns.set_palette(palette)
sns.set_theme(
    context='poster',
    style='ticks',
    palette=palette,
    font_scale=1.0,
)
# Render the palette swatches as the cell output.
display(sns.color_palette(palette))

## Load data

### Path to logdata.csv

In [None]:
# Raw data directory
# NOTE: this is a machine-specific absolute path; switch to the commented
# alternative below when running on another machine.
base_dir = "C:/Users/ryoma/D/logbot-data/umineko/Umineko2024"
# base_dir = "../path_to_base_dir" # Change the path appropriately
# Sub-directory of base_dir that holds one folder per logger (test_id).
test_dir = "v5-umineko-2024-playback"

# Select the logger to process: keep exactly one assignment uncommented.
test_id = "LBP00"
# test_id = "LBP01"
# test_id = "LBP02" # no video data
# test_id = "LBP03"
# test_id = "LBP04" # no video data
# test_id = "LBP05"
# test_id = "LBP06"
# test_id = "LBP07"
# test_id = "LBP08"
# test_id = "LBP09"

# Optional interactive selection (disabled): builds "LBPxx" from a typed number.
# print("Select test id: e.g. 0 -> LBP00, 1 -> LBP01, ..., 12 -> LBP12")
# test_id = f"LBP{input().zfill(2)}"

print(f"test_id: {test_id}")

In [None]:
# Inspect the raw-data folder of the selected logger before loading anything.
# (Presumably lists/validates the files under base_dir/test_dir/test_id* --
# see src.utils.scan_raw_data_dir for the exact behavior.)
utils.scan_raw_data_dir(base_dir, test_dir, test_id)

### Load the logdata.csv

In [None]:
# Locate the parsed logdata.csv of the selected logger. The logger's folder
# name starts with test_id, hence the trailing wildcard in the pattern.
target_path = f"{base_dir}/{test_dir}/{test_id}*/logdata/logdata.csv"
print(f"target_path:\n{target_path}")

# Sort the matches so the chosen file does not depend on directory listing order.
matched_paths = sorted(glob.glob(target_path))
if not matched_paths:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No logdata.csv matches '{target_path}'. "
        "Check base_dir, test_dir and test_id."
    )
if len(matched_paths) > 1:
    print(f"Warning: {len(matched_paths)} files matched the pattern; using the first one.")

path = matched_paths[0]
print(f"csv_file_path:\n{path}")
df = utils.load_logdata_csv_file(path, ACC_SAMPLING_RATE)

## Correct raw data

### Correct camera_count
To reduce battery consumption, the logger goes to sleep (is turned off) during nighttime and restarts in the morning. When the logger restarts, the program initializes all global variables, including camera_count. We therefore need to correct the camera_count data. This issue could be avoided by saving camera_count before the logger goes to sleep and loading it after rebooting; we implemented this feature after the 2024 field experiment.

In [None]:
# Check camera_count data before the correction
# (output is to be compared with the same check after the correction).
utils.count_camera_recording_and_playback_sessions(df)

In [None]:
# Correct camera_count, which restarts after each overnight sleep/reboot of
# the logger. The corrected dataframe replaces df.
# show_head_tail=False presumably suppresses preview output -- see src.utils.
df = utils.correct_camera_count(df, ACC_SAMPLING_RATE, show_head_tail=False, test_id=test_id)

In [None]:
# Check camera_count data again, now on the corrected dataframe.
utils.count_camera_recording_and_playback_sessions(df)

### Delete unwanted data

There were cases where logging continued after retrieval (recapturing the bird) because the battery still had remaining power.
In such cases:
1. Fetch the retrieval time in JST from the metadata file.
2. Set the target time to 10 minutes before the retrieval time (JST); subtracting 10 minutes adds an extra safety margin.
3. Convert the target JST time into UTC time
4. Obtain the index corresponding to the target time.
5. Delete all data from the target time index onward.

In [None]:
# Load the recapture (retrieval) metadata. The next cell reads the columns
# 'test_id', 'recapture_date_jst' and 'recapture_time_jst' from this table.
metadata_path = "../data/metadata/recapture_time.csv"
df_meta = pd.read_csv(metadata_path)
display(df_meta)

In [None]:
# Look up the metadata row of the selected logger.
idx = df_meta[df_meta['test_id'] == test_id].index
# idx = df_meta[df_meta['test_id'] == "LBP07"].index
if len(idx) == 0:
    # Fail early with a clear message instead of an IndexError further down.
    raise ValueError(f"test_id '{test_id}' was not found in the recapture metadata (df_meta)")

# If the test_id appears more than once, the first matching row is used.
recapture_date_jst = df_meta.loc[idx[0], 'recapture_date_jst']
recapture_time_jst = df_meta.loc[idx[0], 'recapture_time_jst']
print(idx)
print(recapture_date_jst)
print(recapture_time_jst)

jst_timezone = timezone(timedelta(hours=+9))  # JST timezone object
utc_timezone = timezone.utc  # UTC timezone object

# 1. fetch the retrieval time (JST) from the metadata file
# The metadata is parsed as day/month/year followed by HH:MM:SS.
recapture_timestamp_str = f"{recapture_date_jst} {recapture_time_jst}"
recapture_timestamp = datetime.strptime(recapture_timestamp_str, '%d/%m/%Y %H:%M:%S')
recapture_timestamp_jst = recapture_timestamp.replace(tzinfo=jst_timezone)  # set JST timezone

# 2. timestamp of 10 minutes before the retrieval time (safety margin)
delta_t_min = 10
timestamp_jst = recapture_timestamp_jst - timedelta(minutes=delta_t_min)

# 3. convert JST to UTC (the rtc_* columns are matched against these UTC values below)
timestamp_utc = timestamp_jst.astimezone(utc_timezone)

# 4. separated data: individual fields used to find the matching RTC row
year = timestamp_utc.year
month = timestamp_utc.month
day = timestamp_utc.day
hour = timestamp_utc.hour
minute = timestamp_utc.minute
second = timestamp_utc.second

# Print the results
print(f"1. collection datetime (JST): {recapture_timestamp_jst}")
print(f"2. collection datetime - {delta_t_min} min (JST): {timestamp_jst}")
print(f"3. collection datetime - {delta_t_min} min (UTC): {timestamp_utc}")
print("4. UTC target time:")
print("   - Year:", year)
print("   - Month:", month)
print("   - Day:", day)
print("   - Hour:", hour)
print("   - Minute:", minute)
print("   - Second:", second)



In [None]:
# Find the row whose RTC fields equal the UTC target time exactly
# (rtc_msec == 0 selects the first sample of that second).
target_mask = (
    (df['rtc_year'] == year) & (df['rtc_month'] == month) & (df['rtc_day'] == day) &
    (df['rtc_hour'] == hour) & (df['rtc_min'] == minute) & (df['rtc_sec'] == second) &
    (df['rtc_msec'] == 0)
)
unwanted_first_index = df[target_mask].index
print(len(unwanted_first_index))
print(unwanted_first_index)

if len(unwanted_first_index) > 0:
    print(unwanted_first_index[0])
    # Cut by position, not by index label: df[:label] is a positional slice and
    # only equals "everything before the matched row" for a default RangeIndex.
    first_unwanted_pos = int(np.flatnonzero(target_mask.to_numpy())[0])
    df = df.iloc[:first_unwanted_pos]  # remove unwanted data
else:
    # Logging ended before the target time, or that exact sample is missing.
    print("No row matches the UTC target time; no data was removed.")

## Correct duplicated RTC timestamps

There are cases where the RTC time overlapped.  
Data with the exact same timestamp appeared in the csv file.  
(e.g., measurement or logging error at timing transitions, such as when the minute changes.)  
Here, we handle such cases.

In [None]:
# Add UTC, JST, and unixtime columns to the dataframe
df_time = utils.rtc_data_time_to_timestamp_and_unixtime(df)
display(df_time.head(3))
# NOTE: this NaN count is not shown in the notebook because it is not the
# last expression of the cell; wrap it in print() to see the value.
df_time['rtc_year'].isna().sum()

# First, check how often RTC time duplication occurred during video recording
# (all rows first, then only the rows where the camera was commanded or recording)
duplicated_list, duplicated_index_list = utils.check_duplicated_timestamp(df_time)
df_time_cam_rec = df_time[ (df_time['camera_command'] == 1) | (df_time['camera_recording'] == 1)]
duplicated_list2, duplicated_index_list2 = utils.check_duplicated_timestamp(df_time_cam_rec)
if len(duplicated_index_list2) > 0: 
    print(duplicated_index_list2)

# Exception handler
# If RTC time duplication occurred during video recording, 
# -> correct the data using the below exception handler
# (only LBP03 has a hand-written correction; other test_ids fall through
#  to the recomputation and the plain drop below)
if np.sum(duplicated_list2) > 0:
    print("Exception handler")
    # print(duplicated_list)
    if test_id == "LBP03":
        # search UTC 19:54:00
        # NOTE(review): the query does not filter by date, so it matches
        # 19:54:00 on every recorded day -- confirm this time occurs only once.
        indices = df_time.query('rtc_hour == 19 and rtc_min == 54 and rtc_sec == 0').index
        print(indices)
        print(len(indices))
        # Rows from the 26th match onward are relabelled as minute 55.
        # The hard-coded 25 presumably equals ACC_SAMPLING_RATE (one second of
        # samples), i.e. the second block of 19:54:00 is really 19:55:00.
        for index in indices[25:]:
            df_time.loc[index, 'rtc_min'] = 55
        print(f"test_id: {test_id} exception corrected")

        # UTC 19:55:42
        # Start of the range to fix: the 26th row stamped 19:55:42.
        indices = df_time.query('rtc_hour == 19 and rtc_min == 55 and rtc_sec == 42').index
        correct_start_idx = indices[25]

        # End of the range to fix: one row past the last row stamped 19:56:54.
        indices = df_time.query('rtc_hour == 19 and rtc_min == 56 and rtc_sec == 54').index
        correct_end_idx = indices[-1] + 1 # add 1 to the last data index of LBP03_S00

        # Source range: the same span shifted ACC_SAMPLING_RATE rows later.
        replace_start_index = correct_start_idx + ACC_SAMPLING_RATE
        replace_end_index = correct_end_idx + ACC_SAMPLING_RATE

        # Overwrite the RTC fields of the target range with those of the rows
        # ACC_SAMPLING_RATE later. .loc slices include both endpoints, so the
        # two ranges have the same length.
        # Assumes a contiguous integer index (labels == row positions) -- TODO confirm.
        for col in ['rtc_year', 'rtc_month', 'rtc_day', 'rtc_hour', 'rtc_min', 'rtc_sec','rtc_msec',]:
            df_time.loc[correct_start_idx:correct_end_idx, col] = df_time.loc[replace_start_index:replace_end_index, col].values

    # Update the timestamps again, and obtain duplicated_index_list to check the results
    df_time = utils.rtc_data_time_to_timestamp_and_unixtime(df_time)
    duplicated_list, duplicated_index_list = utils.check_duplicated_timestamp(df_time)

# RTC time duplication outside of video recording is not relevant to data analysis.
# Therefore, simply drop the second occurrence of the duplicate.
print("Simply drop duplicated rows")
df = utils.drop_duplicated_timestamp(df_time, duplicated_index_list)

In [None]:
# Re-run the duplicate check on the cleaned dataframe to verify the result.
duplicated_list, duplicated_index_list = utils.check_duplicated_timestamp(df)

## Save the corrected df

In [None]:
# Destination of the corrected data: one csv file per test_id.
save_dir = "../data/corrected-raw-data"
save_path = f"{save_dir}/{test_id}.csv"
print(save_path)
# NOTE: saving is disabled; uncomment the next line to actually write the file.
# df.to_csv(save_path, index=False)

# Corrected Data Check

In [None]:
# Directory holding the corrected csv files written by the section above.
input_dir = "../data/corrected-raw-data"

# Select the logger to inspect: keep exactly one assignment uncommented.
test_id = "LBP00"
# test_id = "LBP01"
# test_id = "LBP02" # no video data
# test_id = "LBP03"
# test_id = "LBP04" # no video data
# test_id = "LBP05"
# test_id = "LBP06"
# test_id = "LBP07"
# test_id = "LBP08"
# test_id = "LBP09"

# Build the path from input_dir (not save_dir) so this section also runs in a
# fresh kernel without executing the correction cells first.
input_path = f"{input_dir}/{test_id}.csv"
df = pd.read_csv(input_path)

In [None]:
# Preview the first and the last five rows of the reloaded dataframe.
for preview in (df.head(5), df.tail(5)):
    display(preview)

## Battery levels

In [None]:
# Total number of rows.
print(len(df))

# Keep only the first sample of every minute (rtc_sec == 0 and rtc_msec == 0).
is_minute_start = (df["rtc_sec"] == 0) & (df["rtc_msec"] == 0)
df_battery = df.loc[is_minute_start]
print(len(df_battery))

In [None]:
# Plot the battery level over the deployment.
# NOTE(review): the full df is passed here, not the df_battery subset built in
# the previous cell -- confirm which one plot_battery_level expects.
fig = utils.plot_battery_level(df)

## SD write time delay check 

In [None]:
# SD write delay 
# Two plot types are available; the boxplot variant is kept for reference.
# fig = utils.plot_sd_write_time(df, "boxplot")
fig = utils.plot_sd_write_time(df, "scatter")
# Additional check of the SD write delay (see src.utils.sd_write_delay_check).
utils.sd_write_delay_check(df)

## RTC and GPS time check
Check the discrepancy between RTC time and time based on GPS data.

In [None]:
# Check the gap between RTC time and GPS time
# (see src.utils.plot_rtc_and_gps_data_gap for how the gap is computed).
fig = utils.plot_rtc_and_gps_data_gap(df)