### Install required dependencies

In [None]:
%%capture
!pip install tqdm
!pip install seaborn

### Import dependencies

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
import traceback
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Show every row and column when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# Turn off pandas' chained-assignment check for this notebook.
pd.set_option('mode.chained_assignment', None)

### Initialize File Paths

In [None]:
# The input draft and the output written by this notebook share one directory.
_data_dir = '/Users/sauravyadav/Documents/Repos/Datasets/BAR_EMA/output_data'
input_file_path = f'{_data_dir}/data_draft.csv'
output_file_path = f'{_data_dir}/data_draft_v2.csv'

In [None]:
# Load the draft CSV into a DataFrame and preview its first three rows.
df = pd.read_csv(filepath_or_buffer=input_file_path)
df.head(n=3)

### TO DO
- Fill in Daytime Survey values
- MR_DrnkDur Column
- MR_wake

### 1. Filling Daytime Survey Values

In [None]:
# Fill missing values in each 'Daytime Surveys' row from the 'Morning Reports'
# row that shares its SubDate.
#
# The two survey subsets are selected once up front. Only 'Daytime Surveys'
# rows are written to inside the loop, so the 'Morning Reports' snapshot stays
# valid for the whole pass.
daytime_rows = df[df['SurvName'] == 'Daytime Surveys']
morning_rows = df[df['SurvName'] == 'Morning Reports']

# tqdm wraps the iterator itself, so the bar advances for every row --
# including rows skipped below because no morning report exists -- and it
# closes on its own once the iterator is exhausted.
progress_bar = tqdm(daytime_rows.iterrows(), total=len(daytime_rows))

for index, row in progress_bar:
    same_day_reports = morning_rows[morning_rows['SubDate'] == row['SubDate']]
    if same_day_reports.empty:
        continue

    # Assuming that in a given day only one morning survey is available. Is this a sound assumption?
    # NOTE(review): rows are matched on SubDate alone; if the data holds more
    # than one participant per date, the first match may belong to someone
    # else -- confirm against the dataset schema.
    morning_report = same_day_reports.iloc[0]

    # `row` is the snapshot taken before any write, so only the columns that
    # were missing in the original Daytime row get overwritten.
    for column in row[row.isna()].index:
        df.loc[index, column] = morning_report[column]

### Normalizing MR_DrnkDur

In [None]:
# A number followed by an hour unit ("2 hours", "1.5 hrs"), or a bare leading
# number optionally followed by a second bare number ("2", "2 30").
_HOURS_PATTERN = re.compile(
    r'(\d+(?:\.\d+)?)\s*(?:hours?|hrs?)'
    r'|^(\d+(?:\.\d+)?)(?:\s+\d+(?:\.\d+)?)?$',
    re.IGNORECASE,
)

# A number followed by a minute unit ("30 min", "30 minutes"), or a trailing
# bare number preceded by whitespace ("2 30", "2 hours 30").
_MINUTES_PATTERN = re.compile(
    r'(\d+(?:\.\d+)?)\s*min'
    r'|\s+(\d+(?:\.\d+)?)$',
    re.IGNORECASE,
)


def normalize_duration(duration_str):
    """Normalize a free-text duration to ``'<H> hours <M> minutes'``.

    Args:
        duration_str: Raw cell value. Strings such as ``'2 hours 30 minutes'``,
            ``'1 hr'``, ``'45 min'``, ``'2 30'`` or ``'3'`` are recognised;
            units are matched case-insensitively and surrounding whitespace
            is ignored. A bare number is read as hours. Decimal values are
            read as decimal quantities (``'1.5 hours'`` is 1 hour 30 minutes).
            Non-string values are converted with ``str()`` before parsing.

    Returns:
        ``'CONDITION_SKIPPED'`` when the value is missing (NaN/None/NA), falsy
        (empty string, numeric zero), blank, or one of the literal strings
        ``'CONDITION_SKIPPED'`` / ``'nan'``. Otherwise a string of the form
        ``'<H> hours <M> minutes'``; a component that cannot be found in the
        text is reported as 0. Minutes are not carried over into hours, so
        ``'90 min'`` yields ``'0 hours 90 minutes'``.
    """
    # pd.isna goes first: truth-testing pd.NA raises a TypeError.
    if pd.isna(duration_str) or not duration_str:
        return 'CONDITION_SKIPPED'

    text = str(duration_str).strip()
    if text in ('', 'CONDITION_SKIPPED', 'nan'):
        return 'CONDITION_SKIPPED'

    hours = 0
    minutes = 0

    hours_match = _HOURS_PATTERN.search(text)
    if hours_match:
        token = hours_match.group(1) or hours_match.group(2)
        if '.' in token:
            # Decimal hours: the fractional part becomes minutes.
            hours, minutes = divmod(round(float(token) * 60), 60)
        else:
            hours = int(token)

    minutes_match = _MINUTES_PATTERN.search(text)
    if minutes_match:
        token = minutes_match.group(1) or minutes_match.group(2)
        minutes += round(float(token)) if '.' in token else int(token)

    return f'{hours} hours {minutes} minutes'

# Run every MR_DrnkDur entry through normalize_duration and store the result
# back in the same column.
df['MR_DrnkDur'] = df['MR_DrnkDur'].map(normalize_duration)

In [None]:
# Display the DataFrame's (row count, column count).
df.shape

### Plot MR Time Columns

In [None]:

# time_columns = ['MR_wake', 'MR_rise', 'MR_bed']
# plt.figure(figsize=(8, 6))
# hour_labels = [f"{i:02d}:00:00" for i in range(25)]

# # Create a Seaborn color palette for multiple columns
# palette = sns.color_palette("hls", len(time_columns))

# for index, time_column in enumerate(time_columns):
#     # Plot the frequencies for each time column
#     temp_df = df[time_column].value_counts().reset_index().sort_values(by=[time_column]).reset_index(drop=True)
#     sns.barplot(x=temp_df[time_column], y=temp_df['count'], label=f'{time_column} vs. Frequency', color=palette[index])

# plt.xlabel('Time')
# plt.ylabel('Frequency')
# plt.title('Time vs. Frequency for MR time columns')

# # Show a legend to distinguish each line
# plt.legend()

# # Set x-tick positions and labels
# x_ticks = np.arange(0, len(hour_labels))
# plt.xticks(x_ticks, hour_labels, rotation=45)

# # Show the plot
# plt.tight_layout()
# plt.show()


In [None]:
# Write the updated DataFrame to the output CSV, leaving out the index column.
df.to_csv(path_or_buf=output_file_path, index=False)