In [None]:
import numpy as np
import os
import pandas as pd
import csv
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

In [None]:
# directory to the Sekisui Box
# NOTE: replace the $YOURUSERNAME$ placeholder with your own user name before running;
# every BIOPAC / Doppler path in this notebook is built relative to this folder.
sekisui_dir = r'C:\Users\$YOURUSERNAME$\Box\MIT - Sekisui Collaborative Research'

# directory to the matched output data
# NOTE: replace the $OUTPUT_DATA_DIR$ placeholder; the clip-export cell at the end of the
# notebook creates one sub-folder per BIOPAC clip inside this directory.
data_dir = r'C:\$OUTPUT_DATA_DIR$'

# BIOPAC

## Collect all BIOPAC file paths (depth-first search)

In [None]:
def get_all_BIOPAC():
    """Index every BIOPAC CSV file stored under the shared Sekisui folder.

    For ``Case1`` and ``Case2`` the folder
    ``<sekisui_dir>/<case>/from SH to MIT/Biopac`` is walked depth-first through
    five nested levels (table / subject / year / month / date). Every file in a
    date folder whose name contains ``.csv`` becomes one row of the result.

    Entries at a folder level that are not directories are skipped instead of
    being passed to ``os.listdir`` (which would raise).

    Returns
    -------
    pandas.DataFrame
        One row per BIOPAC file with the columns ``case_id``, ``case_dir``,
        ``table_id``, ``subject_id``, ``year``, ``month``, ``date`` and
        ``file_name``. The frame is empty (but keeps these columns) when no
        file is found.

    Raises
    ------
    OSError
        If a case's Biopac folder does not exist or cannot be listed.
    """
    columns = ['case_id', 'case_dir', 'table_id', 'subject_id', 'year', 'month', 'date', 'file_name']
    # Names of the nested folder levels, outermost first.
    level_names = ['table_id', 'subject_id', 'year', 'month', 'date']
    # Rows are collected in a plain list and turned into a DataFrame once at the
    # end; concatenating inside the loop re-copies the whole frame per file.
    records = []

    def _collect(directory, labels, base):
        """Recurse through the folder levels; ``labels`` holds the folder names seen so far."""
        at_file_level = len(labels) == len(level_names)
        for entry in os.listdir(directory):
            if at_file_level:
                if '.csv' in entry:
                    record = dict(base)
                    record.update(zip(level_names, labels))
                    record['file_name'] = entry
                    records.append(record)
            else:
                entry_path = os.path.join(directory, entry)
                if os.path.isdir(entry_path):
                    _collect(entry_path, labels + [entry], base)

    for case_id in ['Case1', 'Case2']:
        case_dir = case_id + '\\from SH to MIT\\Biopac'
        bp_case_dir = os.path.join(sekisui_dir, case_dir)
        _collect(bp_case_dir, [], {'case_id': case_id, 'case_dir': case_dir})

    return pd.DataFrame(records, columns=columns)


In [None]:
# Crawl the shared folder once and build the table of BIOPAC recordings (one row per CSV file).
df = get_all_BIOPAC()

In [None]:
# Cache the crawl result on disk; the following sections reload it from this file.
df.to_csv('biopac_search.csv', index=False)

## Process BIOPAC

In [None]:
def parse_time(time):
    """Split the last six characters of ``time`` into three two-character pieces.

    Parameters
    ----------
    time : str
        String ending in a six-character time stamp (presumably ``HHMMSS``).

    Returns
    -------
    tuple of str
        ``(hh, mm, ss)`` taken from positions ``[-6:-4]``, ``[-4:-2]`` and ``[-2:]``.
        Pieces come back shorter (possibly empty) when the input has fewer
        than six characters.
    """
    hour_slice, minute_slice, second_slice = slice(-6, -4), slice(-4, -2), slice(-2, None)
    return time[hour_slice], time[minute_slice], time[second_slice]

In [None]:
def split_biopac_name(biopac_name):
    """Tokenise a BIOPAC file name.

    Removes every ``(`` and ``)``, then every ``.csv``, and splits what is left
    on underscores.

    Parameters
    ----------
    biopac_name : str
        File name of a BIOPAC recording.

    Returns
    -------
    list of str
        The underscore-separated tokens of the cleaned name.
    """
    cleaned = biopac_name
    # Order matters only for pathological names; keep it: parentheses first, extension last.
    for unwanted in ("(", ")", ".csv"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split('_')

def parse_biopac_name(biopac_name):
    """Extract the time stamp pieces from an already tokenised BIOPAC name.

    Parameters
    ----------
    biopac_name : sequence of str
        Tokens as produced by ``split_biopac_name``; the last token carries the time.

    Returns
    -------
    tuple of str
        ``(hh, mm, ss)`` as returned by ``parse_time`` for the last token.
    """
    time_token = biopac_name[-1]
    hours, minutes, seconds = parse_time(time_token)
    return hours, minutes, seconds

In [None]:
def get_action_id(x):
    """Return the id part of the first ``act...`` token.

    Parameters
    ----------
    x : iterable of str
        File-name tokens.

    Returns
    -------
    str or None
        Everything after the first three characters of the first token whose
        first three characters are ``act`` (case-insensitive); ``None`` when no
        token matches.
    """
    matches = (token[3:] for token in x if token[:3].lower() == 'act')
    return next(matches, None)

def get_action(x):
    """Return the action description that follows the leading ``act...`` tokens.

    Parameters
    ----------
    x : sequence of str
        File-name tokens.

    Returns
    -------
    str or None
        All tokens from the first one that does not start with ``act``
        (case-insensitive) to the end, joined with underscores; ``None`` when
        every token starts with ``act`` or the sequence is empty.
    """
    for position, token in enumerate(x):
        if token[:3].lower() == 'act':
            continue
        # First non-'act' token found: everything from here on is the action text.
        return '_'.join(x[position:])

In [None]:
# Reload the cached file index written by the crawl above and preview it.
biopac_df = pd.read_csv('biopac_search.csv')
biopac_df.head()

In [None]:
# Derive house / room / start time / radar id / action columns from the file names.
# The token lists live in a working Series rather than a temporary DataFrame column.
name_tokens = biopac_df['file_name'].apply(split_biopac_name)

# house: first seven characters of the subject id
biopac_df['house'] = biopac_df['subject_id'].apply(lambda subject: subject[:7])

# room: first token in upper case, with the DINNING spelling mapped to DINING
room_names = name_tokens.apply(lambda tokens: tokens[0].upper())
biopac_df['room'] = room_names.apply(lambda room: room if room != 'DINNING' else 'DINING')

# start time: taken from the last token of the file name
clock_parts = name_tokens.apply(lambda tokens: parse_time(tokens[-1]))
biopac_df['hour'], biopac_df['minute'], biopac_df['second'] = zip(*clock_parts)

# keep tokens 1 .. -5 and discard the fixed labels that carry no information
ignored_tokens = ['Biopac', 'biopac', 'Case1', 'Case2']
name_tokens = name_tokens.apply(lambda tokens: [tok for tok in tokens[1:-4] if tok not in ignored_tokens])

# dp_id: second remaining token
biopac_df['dp_id'] = name_tokens.apply(lambda tokens: tokens[1])

# action id and action text come from whatever follows the dp_id
action_tokens = name_tokens.apply(lambda tokens: tokens[2:])
biopac_df['act_id'] = action_tokens.apply(get_action_id)
biopac_df['action'] = action_tokens.apply(get_action)

In [None]:
# Overwrite the cached index so it now also carries the columns parsed from the file names.
biopac_df.to_csv('biopac_search.csv', index=False)

## Find time period

In [None]:
def make_dates_normal(df):
    """Convert the ``year`` / ``month`` / ``date`` columns to strings, zero-padding short values.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year``, ``month`` and ``date`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object, with all three columns as strings and every
        one-character ``month`` / ``date`` value prefixed with ``'0'``.
    """
    def _pad_single_char(value):
        return value if len(value) != 1 else '0' + value

    for column in ('year', 'month', 'date'):
        df[column] = df[column].astype(str)

    for column in ('month', 'date'):
        df[column] = df[column].apply(_pad_single_char)

    return df

In [None]:
# Reload the cached index; year/month/date are then converted to strings
# (month and date zero-padded) by make_dates_normal.
biopac_df = pd.read_csv('biopac_search.csv')
biopac_df = make_dates_normal(biopac_df)

In [None]:
# Maps a '<ROOM>_<house>' key (room name from the file name, house from the subject id)
# to an integer room id. Every room/house combination present in biopac_df must be
# listed here, otherwise the room_id lookup below raises KeyError.
room_map = {
    'BED_pfh_016': 1,
    'BED_pfh_028': 2,
    'BED_pfh_031': 3,
    'STUDY_pfh_031': 4,
    'BED1_pfh_101': 5,
    'BED2_pfh_101': 6,
    'DINING_pfh_101': 7
}

In [None]:
# Translate every recording's '<room>_<house>' key into its integer room id.
# Built in one pass rather than with per-row .loc writes, so it does not depend on
# biopac_df having a default 0..n-1 index and the column gets an integer dtype.
# An unknown room/house combination still raises KeyError, as before.
biopac_df['room_id'] = [
    room_map[room + '_' + house]
    for room, house in zip(biopac_df['room'], biopac_df['house'])
]

In [None]:
def get_number_from_csv(file_path):
    """Read the first value of the row that follows the ``sec`` / ``min`` row.

    The first 7 rows of the file are skipped. The next 7 rows (row indices
    7-13) are searched for a row whose first cell is ``'sec'`` or ``'min'``;
    the first cell of the row right after it is returned as an integer.
    Blank rows inside the search window are ignored.

    Parameters
    ----------
    file_path : str
        Path of a BIOPAC CSV export.

    Returns
    -------
    int or None
        The parsed value, or ``None`` when no ``sec`` / ``min`` row is found in
        the search window, the file ends early, or the following row is empty.

    Raises
    ------
    ValueError
        If the first cell of the following row is not an integer literal.
    """
    first_candidate = 7                    # rows 0-6 are skipped
    last_candidate = first_candidate + 7   # exclusive upper bound of the search window

    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(file_path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        for row_idx, line in enumerate(reader):
            if row_idx < first_candidate:
                continue
            if row_idx >= last_candidate:
                break
            # `line` is [] for a blank row; guard before indexing.
            if line and line[0] in ['sec', 'min']:
                value_row = next(reader, None)
                if not value_row:
                    return None
                return int(value_row[0])
    return None

def get_biopac_path(df, row):
    """Build the full path of the BIOPAC CSV described by one row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``case_id``, ``table_id``, ``subject_id``, ``year``,
        ``month``, ``date`` and a file-name column (``file_name``, or
        ``biopac_file_name`` when ``file_name`` is absent).
    row : index label
        Label of the row to use (passed to ``df.loc``).

    Returns
    -------
    str
        ``<sekisui_dir>/<case>/from SH to MIT/Biopac/<table>/<subject>/<year>/<month>/<date>/<file>``.
    """
    # This helper is defined a second time further down the notebook for frames that
    # name the column 'biopac_file_name'. Accepting either column keeps the cells
    # working regardless of which definition was executed last.
    file_col = 'file_name' if 'file_name' in df.columns else 'biopac_file_name'

    case_id = df.loc[row, 'case_id']
    table_id = df.loc[row, 'table_id']
    subject_id = df.loc[row, 'subject_id']
    year = df.loc[row, 'year']
    month = df.loc[row, 'month']
    # int() removes the zero padding that make_dates_normal adds to the day;
    # presumably the day folders on disk are unpadded -- TODO confirm.
    date = int(df.loc[row, 'date'])
    file_name = df.loc[row, file_col]
    biopac_path = os.path.join(sekisui_dir, case_id, 'from SH to MIT\\Biopac', table_id, subject_id,
                               str(year), str(month), str(date), file_name)
    return biopac_path

In [None]:
def get_rows_to_skip(file_path):
    """Locate the ``sec`` / ``min`` row of a BIOPAC CSV export.

    This gives us a way to open the biopac files in a consistent manner: the
    returned index is meant to be passed as ``skiprows`` to ``pandas.read_csv``.

    The first 7 rows are skipped and rows 7-14 are searched for a row whose
    first cell is ``'sec'`` or ``'min'``. Blank rows are ignored.

    Parameters
    ----------
    file_path : str
        Path of a BIOPAC CSV export.

    Returns
    -------
    int or None
        Zero-based index of the matching row, or ``None`` when no such row is
        found in the search window (including when the file ends early).
    """
    first_candidate = 7    # rows 0-6 are skipped
    last_candidate = 15    # exclusive upper bound of the search window

    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(file_path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        for row_idx, line in enumerate(reader):
            if row_idx < first_candidate:
                continue
            if row_idx >= last_candidate:
                break
            # `line` is [] for a blank row; guard before indexing.
            if line and line[0] in ['sec', 'min']:
                return row_idx
    return None

In [None]:
# One entry per row of biopac_df, in row order. The value is later divided by 250
# to obtain seconds, so it is treated as a sample count.
duration = []

for i in tqdm(range(len(biopac_df))):
    # Retry loop: the same file is re-read until it succeeds or fails with an
    # error other than errno 22.
    # NOTE(review): there is no retry limit, so a persistent errno 22 loops forever.
    while True:
        try:
            csv_path = get_biopac_path(biopac_df, i)
            number = get_number_from_csv(csv_path)
            if number == 0:
                # The value read from the file is 0: fall back to loading the
                # whole table and counting its rows.
                bp_skip = get_rows_to_skip(csv_path)
                bp_df = pd.read_csv(csv_path, skiprows=bp_skip)
                # drop first row of bp_1
                bp_df = bp_df.drop([0])
                number = len(bp_df)
            # NOTE(review): `number` is None when get_number_from_csv finds no
            # 'sec'/'min' row; that None is appended unchanged.
            duration.append(number)
            break
        except OSError as e:
            # errno 22 is retried after a one-second pause (presumably a transient
            # error from the synced Box folder -- confirm); anything else propagates.
            if e.errno != 22:
                raise
            time.sleep(1)

In [None]:
# Attach the collected values; `duration` has one entry per row of biopac_df, in row order.
biopac_df['duration'] = duration

In [None]:
# Rename the parsed clock columns to start_hr / start_min / start_sec (modifies biopac_df in place).
biopac_df.rename(columns={'hour': 'start_hr',
                          'minute':'start_min',
                          'second': 'start_sec'}, inplace=True)

In [None]:
# start_time_idx: start of the recording in seconds since midnight.
biopac_df['start_time_idx'] = biopac_df['start_hr'].astype(int)*3600 + biopac_df['start_min'].astype(int)*60 + biopac_df['start_sec'].astype(int)
# end_time_idx: start plus duration/250 -- i.e. 250 duration units per second
# (presumably the BIOPAC sampling rate in Hz -- confirm).
biopac_df['end_time_idx'] = biopac_df['start_time_idx'] + biopac_df['duration'].astype(int)/250
# Truncate the fractional second so both indices are whole seconds.
biopac_df['end_time_idx'] = biopac_df['end_time_idx'].astype(int)

In [None]:
# Overwrite the cached index so it now includes duration and the start/end time indices.
biopac_df.to_csv('biopac_search.csv', index=False)

# Match doppler

In [None]:
# Reload the cached BIOPAC index and convert year/month/date to (zero-padded) strings.
biopac_df = pd.read_csv('biopac_search.csv')
biopac_df = make_dates_normal(biopac_df)

In [None]:
# doppler_room.csv is expected to provide 'RoomID' and 'RadarID' columns.
doppler_room = pd.read_csv('doppler_room.csv')
# room id -> list of distinct radar ids installed in that room.
# NOTE(review): only room ids 5, 6 and 7 are included; looking up any other room id
# in doppler_map raises KeyError.
doppler_map = {i: doppler_room[doppler_room['RoomID']==i]['RadarID'].unique().tolist() for i in range(5,8)}

In [None]:
def get_dp_folder(df, row, dp_id):
    """Build the folder that holds one radar's Doppler files for the day of a recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``case_id``, ``table_id``, ``year``, ``month`` and ``date`` columns.
    row : index label
        Label of the row to use (passed to ``df.loc``).
    dp_id : str
        Radar id; used as a folder name.

    Returns
    -------
    str
        ``<sekisui_dir>/<case>/from SH to MIT/Doppler/<table>/<dp_id>/<year>/<month>/<date>``.
    """
    def field(column):
        return df.loc[row, column]

    radar_root = os.path.join(sekisui_dir, field('case_id'), 'from SH to MIT\\Doppler', field('table_id'), dp_id)
    day_parts = [str(field(column)) for column in ('year', 'month', 'date')]
    return os.path.join(radar_root, *day_parts)

In [None]:
def parse_doppler_name(name):
    """Read start time and duration from a Doppler file name.

    After removing ``.csv`` the name is split on underscores. The third token
    from the end is read as ``HHMMSS`` and the last token as the duration.

    Parameters
    ----------
    name : str
        Doppler file name.

    Returns
    -------
    tuple of int
        ``(start_time_idx, end_time_idx, duration)`` where ``start_time_idx``
        is seconds since midnight and ``end_time_idx = start_time_idx + duration``.
    """
    fields = name.replace('.csv', '').split('_')
    clock = fields[-3]
    hours, minutes, seconds = int(clock[:2]), int(clock[2:4]), int(clock[4:])
    start_time_idx = hours * 3600 + minutes * 60 + seconds
    duration = int(fields[-1])
    return start_time_idx, start_time_idx + duration, duration

In [None]:
# Column layout of the BIOPAC <-> Doppler match table: one row per
# (BIOPAC recording, Doppler file) pair that overlaps in time.
# The *_time_idx columns are in seconds since midnight; the *_idx columns are row
# offsets into the respective files (250 rows per second for BIOPAC, 1000 for Doppler,
# as used by the matching loop).
master_cols = ['biopac_idx', 'case_id', 'table_id', 'subject_id', 'year', 'month', 'date', 
               'room', 'act_id', 'action', 'house', 'room_id', 'doppler_id',
               'biopac_start_time_idx', 'biopac_end_time_idx', 'doppler_start_time_idx', 'doppler_end_time_idx', 
               'biopac_file_name', 'doppler_file_name',
               'biopac_start_idx', 'biopac_end_idx', 'doppler_start_idx', 'doppler_end_idx', 'overlap_duration']

# Empty accumulator that the matching loop appends to.
master_df = pd.DataFrame(columns=master_cols)

In [None]:
# For every BIOPAC recording, find the Doppler files recorded on the same day by the
# relevant radar(s) whose time span overlaps the recording, and store the overlap.
for i in tqdm(range(len(biopac_df))):

    # get overlapping information
    biopac_idx = i
    case_id = biopac_df.loc[i, 'case_id']
    table_id = biopac_df.loc[i, 'table_id']
    subject_id = biopac_df.loc[i, 'subject_id']
    year = biopac_df.loc[i, 'year']
    month = biopac_df.loc[i, 'month']
    date = biopac_df.loc[i, 'date']
    room = biopac_df.loc[i, 'room']
    act_id = biopac_df.loc[i, 'act_id']
    action = biopac_df.loc[i, 'action']
    house = biopac_df.loc[i, 'house']
    room_id = biopac_df.loc[i, 'room_id']

    # get biopac information
    biopac_start_time_idx = biopac_df.loc[i, 'start_time_idx']
    biopac_end_time_idx = biopac_df.loc[i, 'end_time_idx']
    biopac_duration = biopac_df.loc[i, 'duration']
    biopac_file_name = biopac_df.loc[i, 'file_name']

    # Get doppler ID of all the radars
    # 'DS-xxxxx' is a placeholder id in the file name: fall back to every radar
    # registered for the room. Otherwise use the single radar named in the file.
    if biopac_df.loc[i, 'dp_id'] == 'DS-xxxxx':
        dp_list = doppler_map[biopac_df.loc[i, 'room_id']]
    else:
        dp_list = [biopac_df.loc[i, 'dp_id']]
    
    # get information for each doppler
    for doppler_id in dp_list:
        dp_folder = get_dp_folder(biopac_df, i, doppler_id)

        # No Doppler data for this radar on this day.
        if not os.path.exists(dp_folder):
            continue

        dp_files = []
        # Retry the listing while it fails with Windows error 1006; any other error propagates.
        # NOTE(review): `winerror` is a Windows-only attribute of OSError, and the
        # retry has no upper bound.
        while True:
            try:
                dp_files = os.listdir(dp_folder)
                break  # Break out of the loop if the operation succeeds
            except OSError as e:
                if e.winerror == 1006:
                    time.sleep(1)  # Wait for a second before retrying
                else:
                    raise

        # Matches found for this (recording, radar) pair.
        temp_df = pd.DataFrame(columns=master_cols)

        for dp_file in dp_files:
            if '.csv' not in dp_file:
                continue

            dp_file_name = dp_file

            # get overlapping times
            dp_start_time_idx, dp_end_time_idx, dp_duration = parse_doppler_name(dp_file)

            # The four branches below cover the possible orderings of the two
            # intervals. *_start_idx is the row offset of the overlap start inside
            # each file (Doppler: 1000 rows per second, BIOPAC: 250 rows per second);
            # overlap_duration is in seconds.

            # 1. doppler starts, biopac starts, doppler ends, biopac ends
            if dp_start_time_idx <= biopac_start_time_idx <= dp_end_time_idx <= biopac_end_time_idx:
                dp_start_idx = (biopac_start_time_idx - dp_start_time_idx)*1000
                biopac_start_idx = 0
                overlap_duration = dp_end_time_idx - biopac_start_time_idx

            # 2. doppler starts, biopac starts, biopac ends, doppler ends
            elif dp_start_time_idx <= biopac_start_time_idx <= biopac_end_time_idx <= dp_end_time_idx:
                dp_start_idx = (biopac_start_time_idx - dp_start_time_idx)*1000
                biopac_start_idx = 0
                overlap_duration = biopac_end_time_idx - biopac_start_time_idx

            # 3. biopac starts, doppler starts, biopac ends, doppler ends
            elif biopac_start_time_idx <= dp_start_time_idx <= biopac_end_time_idx <= dp_end_time_idx:
                biopac_start_idx = (dp_start_time_idx - biopac_start_time_idx)*250
                dp_start_idx = 0
                overlap_duration = biopac_end_time_idx - dp_start_time_idx

            # 4. biopac starts, doppler starts, doppler ends, biopac ends
            elif biopac_start_time_idx <= dp_start_time_idx <= dp_end_time_idx <= biopac_end_time_idx:
                biopac_start_idx = (dp_start_time_idx - biopac_start_time_idx)*250
                dp_start_idx = 0
                overlap_duration = dp_end_time_idx - dp_start_time_idx

            # 5. if none of the above, then no overlap
            else:
                continue

            # Row offsets of the overlap end inside each file.
            dp_end_idx = dp_start_idx + overlap_duration*1000
            biopac_end_idx = biopac_start_idx + overlap_duration*250

            # add to temp_df
            # (values must stay in the same order as master_cols)
            temp_df.loc[len(temp_df.index)] = [biopac_idx, case_id, table_id, subject_id,
                                            year, month, date,
                                            room, act_id, action, house, room_id, doppler_id,
                                            biopac_start_time_idx, biopac_end_time_idx,
                                            dp_start_time_idx, dp_end_time_idx,
                                            biopac_file_name, dp_file_name,
                                            biopac_start_idx, biopac_end_idx,
                                            dp_start_idx, dp_end_idx,
                                            overlap_duration]
        
        # add to master_df
        # The table is rewritten to disk after every radar so that partial progress
        # survives an interrupted run.
        master_df = pd.concat([master_df, temp_df], ignore_index=True)
        master_df.to_csv('master_df.csv', index=False)


In [None]:
# sort by biopac_idx
master_df = master_df.sort_values(by=['biopac_idx']).reset_index(drop=True)

# remove duplicates from master_df because it stopped half way several times
master_df = master_df.drop_duplicates().reset_index(drop=True)

# add overlap_start_idx
# Both values are in seconds since midnight: the BIOPAC start time plus the row
# offset converted back to seconds (250 BIOPAC rows per second).
master_df['overlap_start_idx'] = master_df['biopac_start_time_idx'] + master_df['biopac_start_idx'] // 250
master_df['overlap_end_idx'] = master_df['biopac_start_time_idx'] + master_df['biopac_end_idx'] // 250

# Persist the cleaned match table; the clip section reloads it from this file.
master_df.to_csv('master_df.csv', index=False)

## Separate into clips

In [None]:
# Reload the match table and convert year/month/date to (zero-padded) strings.
master_df = pd.read_csv('master_df.csv')
master_df = make_dates_normal(master_df)

In [None]:
def get_doppler_path(df, row):
    """Build the full path of the Doppler CSV described by one row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``case_id``, ``table_id``, ``year``, ``month``, ``date``,
        ``doppler_id`` and ``doppler_file_name`` columns.
    row : index label
        Label of the row to use (passed to ``df.loc``).

    Returns
    -------
    str
        ``<sekisui_dir>/<case>/from SH to MIT/Doppler/<table>/<doppler_id>/<year>/<month>/<date>/<file>``.
    """
    def field(column):
        return df.loc[row, column]

    day_parts = [str(field(column)) for column in ('year', 'month', 'date')]
    path_parts = [
        sekisui_dir,
        field('case_id'),
        'from SH to MIT\\Doppler',
        field('table_id'),
        field('doppler_id'),
        *day_parts,
        field('doppler_file_name'),
    ]
    return os.path.join(*path_parts)

def get_biopac_path(df, row):
    """Build the full path of the BIOPAC CSV described by one row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``case_id``, ``table_id``, ``subject_id``, ``year``,
        ``month``, ``date`` and a file-name column (``biopac_file_name``, or
        ``file_name`` when ``biopac_file_name`` is absent).
    row : index label
        Label of the row to use (passed to ``df.loc``).

    Returns
    -------
    str
        ``<sekisui_dir>/<case>/from SH to MIT/Biopac/<table>/<subject>/<year>/<month>/<date>/<file>``.
    """
    # A function with this name is also defined earlier in the notebook for frames
    # that name the column 'file_name'. Accepting either column keeps the cells
    # working regardless of which definition was executed last.
    file_col = 'biopac_file_name' if 'biopac_file_name' in df.columns else 'file_name'

    case_id = df.loc[row, 'case_id']
    table_id = df.loc[row, 'table_id']
    subject_id = df.loc[row, 'subject_id']
    year = df.loc[row, 'year']
    month = df.loc[row, 'month']
    # int() removes the zero padding that make_dates_normal adds to the day;
    # presumably the day folders on disk are unpadded -- TODO confirm.
    date = int(df.loc[row, 'date'])
    file_name = df.loc[row, file_col]
    biopac_path = os.path.join(sekisui_dir, case_id, 'from SH to MIT\\Biopac', table_id, subject_id,
                               str(year), str(month), str(date), file_name)
    return biopac_path

In [None]:
# Total sample points

# Length of one clip in seconds; also used by the indexing cell below.
clip_sec = 30

# Each matched (BIOPAC, Doppler) row contributes floor(overlap / clip_sec) clips.
# The filter uses >= so that an overlap of exactly clip_sec seconds counts as one
# clip, consistent with the `>= clip_sec` filter applied when the clips are indexed.
long_enough = master_df['overlap_duration'] >= clip_sec
total_num = (master_df.loc[long_enough, 'overlap_duration'] // clip_sec).sum()
print(f'Total number of data points in {clip_sec} second clips: {total_num}')

In [None]:
# Upper bound for the biopac_idx loop below (largest matched index + 1).
biopac_num = max(master_df['biopac_idx']) + 1

# One row per (clip, Doppler file) data point. The columns are datapoint_idx and
# biopac_clip_idx followed by all columns of master_df in their original order --
# the indexing loop relies on that order when it fills a row from a list.
indexed_df = pd.DataFrame(columns=['datapoint_idx','biopac_clip_idx', 'biopac_idx', 'case_id', 'table_id', 'subject_id', 'year', 'month',
                                    'date', 'room', 'act_id', 'action', 'house', 'room_id', 'doppler_id',
                                    'biopac_start_time_idx', 'biopac_end_time_idx',
                                    'doppler_start_time_idx', 'doppler_end_time_idx', 'biopac_file_name',
                                    'doppler_file_name', 'biopac_start_idx', 'biopac_end_idx',
                                    'doppler_start_idx', 'doppler_end_idx', 'overlap_duration',
                                    'overlap_start_idx', 'overlap_end_idx'])

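We do not read or cut the CSV files yet. First, we only build an index of every clip (which rows of which BIOPAC and Doppler file belong to it); the files themselves are cut in a later step.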

In [None]:
# biopac_clip_idx numbers the clip windows (one per clip_sec-second slice of a BIOPAC file);
# datapoint_idx numbers the (clip window, Doppler file) pairs, i.e. the rows of indexed_df.
biopac_clip_idx = 0
datapoint_idx = 0

# for each unique biopac file (entire file not clipped)
for biopac_i in tqdm(range(biopac_num)):

    # get the dopplers corresponding to biopac_i
    # (only matches that overlap for at least one full clip are kept)
    small_df = master_df[(master_df['biopac_idx'] == biopac_i) & (master_df['overlap_duration'] >= clip_sec)].copy().reset_index(drop=True)
    
    if len(small_df) == 0:
        continue

    # Span (seconds since midnight) covered by at least one of the matched Doppler files.
    overlap_start_idx = min(small_df['overlap_start_idx'])
    overlap_end_idx = max(small_df['overlap_end_idx'])

    biopac_start_time_idx = small_df.iloc[0]['biopac_start_time_idx']

    # Number of whole clip_sec-second windows that fit into that span.
    overlap_num = (overlap_end_idx - overlap_start_idx) // clip_sec

    # check how many clips are in each overlap
    for overlap_i in range(overlap_num):

        # for each row in small df, i.e. each doppler
        for j in range(len(small_df)):

            # check if overlap_i is in the doppler's overlap
            # (the window must lie completely inside this Doppler file's overlap)
            if small_df.iloc[j]['overlap_start_idx'] > overlap_start_idx + overlap_i*clip_sec:
                continue
            if small_df.iloc[j]['overlap_end_idx'] < overlap_start_idx + (overlap_i+1)*clip_sec:
                continue

            dp_start_time_idx = small_df.loc[j, 'doppler_start_time_idx']

            # Start from a copy of the match row, then overwrite the fields that
            # are specific to this window.
            indexed_df.loc[datapoint_idx] = [datapoint_idx, biopac_clip_idx] + small_df.iloc[j].tolist()
            
            # Window bounds in seconds since midnight.
            indexed_df.loc[datapoint_idx, 'overlap_start_idx'] = overlap_start_idx + overlap_i * clip_sec
            indexed_df.loc[datapoint_idx, 'overlap_end_idx'] = overlap_start_idx + (overlap_i+1) * clip_sec
            
            # Window bounds as row offsets into each file
            # (250 rows per second for BIOPAC, 1000 rows per second for Doppler).
            indexed_df.loc[datapoint_idx, 'biopac_start_idx'] = (overlap_start_idx + overlap_i * clip_sec - biopac_start_time_idx)*250
            indexed_df.loc[datapoint_idx, 'biopac_end_idx'] = (overlap_start_idx + (overlap_i+1) * clip_sec - biopac_start_time_idx)*250
            indexed_df.loc[datapoint_idx, 'doppler_start_idx'] = (overlap_start_idx + overlap_i * clip_sec - dp_start_time_idx)*1000
            indexed_df.loc[datapoint_idx, 'doppler_end_idx'] = (overlap_start_idx + (overlap_i+1) * clip_sec - dp_start_time_idx)*1000

            datapoint_idx += 1

        # Incremented for every window, including windows that no Doppler file covered.
        biopac_clip_idx += 1

In [None]:
# Persist the clip index (one row per clip window and Doppler file).
indexed_df.to_csv('indexed_df.csv', index=False)

In [None]:
# BIOPAC recordings that have at least one indexed clip.
biopac_list = indexed_df['biopac_idx'].unique().tolist()

In [None]:
# Cut every indexed clip out of its BIOPAC and Doppler source files and write the
# pieces to <data_dir>/<biopac_clip_idx>/. Each source file is loaded once and
# then sliced for all of its clips.
for biopac_i in tqdm(biopac_list, desc='Biopac Files', total=len(biopac_list)):

    # get a full biopac file
    small_df = indexed_df[indexed_df['biopac_idx'] == biopac_i].copy().reset_index(drop=True)
    bp_path = get_biopac_path(small_df, 0)
    bp_skip = get_rows_to_skip(bp_path)
    bp_df = pd.read_csv(bp_path, skiprows=bp_skip)
    # drop first row of bp_1
    bp_df = bp_df.drop([0])

    dp_list = small_df['doppler_file_name'].unique().tolist()
    
    for dp_i in dp_list:

        # All clips of this recording that use the Doppler file dp_i.
        smaller_df = small_df[small_df['doppler_file_name'] == dp_i].copy().reset_index(drop=True)

        # get doppler data
        # (only the first two columns of the Doppler file are loaded)
        dp_path = get_doppler_path(smaller_df, 0)
        dp_df = pd.read_csv(dp_path, usecols=[0, 1])

        for i in range(len(smaller_df)):
            # get the starting point of the clip
            bp_start = smaller_df.loc[i, 'biopac_start_idx']
            dp_start = smaller_df.loc[i, 'doppler_start_idx']

            # get the ending point of the clip
            bp_end = smaller_df.loc[i, 'biopac_end_idx']
            dp_end = smaller_df.loc[i, 'doppler_end_idx']

            # get the clip
            # (positional row slices; the end index is exclusive)
            bp_clip = bp_df.iloc[bp_start:bp_end, :]
            dp_clip = dp_df.iloc[dp_start:dp_end, :]

            # name the clip
            biopac_clip_i = smaller_df.loc[i, 'biopac_clip_idx']
            biopac_file_name = 'BIOPAC_'+str(biopac_clip_i)
            doppler_file_name = str(smaller_df.loc[i, 'doppler_id'])+str(biopac_clip_i)

            # save the clip
            datapoint_dir = os.path.join(data_dir, str(biopac_clip_i))
            # The BIOPAC clip is written only when its folder is created, i.e. once
            # per clip even if several Doppler files share it.
            # NOTE(review): if the folder already exists (e.g. from an interrupted
            # run) the BIOPAC clip is not (re)written.
            if not os.path.exists(datapoint_dir):
                os.makedirs(datapoint_dir, exist_ok=True)
                bp_clip.to_csv(os.path.join(datapoint_dir, f'{biopac_file_name}.csv'), index=False)

            # The Doppler clip is always written (overwriting an existing file of the same name).
            dp_clip.to_csv(os.path.join(datapoint_dir, f'{doppler_file_name}.csv'), index=False)
