In [None]:
import pandas as pd
import numpy as np
import pathlib as pl
import datetime as dt
from functools import reduce #needed to allow merging of multiple datasets      

# Machine-specific absolute locations of the study data; these must be edited
# before the notebook can run on another computer.

# Root data folder: later cells iterate its sub-folders and glob beneath it.
home = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\data')
# Individual participant folders (not referenced by the other cells visible here).
p1 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\data\Participant 1_2022-10-27_09.07.37')
# NOTE(review): the folder name is spelled "Partcipant" -- presumably this matches
# the folder on disk; confirm before correcting the spelling.
p2 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\data\Partcipant 59_2022-11-21_10.37.29')
# CSV timesheet that is passed to import_protocol_times further down.
time_stamps = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\data\Timesheets.csv')

In [None]:
def import_finometer_intervals_feb_23(folder_path, interval='1T', save_csv=False):
    '''Read the single Finometer .txt export in a folder and average every measure per time interval.

    folder_path: pathlib.Path object
        Folder that contains exactly one .txt export
    interval: str (optional)
        Resampling frequency handed to DataFrame.resample
    save_csv: boolean (optional)
        If True, the averaged table is written to
        "imported data for <text before the first underscore of the file name>.csv"
        inside folder_path

    Returns the resampled DataFrame, indexed by the parsed 'Time (s)' column.
    Raises TypeError when folder_path is not a pathlib.Path, and ValueError when the
    folder is missing / not a directory or does not hold exactly one .txt file.
    '''
    # Guard clauses: reject unusable arguments before touching the disk
    if not isinstance(folder_path, pl.Path):
        raise TypeError('folder_path must be a pathlib.Path object')
    if not (folder_path.exists() and folder_path.is_dir()):
        raise ValueError('folder_path does not exist or is not a directory')

    # Exactly one export is expected per folder
    txt_files = list(folder_path.glob('*.txt'))
    if len(txt_files) != 1:
        raise ValueError(f'Expected one .txt file, but found {len(txt_files)} in the folder')
    (source_file,) = txt_files

    # Semicolon-separated table: header after 8 skipped lines, 1 trailing line ignored
    raw = pd.read_csv(source_file, sep=';', header=0, skiprows=8, skipfooter=1, engine='python')

    # The column at position 13 is discarded
    measures = raw.drop(raw.columns[13], axis=1)

    # Parse the clock strings so the frame can be resampled on a DatetimeIndex
    measures['Time (s)'] = pd.to_datetime(measures['Time (s)'], format='%H:%M:%S.%f')
    averaged = measures.set_index('Time (s)').resample(interval).mean()

    if save_csv:
        participant = source_file.stem.split("_")[0]
        averaged.to_csv(folder_path / f'imported data for {participant}.csv', index=True)
        print(f'CSV saved for {participant}')

    return averaged


In [None]:
# Process every participant folder under the data root; each call also writes
# "imported data for <participant>.csv" into that folder.
# A plain loop is used because the calls are made for their side effect (the saved
# CSVs); a list comprehension would render every returned DataFrame as cell output.
for participant_dir in sorted(home.iterdir()):
    if participant_dir.is_dir():
        import_finometer_intervals_feb_23(participant_dir, save_csv=True)

In [None]:
def import_protocol_times(times_file_path, add_seconds=False, flatten_seconds=False, save_csv=False):
    '''Import the protocol times from a .csv timesheet and return one row of times per participant.

    times_file_path: pathlib.Path object
        The path to the .csv file containing the protocol times
    add_seconds: boolean (optional)
        If True, ":00" is appended to time values that lack seconds (HH:MM) and the
        time columns are converted to datetimes
    flatten_seconds: boolean (optional)
        If True, seconds are set to 00 for all time values and the time columns are
        converted to datetimes
    save_csv: boolean (optional)
        If True, the cleaned table is saved as "cleaned times.csv" in the same folder as
        the input file; this is not always needed and should be used sparingly

    When neither add_seconds nor flatten_seconds is set, the time columns are returned
    as read (only surrounding double quotes are stripped).

    Returns a DataFrame with 'Participant ID' plus the six start/end columns.
    Raises TypeError if times_file_path is not a pathlib.Path, and ValueError if the path
    does not exist, is not a .csv file, or both add_seconds and flatten_seconds are True.
    '''

    # The helpers are deliberately NOT named after the boolean parameters: a nested
    # function called `flatten_seconds` would shadow the flag and always be truthy.
    def _pad_seconds(time_str):
        '''Append ":00" when the time string is missing seconds (HH:MM)'''
        if len(time_str) == 5:
            time_str += ":00"
        return time_str

    def _zero_seconds(time_str):
        '''Set the seconds of a time string to 00'''
        return time_str[:5] + ':00'

    def _map_cells(frame, func):
        '''Apply func to every cell (Series.map per column, avoiding the deprecated applymap)'''
        return frame.apply(lambda column: column.map(func))

    def _on_strings(func):
        '''Wrap func so that non-string cells (e.g. NaN) pass through unchanged'''
        return lambda value: func(value) if isinstance(value, str) else value

    def _to_datetimes(frame):
        '''Parse every cell as HH:MM:SS; anything unparseable becomes NaT'''
        return _map_cells(frame, lambda value: pd.to_datetime(value, format='%H:%M:%S', errors='coerce'))

    if not isinstance(times_file_path, pl.Path):  # must be a pathlib.Path object
        raise TypeError('file_path must be a pathlib.Path object')
    if not times_file_path.exists():
        raise ValueError('file_path does not exist')
    if not times_file_path.is_file():
        raise ValueError('file_path is not a file')
    if times_file_path.suffix != '.csv':
        raise ValueError('file_path is not an csv file')
    if add_seconds and flatten_seconds:
        raise ValueError('Only one of add_seconds and flatten_seconds can be True')

    df = pd.read_csv(times_file_path, delimiter=',')
    df.columns = [col.strip() for col in df.columns]
    cols_to_keep = ['Participant ID', 'Start of Baseline', 'End of Baseline', 'Start of Task 1', 'End of Task 1', 'Start of Recovery Period', 'End of Recovery Period']
    df = _map_cells(df[cols_to_keep], _on_strings(lambda text: text.strip('"')))
    time_cols = cols_to_keep[1:]

    if add_seconds:
        try:
            padded = _map_cells(df[time_cols], _on_strings(_pad_seconds))
            df[time_cols] = _to_datetimes(padded)
        except Exception:  # best effort: report and return the unconverted table
            print('Could not add seconds to time, please check the time format')

    elif flatten_seconds:
        try:
            padded = _map_cells(df[time_cols], _on_strings(_pad_seconds))
            flattened = _map_cells(padded, _on_strings(_zero_seconds))
            df[time_cols] = _to_datetimes(flattened)
        except Exception:  # best effort: report and return the unconverted table
            print('Could not set seconds to 00, please check the time format')

    if save_csv:  # optional copy of the cleaned table, useful for inspection or reuse
        try:
            df.to_csv(times_file_path.parent / "cleaned times.csv", index=False)
            print(f"CSV saved for {times_file_path.stem}")
        except Exception as e:
            print(f"Could not save csv file, error: {e}")

    return df

In [None]:
# Load the protocol timestamps with flatten_seconds enabled and also save the
# cleaned table next to the timesheet (save_csv=True).
y = import_protocol_times(time_stamps, flatten_seconds=True, save_csv=True)

In [41]:
def import_protocol_averages_feb_23(file_path, t1=None, t2=None, t3=None, t4=None, t5=None, t6=None, save_csv=False):
    '''Import an averaged finometer file (already processed from the raw data) and
    produce the mean of every measure for each section of the experiment protocol.

    file_path: pathlib.Path object
        Path to a .csv or .xlsx file whose first column holds the interval timestamps
    t1, t2: str (optional)
        Start and end of the baseline, as 'HH:MM:SS' labels (None leaves that side open)
    t3, t4: str (optional)
        Start and end of the task
    t5, t6: str (optional)
        Start and end of the recovery period
    save_csv: boolean (optional)
        If True, the result is written to "<participant> protocol_averages.csv" in the
        same folder as file_path

    The participant label is the text before the first underscore in the name of the
    folder that contains file_path.

    Returns a one-row DataFrame indexed by 'Participant ID' with columns
    "<measure>_bl", "<measure>_task" and "<measure>_rec".
    Raises TypeError if file_path is not a pathlib.Path, and ValueError if the file does
    not exist, is empty, or is neither .csv nor .xlsx.
    '''

    if not isinstance(file_path, pl.Path):
        raise TypeError('file_path must be a pathlib.Path object')
    if not file_path.exists():
        raise ValueError('file_path does not exist')

    # pick the reader from the file extension
    if file_path.suffix == '.csv':
        reader = pd.read_csv
    elif file_path.suffix == '.xlsx':
        reader = pd.read_excel
    else:
        raise ValueError("File must be a Path object for a .csv or .xlsx file")

    try:
        intervals = reader(file_path, index_col=0, parse_dates=True)
    except pd.errors.EmptyDataError as err:
        # Previously this was only printed, after which the function crashed with an
        # UnboundLocalError on `intervals`; fail with a clear message instead.
        raise ValueError(f"File {file_path} is empty") from err

    # Reduce the index to clock strings so the protocol sections can be sliced by 'HH:MM:SS'
    intervals.index = pd.to_datetime(intervals.index).strftime('%H:%M:%S')
    intervals = intervals.replace('--', np.nan)
    intervals = intervals.dropna(how='all')

    participant = file_path.parent.stem.split('_')[0]

    def create_protocol_part_df(frame, start, end, tag, filename):
        '''Create a one-row DataFrame for one section of the experimental protocol.
        Takes a DataFrame plus the start and end labels of the section (both inclusive).
        Rows with any missing value are dropped, the remaining rows are averaged per
        column, and each column is renamed to "<column>_<tag>".'''
        section_means = frame.loc[start:end].dropna().mean()
        part = section_means.to_frame(name='Average').T
        part.columns = [f'{i}_{tag}' for i in part.columns]
        part['Participant ID'] = f'{filename}'
        return part

    # one frame each for baseline, task and recovery
    sections = [
        create_protocol_part_df(intervals, t1, t2, 'bl', participant),
        create_protocol_part_df(intervals, t3, t4, 'task', participant),
        create_protocol_part_df(intervals, t5, t6, 'rec', participant),
    ]

    # join the three sections side by side on the participant label
    data_merge = reduce(lambda left, right: pd.merge(left, right, on=["Participant ID"], how="outer"), sections)
    data_merge = data_merge.set_index('Participant ID')

    if save_csv:
        out_path = file_path.parent / f"{participant} protocol_averages.csv"
        data_merge.to_csv(out_path)
        # report the file name that was actually written
        print(f"Saved {out_path.name} to {file_path.parent}")

    return data_merge

In [None]:

# Build and save the protocol averages for every participant in `y`.
# (Same workflow as the make_protocol_rows function defined further down.)
# Rows are iterated directly so index labels are never used as positions.
for _, row in y.iterrows():
    participant_id = row['Participant ID']

    # every column after 'Participant ID' must hold a timestamp
    if row.iloc[1:].isnull().any():
        print(f"Skipping participant {participant_id} due to missing timestamps")
        continue

    file_name = f'imported data for {participant_id}.csv'
    file_paths = list(home.glob(f'**/{file_name}'))
    if not file_paths:
        print(f'no file found for {participant_id}')
        continue

    full_path = file_paths[0].resolve()
    print(f'File {full_path} found')

    # The row was verified to contain no missing values above, so each timestamp
    # can be formatted without a per-value NaN guard.
    import_protocol_averages_feb_23(
        full_path,
        t1=row['Start of Baseline'].strftime('%H:%M:%S'),
        t2=row['End of Baseline'].strftime('%H:%M:%S'),
        t3=row['Start of Task 1'].strftime('%H:%M:%S'),
        t4=row['End of Task 1'].strftime('%H:%M:%S'),
        t5=row['Start of Recovery Period'].strftime('%H:%M:%S'),
        t6=row['End of Recovery Period'].strftime('%H:%M:%S'),
        save_csv=True,
    )


In [None]:
def make_protocol_rows(data_path, timestamps_df):
    """
    Build and save the protocol averages for participants whose timestamps are all non-null.

    For each row of timestamps_df the file "imported data for <Participant ID>.csv" is
    searched for recursively under data_path; the first match is passed to
    import_protocol_averages_feb_23 with save_csv=True. Participants with a missing
    value in any column after the first, or without a matching file, are reported
    with a printed message and skipped.

    Parameters:
    - data_path: pl.Path or str, the path to the folder containing the data files
    - timestamps_df: pandas DataFrame, containing the timestamps for each participant
      ('Participant ID' plus the six start/end columns, holding datetime-like values)

    Returns:
    - None
    """
    def _as_clock(stamp):
        '''Format a timestamp as HH:MM:SS, or return None when it is missing'''
        return None if pd.isna(stamp) else stamp.strftime('%H:%M:%S')

    data = pl.Path(data_path)

    # Iterate over the rows themselves: the previous version used index labels as
    # positions (iloc[n]), which breaks for any index other than 0..n-1.
    for _, row in timestamps_df.iterrows():
        participant_id = row['Participant ID']

        if row.iloc[1:].isnull().any():
            print(f"Skipping participant {participant_id} due to missing timestamps")
            continue

        file_name = f'imported data for {participant_id}.csv'
        file_paths = list(data.glob(f'**/{file_name}'))
        if not file_paths:
            print(f'no file found for {participant_id}')
            continue

        full_path = file_paths[0].resolve()
        print(f'File {full_path} found')

        import_protocol_averages_feb_23(
            full_path,
            t1=row['Start of Baseline'].strftime('%H:%M:%S'),
            t2=row['End of Baseline'].strftime('%H:%M:%S'),
            t3=_as_clock(row['Start of Task 1']),
            t4=_as_clock(row['End of Task 1']),
            t5=_as_clock(row['Start of Recovery Period']),
            t6=_as_clock(row['End of Recovery Period']),
            save_csv=True,
        )


In [42]:
import this


The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [None]:
# Write the protocol averages for every participant listed in `y`,
# searching for their interval files under the data root.
make_protocol_rows(data_path=home, timestamps_df=y)

In [None]:
# Read every per-participant "protocol_averages.csv" found under the data root.
# Path.glob yields files in an OS-dependent order, so the paths are sorted to keep
# the row order of the merged table reproducible between runs.
dfs = [pd.read_csv(csv_file) for csv_file in sorted(home.glob('**/*protocol_averages.csv'))]

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not dfs:
    raise ValueError(f'No protocol_averages.csv files found under {home}')

# Concatenate the list of DataFrames into a single DataFrame with a fresh 0..n-1 index
merged = pd.concat(dfs, ignore_index=True)

# Display the merged DataFrame
merged