In [6]:
# Autoreload possibly interferes with IntelliJ debugging
# %reload_ext autoreload
# %autoreload 2
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
log = lambda msg: logging.info(msg)


# Full pipeline (multiple files)

In [7]:
import pandas as pd
import os

input_dir = "C:\\dev\\play\\brainwave-data"
stats_df = pd.read_csv(input_dir + os.path.sep + "stats.csv")

## Convert Brainflow files to FIF

In [8]:
def get_brainflow_compressed_filename(full_input_filename: str) -> str:
    full_output_dirname = webserver.output_dirname(full_input_filename)
    compressed_full_output_filename = str(os.path.join(full_output_dirname, os.path.basename(full_input_filename))) + '.bz2'
    return compressed_full_output_filename

In [9]:
from datetime import datetime

import webserver
import convert
# import zstandard as zstd
import os
import bz2
import time
import shutil

errors = []
processed = []

# Could get these working later
skip_list = ['2024-09-10-21-22-21']

def compress_bz2(input_file, output_file):
    start_time = time.time()
    with open(input_file, 'rb') as f_in:
        with bz2.open(output_file, 'wb', compresslevel=9) as f_out:
            shutil.copyfileobj(f_in, f_out)
    end_time = time.time()
    return end_time - start_time, os.path.getsize(output_file)

for root, dirs, files in os.walk(input_dir):
    #for idx, file_name in enumerate(tqdm(dirs, desc="Converting Brainflow to FIF")):
    for idx, file_name in enumerate(files):
        full_input_filename = os.path.join(root, file_name)
        try:
            if full_input_filename.endswith(".brainflow.csv"):
                full_output_dirname = webserver.output_dirname(full_input_filename)
                full_output_filename = str(os.path.join(full_output_dirname, 'raw.fif'))
                
                compressed_full_output_filename = get_brainflow_compressed_filename(full_input_filename)
                
                if not os.path.exists(compressed_full_output_filename):
                    log(f"Compressing file {full_input_filename} to " + compressed_full_output_filename)
                    processed.append("Compressing " + full_input_filename)
                    try:
                        os.mkdir(os.path.dirname(compressed_full_output_filename))
                    except:
                        pass
                    compress_bz2(full_input_filename, compressed_full_output_filename) 
                    
                if os.path.exists(full_output_filename):
                    log(f"Skipping file {full_input_filename} as {full_output_filename} and {compressed_full_output_filename} already exist")
                    continue
                should_skip = False
                for s in skip_list:
                    if s in full_input_filename:
                        log(f"Skipping file {full_input_filename}")
                        should_skip = True
                if not should_skip:
                    log(f"Processing file {full_input_filename}")
                    processed.append("Processing " + full_input_filename)
                    channels = None
                    date_time_str = os.path.basename(full_input_filename).removesuffix(".brainflow.csv")
                    date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d-%H-%M-%S')
                    if (date_time_obj > datetime(2024, 9, 1, 0, 0, 0)):
                        channels = ['Fpz']
        
                    if channels is not None:
                        log(f"Processing file {full_input_filename} with channels {channels}")
                        convert.convert_and_save_brainflow_file(log, full_input_filename, full_output_filename, channels)

        except Exception as e:
            msg = "Error processing file: " + full_input_filename
            log(msg)
            log(e)
            errors.append(msg)


In [10]:
errors

[]

In [11]:
processed

[]

## Run pipeline on FIF files

In [12]:
import pandas as pd
import run_feature_pipeline
import os

errors = []
dataframes = []

# Could get these working later
skip_list = ['2024-07-23-22-40-25', '2024-07-28-22-29-49', '2024-09-18-21-25-08', '2024-09-18-21-28-11', '2024-09-19-21-29-42']

for root, dirs, files in os.walk(input_dir):
    # for idx, dir_name in enumerate(tqdm(dirs, desc="Running pipeline")):
    for idx, dir_name in enumerate(dirs):
        input_file = os.path.join(root, dir_name, "raw.fif")
        if dir_name in skip_list:
            log(f"Skipping {idx} of {len(dirs)}: " + input_file)
            continue
        try:
            log(f"Processing file {idx} of {len(dirs)}: " + input_file)
                        
            if os.path.exists(input_file):                
                yasa_df = run_feature_pipeline.cached_pipeline(log, input_file, stats_df)
                dataframes.append(yasa_df)
        except Exception as e:
            msg = f"Error processing file {idx} of {len(dirs)}: " + input_file + " - " + str(e)
            log(msg)
            errors.append(msg)
            log(e)

for error in errors:
    log(error)

all = pd.concat(dataframes)
log(f"Finished processing, have {len(all)} files of {idx} total")


2024-10-03 19:21:49,702 - INFO - Processing file 0 of 55: C:\dev\play\brainwave-data\08-07-2024--22-51-16\raw.fif
2024-10-03 19:21:49,702 - INFO - Processing file 1 of 55: C:\dev\play\brainwave-data\09-07-2024--22-52-25\raw.fif
2024-10-03 19:21:49,703 - INFO - Processing file 2 of 55: C:\dev\play\brainwave-data\2024-07-08-22-51-16\raw.fif
2024-10-03 19:21:49,705 - INFO - Loading cached file C:\dev\play\brainwave-data\2024-07-08-22-51-16\raw.with_features.csv
2024-10-03 19:21:49,740 - INFO - Processing file 3 of 55: C:\dev\play\brainwave-data\2024-07-11-22-46-18\raw.fif
2024-10-03 19:21:49,740 - INFO - Loading cached file C:\dev\play\brainwave-data\2024-07-11-22-46-18\raw.with_features.csv
2024-10-03 19:21:49,815 - INFO - Processing file 4 of 55: C:\dev\play\brainwave-data\2024-07-12-22-38-58\raw.fif
2024-10-03 19:21:49,815 - INFO - Loading cached file C:\dev\play\brainwave-data\2024-07-12-22-38-58\raw.with_features.csv
2024-10-03 19:21:49,883 - INFO - Processing file 5 of 55: C:\dev\pl

In [13]:
errors

[]

In [14]:
# yasa_df

# Recalculate scalings
N.b. can be run frequently but will only be picked up by new runs.  Maybe worth occasionally regenerating all old files.

In [15]:
import scaling

stats = scaling.stats(all)
stats.to_csv(input_dir + "stats.csv")
stats

Unnamed: 0,Column,Mean,P10,P90,Min,Max,StdDev
0,F8-M1_eeg_abspow,1.921968e-07,4.772690e-11,9.516579e-10,8.534915e-35,7.769647e-04,9.439601e-06
1,F8-M1_eeg_abspow_c7min_norm,4.070216e+01,-8.968809e-02,5.672409e-01,-2.093740e-01,4.783267e+04,8.481547e+02
2,F8-M1_eeg_abspow_p2min_norm,4.897505e+01,-6.837685e-02,5.577247e-01,-2.503299e-01,7.904638e+04,1.356728e+03
3,F8-M1_eeg_alpha,6.848900e-02,1.244090e-02,1.253551e-01,8.034852e-05,3.983974e-01,4.803263e-02
4,F8-M1_eeg_alpha_c7min_norm,-3.206412e-02,-5.190218e-01,3.788195e-01,-7.588421e-01,1.428529e+00,3.329315e-01
...,...,...,...,...,...,...,...
970,Fpz_eeg_svdent_c7min_norm_s,5.113044e-18,0.000000e+00,0.000000e+00,-7.105427e-15,7.105427e-15,3.184371e-16
971,Fpz_eeg_svdent_p2min_norm_s,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
972,Fpz_eeg_theta_s,4.607205e-01,-2.131948e-02,9.361571e-01,-1.621347e-01,2.960850e+00,3.563970e-01
973,Fpz_eeg_theta_c7min_norm_s,5.782887e-01,1.919589e-04,1.023071e+00,-2.815854e-01,1.576032e+00,3.674755e-01


# Upload to GCS

In [16]:
from upload import upload_dir_to_gcs_skipping_existing
import os

errors = []
dataframes = []

for root, dirs, files in os.walk(input_dir):
    for dir_name in reversed(dirs):
        input_file = os.path.join(root, dir_name, "raw.fif")
        full_dir_name = os.path.join(root, dir_name)
        try:
            upload_dir_to_gcs_skipping_existing(log, 'examined-life-derived-eeg', full_dir_name, dir_name)
        except Exception as e:
            log("Error processing file: " + input_dir)
            errors.append("Error processing file: " + input_file + " - " + str(e))
            log(e)

for error in errors:
    log(error)

log("All uploaded")

2024-10-03 19:21:59,813 - INFO - Skipping upload of C:\dev\play\brainwave-data\nonnight, identical directory already exists in GCS.
2024-10-03 19:22:01,616 - INFO - Skipping upload of C:\dev\play\brainwave-data\compressed_files, identical directory already exists in GCS.
2024-10-03 19:22:03,618 - INFO - Skipping C:\dev\play\brainwave-data\2024-10-02-21-21-15\2024-10-02-21-21-15.brainflow.csv.bz2, identical file already exists in GCS.
2024-10-03 19:22:03,672 - INFO - C:\dev\play\brainwave-data\2024-10-02-21-21-15\raw.average_slow_wave.png uploading to 2024-10-02-21-21-15/raw.average_slow_wave.png, does_not_exist=False has_changed=True
2024-10-03 19:22:04,023 - INFO - C:\dev\play\brainwave-data\2024-10-02-21-21-15\raw.edf uploading to 2024-10-02-21-21-15/raw.edf, does_not_exist=False has_changed=True
2024-10-03 19:22:17,620 - INFO - Skipping C:\dev\play\brainwave-data\2024-10-02-21-21-15\raw.fif, identical file already exists in GCS.
2024-10-03 19:22:17,677 - INFO - C:\dev\play\brainwave

In [17]:
upload_dir_to_gcs_skipping_existing(log, 'examined-life-derived-eeg', "C:\\dev\\play\\brainwave-data\\2024-07-12-22-38-58", "2024-07-12-22-38-58")

2024-10-03 19:24:56,647 - INFO - Skipping upload of C:\dev\play\brainwave-data\2024-07-12-22-38-58, identical directory already exists in GCS.


# Delete Brainwave files that are safely backed up

In [18]:

import os

errors = []
can_delete = []
cannot_delete = []

for root, dirs, files in os.walk(input_dir):
    for idx, file_name in enumerate(files):
        full_input_filename = os.path.join(root, file_name)
        if full_input_filename.endswith(".brainflow.csv"):
            compressed_full_output_filename = get_brainflow_compressed_filename(full_input_filename)
                
            if os.path.exists(compressed_full_output_filename):
                can_delete.append({
                    'backed_up': compressed_full_output_filename,
                    'full_filename': full_input_filename
                })
            else:
                cannot_delete.append(full_input_filename)
    

In [19]:
can_delete_df = pd.DataFrame(can_delete)
can_delete_df

In [20]:
cannot_delete

[]

In [21]:
can_delete_df['full_filename']

KeyError: 'full_filename'

# Delete Cyton files that are safely backed up

In [None]:
import webserver
import bz2
import shutil
import os
import time


errors = []
can_delete = []
cannot_delete = []

for root, dirs, files in os.walk(input_dir):
    for idx, file_name in enumerate(files):
        full_input_filename = os.path.join(root, file_name)
        if file_name.startswith("OBCI_") and file_name.endswith(".TXT.bz2"):
            can_delete.append({
                'file_name': file_name.removesuffix(".bz2"),
                'full_filename': full_input_filename
            })

In [None]:
can_delete

In [None]:
cyton_file_locations = [input_dir, "d:", "e:", "x:"]

for cyton_file_location in cyton_file_locations:
    for root, dirs, files in os.walk(cyton_file_location):
        for idx, file in enumerate(files):
            matching_record = next((f for f in can_delete if f['file_name'] == file), None)
            if matching_record:
                full_filename = os.path.join(root, file)
                log(f"Could delete {full_filename} as backed up in {matching_record['full_filename']}")