In [1]:
import tqdm
# Autoreload possibly interferes with IntelliJ debugging
%reload_ext autoreload
%autoreload 2
import logging
class FlushHandler(logging.StreamHandler):
    def emit(self, record):
        super().emit(record)
        self.flush()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[FlushHandler()])
log = lambda msg: logging.info(msg)


# Full pipeline (multiple files)

In [2]:
import pandas as pd
import os

input_dir = "C:\\dev\\play\\brainwave-data-day"
stats_df = pd.read_csv(input_dir + os.path.sep + "day_stats.csv")
# stats_df = pd.read_csv("C:\\dev\\play\\brainwave-data\\stats.csv")

## Convert Brainflow files to FIF

In [3]:
def get_brainflow_compressed_filename(full_input_filename: str) -> str:
    full_output_dirname = webserver.output_dirname(full_input_filename)
    compressed_full_output_filename = str(os.path.join(full_output_dirname, os.path.basename(full_input_filename))) + '.bz2'
    return compressed_full_output_filename

In [4]:
from datetime import datetime

import webserver
import convert
# import zstandard as zstd
import os
import bz2
import time
import shutil
from tqdm.notebook import trange, tqdm

errors = []
processed = []

# Could get these working later
skip_list = []

def compress_bz2(input_file, output_file):
    start_time = time.time()
    with open(input_file, 'rb') as f_in:
        with bz2.open(output_file, 'wb', compresslevel=9) as f_out:
            shutil.copyfileobj(f_in, f_out)
    end_time = time.time()
    return end_time - start_time, os.path.getsize(output_file)

for root, dirs, files in os.walk(input_dir):
    files = [file for file in files if file.endswith(".brainflow.csv")]
    # Exclude the last file, which we assume to be the most recent, and possibly still being written
    for idx, file_name in tqdm(enumerate(files[:-1]), desc="Processing directories", total=len(files) - 1):  
        full_input_filename = os.path.join(root, file_name)
        try:
            full_output_dirname = webserver.output_dirname(full_input_filename)
            full_output_filename = str(os.path.join(full_output_dirname, 'raw.fif'))
            
            compressed_full_output_filename = get_brainflow_compressed_filename(full_input_filename)
            
            if not os.path.exists(compressed_full_output_filename):
                log(f"Compressing file {full_input_filename} to " + compressed_full_output_filename)
                processed.append("Compressing " + full_input_filename)
                try:
                    os.mkdir(os.path.dirname(compressed_full_output_filename))
                except:
                    pass
                compress_bz2(full_input_filename, compressed_full_output_filename) 
                
            if os.path.exists(full_output_filename):
                log(f"Skipping file {full_input_filename} as {full_output_filename} and {compressed_full_output_filename} already exist")
                continue
            should_skip = False
            for s in skip_list:
                if s in full_input_filename:
                    log(f"Skipping file {full_input_filename}")
                    should_skip = True
            if not should_skip:
                log(f"Processing file {full_input_filename}")
                processed.append("Processing " + full_input_filename)
                channels = ['Fpz-M1']
                date_time_str = os.path.basename(full_input_filename).removesuffix(".brainflow.csv")
                date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d-%H-%M-%S')
    
                if channels is not None:
                    log(f"Processing file {full_input_filename} with channels {channels}")
                    convert.convert_and_save_brainflow_file(log, full_input_filename, full_output_filename, channels)

        except Exception as e:
            msg = "Error processing file: " + full_input_filename
            log(msg)
            log(e)
            errors.append(msg)


Processing directories:   0%|          | 0/4 [00:00<?, ?it/s]

2024-11-16 11:28:35,519 - INFO - Skipping file C:\dev\play\brainwave-data-day\2024-11-15-09-10-50.brainflow.csv as C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.fif and C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\2024-11-15-09-10-50.brainflow.csv.bz2 already exist
2024-11-16 11:28:35,521 - INFO - Skipping file C:\dev\play\brainwave-data-day\2024-11-15-09-22-47.brainflow.csv as C:\dev\play\brainwave-data-day\2024-11-15-09-22-47\raw.fif and C:\dev\play\brainwave-data-day\2024-11-15-09-22-47\2024-11-15-09-22-47.brainflow.csv.bz2 already exist
2024-11-16 11:28:35,522 - INFO - Skipping file C:\dev\play\brainwave-data-day\2024-11-15-09-26-33.brainflow.csv as C:\dev\play\brainwave-data-day\2024-11-15-09-26-33\raw.fif and C:\dev\play\brainwave-data-day\2024-11-15-09-26-33\2024-11-15-09-26-33.brainflow.csv.bz2 already exist
2024-11-16 11:28:35,523 - INFO - Compressing file C:\dev\play\brainwave-data-day\2024-11-16-10-59-45.brainflow.csv to C:\dev\play\brainwave-data-day\2024-11-

Processing directories:   0%|          | 0/3 [00:00<?, ?it/s]

Processing directories:   0%|          | 0/3 [00:00<?, ?it/s]

Processing directories:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:

errors

['Error processing file: C:\\dev\\play\\brainwave-data-day\\2024-11-16-10-59-45.brainflow.csv']

In [6]:
processed

['Compressing C:\\dev\\play\\brainwave-data-day\\2024-11-16-10-59-45.brainflow.csv',
 'Processing C:\\dev\\play\\brainwave-data-day\\2024-11-16-10-59-45.brainflow.csv']

## Run pipeline on FIF files

In [17]:
import contextlib
import io
import run_day_pipeline
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import trange, tqdm

errors = []
dataframes = []

# Could get these working later
skip_list = []

def process_file(root, dir_name):
    input_file = os.path.join(root, dir_name, "raw.fif")
    if dir_name in skip_list:
        log(f"Skipping {dir_name}: " + input_file)
        return None
    try:
        log(f"Processing file: " + input_file)
        if os.path.exists(input_file):
            yasa_df = run_day_pipeline.cached_pipeline(log, input_file, stats_df)
            #log(f"Returning {yasa_df.head()}")
            return yasa_df
    except Exception as e:
        msg = f"Error processing file: " + input_file + " - " + str(e)
        log(msg)
        errors.append(msg)
        log(e)
    return None

for root, dirs, files in os.walk(input_dir):
    for dir_name in tqdm(dirs, desc="Processing directories", total=len(dirs)):
        output_buffer = io.StringIO()
        with contextlib.redirect_stdout(output_buffer), contextlib.redirect_stderr(output_buffer):
            df = process_file(root, dir_name)
            if df is not None:
                dataframes.append(df)

all = pd.concat(dataframes)
log(f"Finished processing, have {len(all)} files in total")

Processing directories:   0%|          | 0/3 [00:00<?, ?it/s]

2024-11-16 11:35:29,418 - INFO - Processing file: C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.fif
2024-11-16 11:35:29,420 - INFO - Loading cached file C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.output.csv
2024-11-16 11:35:29,431 - INFO - Cached file C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.output.csv is missing DayEnergy, rebuilding
2024-11-16 11:35:29,432 - INFO - Loading MNE file C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.fif
2024-11-16 11:35:29,433 - INFO - Reading file C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.fif
2024-11-16 11:35:29,456 - INFO - Finished reading file C:\dev\play\brainwave-data-day\2024-11-15-09-10-50\raw.fif
2024-11-16 11:35:29,520 - INFO - Start date: 2024-11-15 09:10:50.479561+00:00 end 2024-11-15 09:22:06.719561+00:00 duration 0:11:16.240000 channels: ['Fpz-M1'] sfreq: 250.0
2024-11-16 11:35:29,657 - INFO - Memory Usage: 505.71 MB GC to 505.71 MB
2024-11-16 11:35:29,658 - INFO - Saving as EDF
2024-11

Processing directories: 0it [00:00, ?it/s]

Processing directories: 0it [00:00, ?it/s]

Processing directories: 0it [00:00, ?it/s]

ValueError: No objects to concatenate

In [8]:
dataframes

[    epoch                          TimestampUK  Fpz-M1_eeg_abspow  \
 0       0  2024-11-15 09:10:50.479561090+00:00       7.072715e-09   
 1       1  2024-11-15 09:11:20.479561090+00:00       2.076430e-09   
 2       2  2024-11-15 09:11:50.479561090+00:00       4.383291e-09   
 3       3  2024-11-15 09:12:20.479561090+00:00       1.534609e-09   
 4       4  2024-11-15 09:12:50.479561090+00:00       3.456014e-09   
 5       5  2024-11-15 09:13:20.479561090+00:00       1.229371e-09   
 6       6  2024-11-15 09:13:50.479561090+00:00       1.208887e-09   
 7       7  2024-11-15 09:14:20.479561090+00:00       6.455377e-09   
 8       8  2024-11-15 09:14:50.479561090+00:00       6.425787e-09   
 9       9  2024-11-15 09:15:20.479561090+00:00       5.458992e-09   
 10     10  2024-11-15 09:15:50.479561090+00:00       3.031416e-09   
 11     11  2024-11-15 09:16:20.479561090+00:00       2.131056e-09   
 12     12  2024-11-15 09:16:50.479561090+00:00       3.173361e-09   
 13     13  2024-11-

In [9]:
errors

[]

# Recalculate scalings
N.b. can be run frequently but will only be picked up by new runs.  Maybe worth occasionally regenerating all old files.
And yes, for new features have to rerun the pipeline on everything, then generate the stats here, then rerun the pipeline again on everything to have them use those.

In [10]:
from scaling import only_eeg

only_eeg_cols = list(only_eeg(all).columns)
assert any(col.startswith("Main") for col in only_eeg_cols), "No column starting with 'Main' found in only_eeg_cols"

In [11]:
import scaling

stats = scaling.stats(all)
stats.to_csv(input_dir + "/day_stats.csv")
assert any(stats['Column'].str.startswith("Main")), "No row starting with 'Main' found in column_name"
stats

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Unnamed: 0,Column,Mean,P10,P90,Min,Max,StdDev
0,Fpz-M1_eeg_abspow,2.567575e-09,5.564690e-10,4.683932e-09,4.706757e-11,1.712247e-08,1.865730e-09
1,Fpz-M1_eeg_abspow_c7min_norm,-1.651346e-02,-3.526303e-01,2.363201e-01,-8.923340e-01,8.949664e-01,2.959370e-01
2,Fpz-M1_eeg_abspow_p2min_norm,4.754711e-02,-2.935286e-01,4.393382e-01,-5.630067e-01,1.303757e+00,3.177617e-01
3,Fpz-M1_eeg_alpha,1.772328e-02,8.460448e-03,2.751043e-02,3.557820e-03,1.783833e-01,1.459351e-02
4,Fpz-M1_eeg_alpha_c7min_norm,6.357691e-02,-2.245662e-01,3.666280e-01,-5.374830e-01,1.396771e+00,3.086038e-01
...,...,...,...,...,...,...,...
427,Main_eeg_thetaabsaa_c7min_norm_s,5.468189e-01,1.202499e-01,8.495354e-01,-1.014553e+00,1.417242e+00,3.565414e-01
428,Main_eeg_thetaabsaa_p2min_norm_s,3.903255e-01,-9.043346e-02,8.676401e-01,-9.596298e-01,2.102499e+00,4.135871e-01
429,Main_eeg_thetaabsab_s,4.389722e-01,-1.342933e-01,1.038024e+00,-2.479550e-01,5.111267e+00,7.238551e-01
430,Main_eeg_thetaabsab_c7min_norm_s,7.349963e-01,2.462114e-01,1.076950e+00,-1.394602e+00,1.772395e+00,4.446799e-01


# Upload to GCS

In [18]:
from upload import upload_dir_to_gcs_skipping_existing
import os

errors = []
dataframes = []

for root, dirs, files in os.walk(input_dir):
    for dir_name in tqdm(reversed(dirs), desc="Uploading directories", total=len(dirs)):
        full_dir_name = os.path.join(root, dir_name)
        try:
            upload_dir_to_gcs_skipping_existing(log, 'examined-life-input-eeg-day', full_dir_name, dir_name)
        except Exception as e:
            log("Error processing file: " + input_dir)
            log(e)

for error in errors:
    log(error)

log("All uploaded")

Uploading directories:   0%|          | 0/3 [00:00<?, ?it/s]

2024-11-16 13:14:46,571 - INFO - Skipping C:\dev\play\brainwave-data-day\2024-11-15-09-26-33\2024-11-15-09-26-33.brainflow.csv.bz2, identical file already exists in GCS.
2024-11-16 13:14:46,779 - INFO - Skipping C:\dev\play\brainwave-data-day\2024-11-15-09-26-33\raw.edf, identical file already exists in GCS.
2024-11-16 13:14:47,151 - INFO - Skipping C:\dev\play\brainwave-data-day\2024-11-15-09-26-33\raw.fif, identical file already exists in GCS.
2024-11-16 13:14:47,368 - INFO - C:\dev\play\brainwave-data-day\2024-11-15-09-26-33\raw.output.csv uploading to 2024-11-15-09-26-33/raw.output.csv, does_not_exist=False has_changed=True
2024-11-16 13:14:49,101 - INFO - Uploaded directory checksum for C:\dev\play\brainwave-data-day\2024-11-15-09-26-33 to 2024-11-15-09-26-33/directory_checksum.md5
2024-11-16 13:14:50,849 - INFO - Skipping C:\dev\play\brainwave-data-day\2024-11-15-09-22-47\2024-11-15-09-22-47.brainflow.csv.bz2, identical file already exists in GCS.
2024-11-16 13:14:51,291 - INFO -

Uploading directories: 0it [00:00, ?it/s]

Uploading directories: 0it [00:00, ?it/s]

Uploading directories: 0it [00:00, ?it/s]

2024-11-16 13:14:56,114 - INFO - All uploaded


# Check if can delete Brainwave files that are safely backed up

In [13]:

import os

errors = []
can_delete = []
cannot_delete = []

for root, dirs, files in os.walk(input_dir):
    for idx, file_name in enumerate(files):
        full_input_filename = os.path.join(root, file_name)
        if full_input_filename.endswith(".brainflow.csv"):
            compressed_full_output_filename = get_brainflow_compressed_filename(full_input_filename)
                
            if os.path.exists(compressed_full_output_filename):
                can_delete.append({
                    'backed_up': compressed_full_output_filename,
                    'full_filename': full_input_filename
                })
            else:
                cannot_delete.append(full_input_filename)
    

In [14]:
can_delete_df = pd.DataFrame(can_delete)
can_delete_df

Unnamed: 0,backed_up,full_filename
0,C:\dev\play\brainwave-data-day\2024-11-15-09-1...,C:\dev\play\brainwave-data-day\2024-11-15-09-1...
1,C:\dev\play\brainwave-data-day\2024-11-15-09-2...,C:\dev\play\brainwave-data-day\2024-11-15-09-2...
2,C:\dev\play\brainwave-data-day\2024-11-15-09-2...,C:\dev\play\brainwave-data-day\2024-11-15-09-2...
3,C:\dev\play\brainwave-data-day\2024-11-16-10-5...,C:\dev\play\brainwave-data-day\2024-11-16-10-5...


In [15]:
cannot_delete

[]

In [16]:
can_delete_df['full_filename']

0    C:\dev\play\brainwave-data-day\2024-11-15-09-1...
1    C:\dev\play\brainwave-data-day\2024-11-15-09-2...
2    C:\dev\play\brainwave-data-day\2024-11-15-09-2...
3    C:\dev\play\brainwave-data-day\2024-11-16-10-5...
Name: full_filename, dtype: object