In [5]:
import os
import sys
import numpy as np
import pandas as pd

dataset = "dreamachine"

print(f"Current working directory: {os.getcwd()}")
BOX_DIR = os.path.join(os.path.expanduser("~"), "Library", "CloudStorage", "Box-Box", "TMDATA")
print(f"Retrieving data from BOX, locally stored at: {BOX_DIR}")
DATA_DIR = os.path.join(BOX_DIR, dataset)
print(f"Data directory: {DATA_DIR}")


reports_file = os.listdir(DATA_DIR)
print(f"Files for {dataset} dataset (n={len(os.listdir(DATA_DIR))}): {os.listdir(DATA_DIR)}")


# load only the 'reflection_answer' column from each CSV file for HS and DL
hs_reflection = pd.read_csv(os.path.join(DATA_DIR, 'freeform_HS_SensoryTool_complete.csv'), usecols=['reflection_answer']).dropna()
dl_reflection = pd.read_csv(os.path.join(DATA_DIR, 'freeform_DL_SensoryTool_complete.csv'), usecols=['reflection_answer']).dropna()

print("HS reflection_answer shape:", hs_reflection.shape)
print("DL reflection_answer shape:", dl_reflection.shape)

Current working directory: /Users/rb666/Projects/MOSAIC/DATA/multilingual/english/dreamachine
Retrieving data from BOX, locally stored at: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA
Data directory: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/dreamachine
Files for dreamachine dataset (n=5): ['freeform_HS_SensoryTool_complete.csv', 'freeform_DL_SensoryTool_complete.csv', 'SensoryTool_CombinedData_v24_20230912_2.xlsx', 'HS_reflections_APIcleaned.csv', 'DL_reflections_APIcleaned.csv']
HS reflection_answer shape: (336, 1)
DL reflection_answer shape: (98, 1)


In [6]:
hs_reflection

Unnamed: 0,reflection_answer
39,Intense chaos. And then my mind checked out an...
44,good stuff.\n
58,a pattern of red and white lights thatflashed ...
81,i wentback to many hard and mostly beautiful m...
88,Hello
...,...
4971,Changing temperature of my body with the light...
4975,pleasure & intrigue
4981,travelling through space\n
4985,i thought alot about rands journey into rhudia...


In [7]:

from dotenv import load_dotenv
import google.generativeai as genai
from tqdm import tqdm
import json

load_dotenv()

api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set the GOOGLE_API_KEY environment variable in our .env file.")
    sys.exit(1) 
genai.configure(api_key=api_key)

model =genai.GenerativeModel('gemini-1.5-flash')


def clean_batch_with_gemini(texts: list[str]) -> list[str]:
    """
    Uses the Gemini API to clean a BATCH of text reports in a single API call.

    Args:
        texts: A list of raw text strings.

    Returns:
        A list of cleaned text strings.
    """
    # Create a numbered list of the texts to include in the prompt.
    numbered_texts = "\n".join([f"{i+1}. {text}" for i, text in enumerate(texts)])
    
    # detailed prompt to instructs the AI on how to process the batch and how to format the response as a JSON array.
    prompt = f"""
Please act as a data cleaning expert. Your task is to clean each of the following numbered texts.

Follow these rules precisely:
1.  For each text, correct spelling mistakes, fix grammar, and remove artifacts like '\\n'.
2.  Do NOT change the original meaning or remove punctuation.
3.  Return the result as a single, valid JSON array of strings.
4.  The JSON array must have exactly {len(texts)} elements, where each string is a cleaned version of the corresponding input text.
5.  Do not include the numbers or any other commentary in your output, only the JSON array.

TEXTS TO CLEAN:
---
{numbered_texts}
---
"""
    
    try:
        response = model.generate_content(prompt)
        # Clean the response to ensure it's a valid JSON string
        cleaned_response_text = response.text.strip().replace("```json", "").replace("```", "")
        
        # Parse the JSON string into a Python list
        cleaned_texts = json.loads(cleaned_response_text)
        
        if len(cleaned_texts) == len(texts):
            return cleaned_texts
        else:
            print(f"Warning: Mismatch in batch response size. Expected {len(texts)}, got {len(cleaned_texts)}.")
            return ["Error: Mismatch in batch response"] * len(texts)
            
    except Exception as e:
        print(f"An error occurred during a batch cleaning: {e}")
        return [f"Error: {e}"] * len(texts)


In [8]:
#execute the cleaning in batches
BATCH_SIZE = 20 
all_cleaned_hs = []
all_cleaned_dl = []

# --- Process the HS reports ---
print(f"Processing {len(hs_reflection)} HS reports in batches of {BATCH_SIZE}...")
# split the DataFrame into batches
hs_batches = np.array_split(hs_reflection['reflection_answer'], len(hs_reflection) // BATCH_SIZE)

# Process each batch of HS reflections and collect cleaned results
for batch in tqdm(hs_batches, desc="Cleaning HS batches"):
    cleaned = clean_batch_with_gemini(batch.tolist())
    all_cleaned_hs.extend(cleaned)

hs_reflection['cleaned_reflection'] = all_cleaned_hs

# --- Process the DL reports ---
print(f"Processing {len(dl_reflection)} DL reports in batches of {BATCH_SIZE}...")
dl_batches = np.array_split(dl_reflection['reflection_answer'], len(dl_reflection) // BATCH_SIZE)
for batch in tqdm(dl_batches, desc="Cleaning DL batches"):
    cleaned = clean_batch_with_gemini(batch.tolist())
    all_cleaned_dl.extend(cleaned)

dl_reflection['cleaned_reflection'] = all_cleaned_dl

  return bound(*args, **kwds)


Processing 336 HS reports in batches of 20...


Cleaning HS batches: 100%|██████████| 16/16 [03:39<00:00, 13.74s/it]
  return bound(*args, **kwds)


Processing 98 DL reports in batches of 20...


Cleaning DL batches: 100%|██████████| 4/4 [00:30<00:00,  7.71s/it]


In [9]:
# Review the results: show a summary and a few sample cleaned reflections for both HS and DL

print("HS Reflection DataFrame:")
display(hs_reflection.head(10))
print(f"\nTotal HS reflections: {len(hs_reflection)}")

print("\nDL Reflection DataFrame:")
display(dl_reflection.head(10))
print(f"\nTotal DL reflections: {len(dl_reflection)}")


print("\nSample cleaned HS reflections:")
print(hs_reflection['cleaned_reflection'].sample(5, random_state=42).to_list())

print("\nSample cleaned DL reflections:")
print(dl_reflection['cleaned_reflection'].sample(5, random_state=42).to_list())

HS Reflection DataFrame:


Unnamed: 0,reflection_answer,cleaned_reflection
39,Intense chaos. And then my mind checked out an...,Intense chaos. And then my mind checked out an...
44,good stuff.\n,Good stuff.
58,a pattern of red and white lights thatflashed ...,A pattern of red and white lights that flashed...
81,i wentback to many hard and mostly beautiful m...,I went back to many hard and mostly beautiful ...
88,Hello,Hello.
89,Hope as a colour,Hope as a color.
101,dreaming while awake- flashes of random places...,Dreaming while awake—flashes of random places ...
118,\nBeing: immersed; calm; and thrilled.,Being: immersed; calm; and thrilled.
134,life after retired\n,Life after retirement.
135,i dreamt myself as a harbinger of the new plan...,I dreamt myself as a harbinger of the new plan...



Total HS reflections: 336

DL Reflection DataFrame:


Unnamed: 0,reflection_answer,cleaned_reflection
1,thecoloursmade patterns when i had my eyes clo...,The colours made patterns when I had my eyes c...
19,not sure what the puroose of it was except bei...,Not sure what the purpose of it was except bei...
71,thus should be available for everyone all the ...,This should be available for everyone all the ...
100,"detachment as though my body was in one box, m...","Detachment, as though my body was in one box, ..."
111,a doorway,A doorway.
116,"some visuals of trees, heat,sun likeexperience\n","Some visuals of trees, heat, sun-like experience."
121,Peacefulness and calm. At times I felt transpo...,Peacefulness and calm. At times I felt transpo...
132,"relaxed and peaceful, felt the beats of the mu...",Relaxed and peaceful; felt the beats of the mu...
134,i was struggling because my mind would go back...,I was struggling because my mind would go back...
157,"It was a bit weird, i know that mybrain is try...",It was a bit weird; I know that my brain is tr...



Total DL reflections: 98

Sample cleaned HS reflections:
['Trying to remember the patterns.', 'Strobe lighting, very orange and yellow dominant colours, like the old kaleidoscopes we had as kids. Main experience was visual; no other senses involved.', 'I also experienced ghost people in my peripheral vision. Sometimes fleeting, or I felt they were walking around the room. Wispy and milky in appearance. I am also dyslexic, and it was interesting to experience as a neurodivergent-brained person.', 'Sea of green stars moving like oil on water, fading. Glimpse of something disappearing.\n\nShifting to sharp angled reds, bold lines intersecting like a gate on black.\n\nSwirling spheres of orange, turning into a spiral. When a sense of what is happening seems to arrive, the vision vibrates and the sense leaves.', 'Lots of colors.']

Sample cleaned DL reflections:
['My visual experience would change a lot with the music. I became more and more relaxed as the music progressed further with its

In [10]:
#save into CSV files
hs_reflection.to_csv(os.path.join(DATA_DIR, 'HS_reflections_APIcleaned.csv'), index=False)
dl_reflection.to_csv(os.path.join(DATA_DIR, 'DL_reflections_APIcleaned.csv'), index=False)


In [11]:
#check where it has been saved
print(f"Cleaned HS reflections saved to: {os.path.join(DATA_DIR, 'HS_reflections_APIcleaned.csv')}")

Cleaned HS reflections saved to: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/dreamachine/HS_reflections_APIcleaned.csv


In [12]:
hs_reflection

Unnamed: 0,reflection_answer,cleaned_reflection
39,Intense chaos. And then my mind checked out an...,Intense chaos. And then my mind checked out an...
44,good stuff.\n,Good stuff.
58,a pattern of red and white lights thatflashed ...,A pattern of red and white lights that flashed...
81,i wentback to many hard and mostly beautiful m...,I went back to many hard and mostly beautiful ...
88,Hello,Hello.
...,...,...
4971,Changing temperature of my body with the light...,Changing temperature of my body with the light...
4975,pleasure & intrigue,Pleasure & intrigue.
4981,travelling through space\n,Travelling through space.
4985,i thought alot about rands journey into rhudia...,I thought a lot about Rand's journey into Rhui...
