In [None]:
# --- paths & setup (portable) ---
from pathlib import Path
import os, json, pickle
import pandas as pd
import numpy as np
from pathlib import Path

from mosaic.path_utils import CFG, raw_path, proc_path, project_root

# Name of this dataset's folder inside the raw Box tree.
DATASET_RAW = "INNERSPEECH"

# Processed data is addressed by the lower-cased dataset name; the
# "preprocessed" subfolder and its "cache" child hold derived artefacts.
LOCAL_DATA_DIR = proc_path(str(DATASET_RAW).lower())
PREPROC_DIR = proc_path(str(DATASET_RAW).lower(), "preprocessed")
CACHE_DIR = PREPROC_DIR / "cache"

# Repository root, handy when importing local modules.
ROOT = project_root()

# Create the output folders up front (no error if they already exist).
for _out_dir in (PREPROC_DIR, CACHE_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

RAW_DIR = raw_path(DATASET_RAW)

# Echo every resolved location so a misconfigured machine is obvious at a glance.
for _label, _location in (
    ("BOX_ROOT  :", CFG["box_root"]),
    ("RAW_DIR   :", RAW_DIR),
    ("LOCAL_DATA:", LOCAL_DATA_DIR),
    ("PREPROC   :", PREPROC_DIR),
    ("CACHE_DIR :", CACHE_DIR),
):
    print(_label, _location)


BOX_ROOT  : /Users/rb666/Library/CloudStorage/Box-Box/TMDATA
RAW_DIR   : /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/INNERSPEECH
LOCAL_DATA: /Users/rb666/Projects/MOSAIC/DATA/innerspeech
PREPROC   : /Users/rb666/Projects/MOSAIC/DATA/innerspeech/preprocessed
CACHE_DIR : /Users/rb666/Projects/MOSAIC/DATA/innerspeech/preprocessed/cache


In [None]:
# --- Load reflection reports CSV ---
# The file is named after the lower-cased dataset and sits in the local data folder.
reports_filename = f"{str(DATASET_RAW).lower()}_reflection_reports.csv"
csv_path = os.path.join(LOCAL_DATA_DIR, reports_filename)
print("CSV:", csv_path)

# Stop early with an explicit message when the reports file is absent.
csv_is_present = Path(csv_path).exists()
if not csv_is_present:
    raise FileNotFoundError(f"Missing file: {csv_path}")

# Only the free-text answers are needed, so read just the reflection_answer column.
df = pd.read_csv(csv_path, usecols=["reflection_answer"])
n_reports = len(df)
print(f"Loaded {n_reports} reports from {csv_path}")
print(df.shape)
df.head()


CSV: /Users/rb666/Projects/MOSAIC/DATA/innerspeech/innerspeech_reflection_reports.csv
Loaded 731 reports from /Users/rb666/Projects/MOSAIC/DATA/innerspeech/innerspeech_reflection_reports.csv
(731, 1)


Unnamed: 0,reflection_answer
0,頭の中の独り言をこのような調査で改めて自覚することができ、また色々なパターンがあることを知り...
1,他人の声が脳内でしている人がいるという話にすごく興味があるのですが、心理物理実験で音声のパラ...
2,頭の中では日本語で考えているという自覚はある（英語は勉強以外にほぼ使ったことはない）が、文字...
3,自動思考というものなのか、直近で起きた失敗などを批判する考えが勝手に浮かんできたりすることが...
4,食べたいもの、欲しいものなどは、自分がそれを食べている、或いは使っているところを想像して決め...


### Translate the multilingual reports CSV into English (using the Gemini API)

In [None]:




import pandas as pd
import google.generativeai as genai
from google.api_core import exceptions
import os
from dotenv import load_dotenv
from tqdm import tqdm
import time
import json
import numpy as np
import random

# --- SETUP AND CONFIGURATION ---

# The API key is read from the environment (optionally populated from a .env
# file) so that no credential is stored in the notebook itself.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set the GOOGLE_API_KEY in your .env file.")
genai.configure(api_key=api_key)
# NOTE(review): confirm this model id is still served by the API before a full run.
model = genai.GenerativeModel('gemini-1.5-flash')

# Read the same reports file the loading cell above uses, resolved through the
# portable project config. The previous user-specific absolute path only
# existed on one machine; elsewhere the load failed and df was silently
# replaced by an empty frame.
csv_path = os.path.join(LOCAL_DATA_DIR, f"{str(DATASET_RAW).lower()}_reflection_reports.csv")
try:
    df = pd.read_csv(csv_path)
    print(f"Successfully loaded {len(df)} rows to be translated from {csv_path}")
except FileNotFoundError:
    print(f"Error: The file could not be found at {csv_path}")
    # Keep the expected column so the translation step can detect "nothing to do".
    df = pd.DataFrame({'reflection_answer': []})

# --- BATCH TRANSLATION FUNCTION WITH EXPONENTIAL BACKOFF ---

def translate_batch_with_retry(texts: list[str], max_retries: int = 3) -> list[str]:
    """
    Translates a BATCH of texts, with automatic retries for rate limit errors.

    The texts are sent to the module-level ``model`` as one numbered JSON
    object, and the reply is expected to be a JSON object with the same keys
    ("1", "2", ...) mapping to the English translations.

    Args:
        texts: Texts to translate, in order. An empty list returns ``[]``
            without calling the API.
        max_retries: Maximum number of attempts when the API raises
            ``ResourceExhausted`` (rate limiting).

    Returns:
        A list with exactly ``len(texts)`` strings, in input order. A key
        missing from the reply yields "Error: Missing translation"; any other
        failure yields an "Error: ..." placeholder for every item of the batch.
    """
    n_texts = len(texts)
    if n_texts == 0:
        return []

    # json.dumps escapes quotes and newlines inside the reports, so the payload
    # is always one valid JSON object; ensure_ascii=False keeps the Japanese
    # text readable instead of turning it into \uXXXX escapes.
    numbered_texts = json.dumps(
        {str(i + 1): str(text) for i, text in enumerate(texts)},
        ensure_ascii=False,
        indent=2,
    )
    prompt = f"""Translate each of the following numbered Japanese texts to English.
Please return the result as a single, valid JSON object where keys are the numbers and values are the English translations.
The JSON object should have exactly {n_texts} elements. Do not include any other explanatory text in your response.

TEXTS TO TRANSLATE:
{numbered_texts}
"""

    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            # The model often wraps its answer in a Markdown code fence.
            cleaned_response_text = response.text.strip().replace("```json", "").replace("```", "").strip()
            translated_dict = json.loads(cleaned_response_text)
            if not isinstance(translated_dict, dict):
                raise ValueError("Expected a JSON object keyed by text number")
            # Look up each expected key so the result always has one entry per
            # input text, in input order, whatever the model returned.
            return [
                translated_dict.get(str(i + 1), "Error: Missing translation")
                for i in range(n_texts)
            ]

        except exceptions.ResourceExhausted:
            print(f"Rate limit exceeded. (Attempt {attempt + 1}/{max_retries})")
            # Only wait when another attempt will actually follow.
            if attempt + 1 < max_retries:
                retry_after = 15 * (2 ** attempt) + random.uniform(0, 1)  # Exponential backoff with jitter
                print(f"Waiting for {retry_after:.2f} seconds.")
                time.sleep(retry_after)

        except Exception as e:
            print(f"An unexpected error occurred during a batch translation: {e}")
            return [f"Error: {e}"] * n_texts

    print("All retries failed for this batch.")
    return ["Error: Max retries exceeded"] * n_texts

# --- EXECUTE THE BATCHED TRANSLATION ---
if not df.empty:
    BATCH_SIZE = 20

    # Only non-null reports are sent for translation. Their index labels are
    # kept so that every translation is written back to the row it came from;
    # re-numbering from 0 would shift translations onto the wrong rows as soon
    # as one report is missing.
    texts_to_translate = df['reflection_answer'].dropna()
    batch_starts = range(0, len(texts_to_translate), BATCH_SIZE)
    print(f"Split {len(texts_to_translate)} entries into {len(batch_starts)} batches of up to {BATCH_SIZE} each.")

    all_translations = []
    for start in tqdm(batch_starts, desc="Translating Batches"):
        batch = texts_to_translate.iloc[start:start + BATCH_SIZE]
        all_translations.extend(translate_batch_with_retry(batch.tolist()))
        time.sleep(1)  # small pause between requests to stay under the rate limit

    # Rows without a source text keep NaN in the translated column.
    df['reflection_answer_english'] = pd.Series(
        all_translations, index=texts_to_translate.index, dtype=object
    )

    # --- REVIEW AND SAVE RESULTS ---
    print("\n--- Translation Results (First 5 Rows) ---")
    print(df[['reflection_answer', 'reflection_answer_english']].head())

    # Save next to the other processed artefacts instead of a user-specific absolute path.
    output_path = PREPROC_DIR / f"{str(DATASET_RAW).lower()}_translated_batched.csv"
    df.to_csv(output_path, index=False)
    print(f"\nTranslated data saved to {output_path}")
else:
    print("DataFrame is empty, skipping translation.")

In [None]:
# Spot-check the first reports next to their translations. Iterating over
# head(10) works by position, so it also copes with fewer than 10 rows or a
# non-default index (df.loc[i] raised KeyError in those cases).
preview = df[['reflection_answer', 'reflection_answer_english']].head(10)
for i, (original, translated) in enumerate(preview.itertuples(index=False, name=None)):
    print(f"--- Document {i} ---")
    print("Original:", original)
    print("Translated:", translated)
    print()

### Divide into sentences

In [None]:
import nltk
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab', older ones
# from 'punkt'; fetching both keeps sent_tokenize working on either version.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

# ----------------------------------------
raw_reports = df['reflection_answer_english'].tolist()
# Rows without a translation hold NaN (a float), which sent_tokenize cannot
# handle. They are kept as empty reports so list positions still match df rows.
reports = [report if isinstance(report, str) else "" for report in raw_reports]
n_missing = sum(not isinstance(report, str) for report in raw_reports)
if n_missing:
    print(f"{n_missing} report(s) have no translated text and are treated as empty.")
print(f"Loaded {len(reports)} (translated) documents for BERTopic modeling.")
# ----------------------------------------
# Divide each report into sentences
reports_sentences = [nltk.sent_tokenize(report) if report else [] for report in reports]

# Keep the number of sentences of each report (mapping used in further analysis)
sentences_per_report = [len(report) for report in reports_sentences]
print(f"Number of sentences in each report (mapping): {sentences_per_report}")
print(f"Total number of sentences: {sum(sentences_per_report)}")


# Flatten to one list of sentences across all reports
all_sentences = [sentence for report in reports_sentences for sentence in report]
print(f"Total number of sentences across all reports: {len(all_sentences)}") #sanity check, should match the sum above


In [None]:
import matplotlib.pyplot as plt

# Summary statistics of how many sentences each report contains
sentences_array = np.array(sentences_per_report)
mean_sentences = sentences_array.mean()
median_sentences = np.median(sentences_array)
std_sentences = sentences_array.std()
min_sentences = sentences_array.min()
max_sentences = sentences_array.max()

print(f"Mean sentences per report: {mean_sentences:.2f}")
print(f"Median sentences per report: {median_sentences}")
print(f"Standard deviation: {std_sentences:.2f}")
print(f"Minimum sentences in a report: {min_sentences}")
print(f"Maximum sentences in a report: {max_sentences}")


# One figure, two panels: histogram on the left, horizontal boxplot on the right
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(sentences_array, bins=30, color='skyblue', edgecolor='black')
ax_hist.set_title('Histogram of Sentences per Report')
ax_hist.set_xlabel('Number of Sentences')
ax_hist.set_ylabel('Frequency')

ax_box.boxplot(sentences_array, vert=False)
ax_box.set_title('Boxplot of Sentences per Report')
ax_box.set_xlabel('Number of Sentences')

fig.tight_layout()
plt.show()

In [None]:
# Outlier fences from the interquartile range: anything further than
# 1.5 * IQR below Q1 or above Q3 counts as an outlier.
Q1, Q3 = np.percentile(sentences_array, [25, 75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Positions of the reports whose sentence count falls outside the fences
is_outlier = (sentences_array < lower_bound) | (sentences_array > upper_bound)
outlier_indices = np.flatnonzero(is_outlier)

# How many reports were flagged
print(f"Number of outlier reports (by sentence count): {len(outlier_indices)}")

# Show every flagged report sentence by sentence
for idx, n_sentences in zip(outlier_indices, sentences_array[outlier_indices]):
    print(f"\nReport index: {idx}, Sentence count: {n_sentences}")
    print("Sentences:")
    for sent in reports_sentences[idx]:
        print(f"- {sent}")