In [1]:
# --- paths & setup (portable) ---
from pathlib import Path
import os, json, pickle
import pandas as pd
import numpy as np
from pathlib import Path

from mosaic.path_utils import CFG, raw_path, proc_path, project_root

# Name of this dataset's RAW folder on Box; its lower-cased form names the local folders
DATASET_RAW = "INNERSPEECH"

# Processed-data locations (e.g. .../DATA/innerspeech and .../DATA/innerspeech/preprocessed)
LOCAL_DATA_DIR = proc_path(DATASET_RAW.lower())
PREPROC_DIR = proc_path(DATASET_RAW.lower(), "preprocessed")
CACHE_DIR = PREPROC_DIR / "cache"

# Repository root, handy when importing local modules
ROOT = project_root()

# Create the output folders up front so later cells can write into them
for out_dir in (PREPROC_DIR, CACHE_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

RAW_DIR = raw_path(DATASET_RAW)

# Echo the resolved locations for a quick visual check
for label, location in (
    ("BOX_ROOT  :", CFG["box_root"]),
    ("RAW_DIR   :", RAW_DIR),
    ("LOCAL_DATA:", LOCAL_DATA_DIR),
    ("PREPROC   :", PREPROC_DIR),
    ("CACHE_DIR :", CACHE_DIR),
):
    print(label, location)


BOX_ROOT  : /Users/rb666/Library/CloudStorage/Box-Box/TMDATA
RAW_DIR   : /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/INNERSPEECH
LOCAL_DATA: /Users/rb666/Projects/MOSAIC/DATA/innerspeech
PREPROC   : /Users/rb666/Projects/MOSAIC/DATA/innerspeech/preprocessed
CACHE_DIR : /Users/rb666/Projects/MOSAIC/DATA/innerspeech/preprocessed/cache


In [2]:
# --- Load reflection reports CSV ---
csv_name = f"{DATASET_RAW.lower()}_reflection_reports.csv"
csv_path = os.path.join(LOCAL_DATA_DIR, csv_name)
print("CSV:", csv_path)

# Stop early with an explicit message when the input file is absent
if not Path(csv_path).exists():
    raise FileNotFoundError(f"Missing file: {csv_path}")

# Read only the "reflection_answer" column
df = pd.read_csv(csv_path, usecols=["reflection_answer"])
n_reports = len(df)
print(f"Loaded {n_reports} reports from {csv_path}")
print(df.shape)
df.head()


CSV: /Users/rb666/Projects/MOSAIC/DATA/innerspeech/innerspeech_reflection_reports.csv
Loaded 731 reports from /Users/rb666/Projects/MOSAIC/DATA/innerspeech/innerspeech_reflection_reports.csv
(731, 1)


Unnamed: 0,reflection_answer
0,頭の中の独り言をこのような調査で改めて自覚することができ、また色々なパターンがあることを知り...
1,他人の声が脳内でしている人がいるという話にすごく興味があるのですが、心理物理実験で音声のパラ...
2,頭の中では日本語で考えているという自覚はある（英語は勉強以外にほぼ使ったことはない）が、文字...
3,自動思考というものなのか、直近で起きた失敗などを批判する考えが勝手に浮かんできたりすることが...
4,食べたいもの、欲しいものなどは、自分がそれを食べている、或いは使っているところを想像して決め...


### Sample translation: multilingual .csv file into English (using the Gemini API)

In [None]:
import google.generativeai as genai
from google.api_core import exceptions
from dotenv import load_dotenv
from tqdm import tqdm
import time
import json
import random

# --- SETUP AND CONFIGURATION ---
# Read variables from a .env file into the environment, then fetch the Gemini key
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("API key not found. Please set the GOOGLE_API_KEY in your .env file.")

def _configure_gemini(api_key: str):
    """Configure the ``google.generativeai`` SDK with ``api_key``.

    The call first tries to pin the API version to ``"v1"``. When the
    installed SDK rejects the ``api_version`` keyword with a ``TypeError``,
    it is configured again with the key only.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
    """
    try:
        genai.configure(api_key=api_key, api_version="v1")
    except TypeError:
        # This SDK build does not accept `api_version`; rely on its default.
        genai.configure(api_key=api_key)
    # The earlier "sanity" lookup of "models/gemini-info" was removed: its
    # result was never used and every failure was silently swallowed, so it
    # only added a network round-trip per configuration.

# Hand the API key to the SDK before any model is listed or instantiated.
_configure_gemini(api_key)



def pick_model(preferred=(
    # current-generation names, tried first
    "gemini-2.5-flash",
    "gemini-flash-latest",
    "gemini-2.5-pro",
    # previous generation, kept as fallbacks
    "gemini-2.0-flash",
    "gemini-pro-latest",
    # legacy aliases, tried last
    "gemini-1.5-flash-002",
    "gemini-1.5-pro-002",
)):
    """Choose the Gemini model name to use for text generation.

    Models reported by ``genai.list_models()`` are kept when they support
    ``generateContent``. The first name in ``preferred`` that is among them
    wins. If listing fails or yields nothing, the first preferred name is
    returned unverified.

    Args:
        preferred: Model names (without the ``models/`` prefix) in priority order.

    Returns:
        The selected model name as a string.
    """
    usable = []  # short names, in the order the SDK reported them
    try:
        for entry in genai.list_models():
            supported = set(getattr(entry, "supported_generation_methods", []) or [])
            short_name = entry.name.rsplit("/", 1)[-1]
            if "generateContent" in supported and short_name not in usable:
                usable.append(short_name)
                # uncomment to inspect what is available
                # print(f"- {short_name} supports: {sorted(supported)}")
    except Exception as e:
        print("Could not list models:", e)

    if not usable:
        # Nothing could be verified: trust the first preferred name, if any.
        for name in preferred:
            print("Using model:", name)
            return name
        print("Falling back to gemini-2.5-flash")
        return "gemini-2.5-flash"

    # 1) first preferred name that the SDK actually exposes
    for name in preferred:
        if name in usable:
            print("Using model:", name)
            return name

    # 2) no preferred name matched: try the fast defaults, then anything usable
    for candidate in ("gemini-2.5-flash", "gemini-flash-latest"):
        if candidate in usable:
            print("Using model:", candidate)
            return candidate

    picked = usable[0]
    print("Using available model:", picked)
    return picked

# Resolve the model name once and build the module-level `model` object
# whose generate_content() is called by translate_batch_with_retry.
chosen_model_name = pick_model()
model = genai.GenerativeModel(model_name=chosen_model_name)


Using model: gemini-2.5-flash


In [6]:
# --- BATCH TRANSLATION FUNCTION WITH EXPONENTIAL BACKOFF ---
def translate_batch_with_retry(texts: list[str], max_retries: int = 3, client=None) -> list[str]:
    """
    Translate a batch of texts to English with a single Gemini request.

    The texts are sent as a JSON object keyed "1".."N" and the reply is
    expected to be a JSON object with the same keys. Rate-limit errors are
    retried with exponential backoff plus jitter; replies that are not usable
    JSON are retried immediately.

    Args:
        texts: Source texts to translate. An empty list returns ``[]``
            without calling the API.
        max_retries: Maximum number of requests made for this batch.
        client: Optional object exposing ``generate_content(prompt)`` whose
            result has a ``.text`` attribute. Defaults to the module-level
            ``model``.

    Returns:
        A list with one entry per input text, in input order. Entries are
        the translations, or an ``"Error: ..."`` placeholder when a
        translation is missing or the batch failed.
    """
    if not texts:
        return []
    if client is None:
        client = model

    # Serialise with json.dumps so quotes, backslashes and newlines inside a
    # report cannot break the structure of the numbered list in the prompt.
    numbered = {str(i + 1): text for i, text in enumerate(texts)}
    payload = json.dumps(numbered, ensure_ascii=False, indent=2)
    prompt = f"""Translate each of the following numbered Japanese texts to English.
Please return the result as a single, valid JSON object where keys are the numbers and values are the English translations.
The JSON object should have exactly {len(texts)} elements. Do not include any other explanatory text in your response.

TEXTS TO TRANSLATE:
{payload}
"""

    for attempt in range(max_retries):
        try:
            response = client.generate_content(prompt)
            raw = response.text
        except exceptions.ResourceExhausted:
            if attempt + 1 >= max_retries:
                # No further request will follow, so waiting would be wasted time.
                print(f"Rate limit exceeded (attempt {attempt+1}/{max_retries}); giving up on this batch.")
                break
            wait_s = 15 * (2 ** attempt) + random.uniform(0, 1)  # backoff + jitter
            print(f"Rate limit exceeded. Waiting {wait_s:.1f}s (attempt {attempt+1}/{max_retries})")
            time.sleep(wait_s)
            continue
        except Exception as e:
            print(f"Unexpected error during batch translation: {e}")
            return [f"Error: {e}"] * len(texts)

        # Drop markdown code fences the model may wrap around the JSON.
        cleaned = raw.strip().replace("```json", "").replace("```", "")
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as e:
            print(f"Malformed JSON in batch response (attempt {attempt+1}/{max_retries}): {e}")
            continue

        if isinstance(parsed, dict):
            return [parsed.get(str(i + 1), "Error: Missing translation") for i in range(len(texts))]
        if isinstance(parsed, list) and len(parsed) == len(texts):
            # Same number of items as inputs: take them in order.
            return list(parsed)
        print(f"Unexpected JSON structure in batch response (attempt {attempt+1}/{max_retries})")

    print("All retries failed for this batch.")
    return ["Error: Max retries exceeded"] * len(texts)



# --- EXECUTE THE BATCHED TRANSLATION ---
if df.empty:
    print("DataFrame is empty, skipping translation.")
else:
    BATCH_SIZE = 20

    # Only non-missing answers are sent; their index is kept so the results
    # can be written back to the matching rows afterwards.
    series = df['reflection_answer'].dropna().astype(str)

    # Consecutive slices of at most BATCH_SIZE answers
    batches = []
    for start in range(0, len(series), BATCH_SIZE):
        batches.append(series.iloc[start:start + BATCH_SIZE])
    print(f"Split {len(series)} entries into {len(batches)} batches of up to {BATCH_SIZE} each.")

    all_translations = []
    for batch in tqdm(batches, desc="Translating Batches"):
        translations = translate_batch_with_retry(batch.tolist())
        all_translations += translations
        time.sleep(1)  # short pause between requests

    # Write the translations next to the rows they came from
    df = df.copy()
    df.loc[series.index, 'reflection_answer_english'] = all_translations

    # --- REVIEW AND SAVE RESULTS ---
    print("\n--- Translation Results (First 5 Rows) ---")
    print(df[['reflection_answer', 'reflection_answer_english']].head())

    # Store the translated table in the preprocessed folder
    translated_csv = PREPROC_DIR / "innerspeech_translated_batched_API.csv"
    df.to_csv(translated_csv, index=False)
    print(f"\nTranslated data saved to {translated_csv}")

Split 731 entries into 37 batches of up to 20 each.


Translating Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Translating Batches: 100%|██████████| 37/37 [18:02<00:00, 29.26s/it]


--- Translation Results (First 5 Rows) ---
                                   reflection_answer  \
0  頭の中の独り言をこのような調査で改めて自覚することができ、また色々なパターンがあることを知り...   
1  他人の声が脳内でしている人がいるという話にすごく興味があるのですが、心理物理実験で音声のパラ...   
2  頭の中では日本語で考えているという自覚はある（英語は勉強以外にほぼ使ったことはない）が、文字...   
3  自動思考というものなのか、直近で起きた失敗などを批判する考えが勝手に浮かんできたりすることが...   
4  食べたいもの、欲しいものなどは、自分がそれを食べている、或いは使っているところを想像して決め...   

                           reflection_answer_english  
0  Through this survey, I was able to re-recogniz...  
1  I'm very interested in the idea that some peop...  
2  I am aware that I think in Japanese in my head...  
3  Perhaps it's automatic thought, but ideas crit...  
4  When deciding what I want to eat or what I wan...  

Translated data saved to /Users/rb666/Projects/MOSAIC/DATA/innerspeech/preprocessed/innerspeech_translated_batched_API.csv





In [None]:
# Print the first few originals side by side with their translations
preview_n = min(10, len(df))
for i in range(preview_n):
    row = df.iloc[i]
    orig = row['reflection_answer']
    # The translation column may be absent if the translation cell was skipped
    trans = row.get('reflection_answer_english', None)
    if not isinstance(trans, str):
        trans = "(no translation)"
    print(f"--- Document {i} ---")
    print("Original :", str(orig))
    print("Translated:", trans)
    print()

### Divide into sentences

In [None]:
# import nltk
# nltk.download('punkt')

# # ----------------------------------------
# reports = df['reflection_answer_english'].tolist()
# print(f"Loaded {len(reports)} (translated) documents for BERTopic modeling.")
# # ----------------------------------------
# # Divide each report into sentences
# reports_sentences = [nltk.sent_tokenize(report) for report in reports]

# # Calculate the total number of sentences
# sentences_per_report = [len(report) for report in reports_sentences] #keep track of the number of sentences in each report (for further analysis)
# print(f"Number of sentences in each report (mapping): {sentences_per_report}")
# print(f"Total number of sentences: {sum(sentences_per_report)}")


# all_sentences = [sentence for report in reports_sentences for sentence in report]
# print(f"Total number of sentences across all reports: {len(all_sentences)}") #sanity check, should match the sum above

# %% [markdown]
# ### Divide into sentences

# %%
import nltk
# Make sure the Punkt sentence-tokenizer data is installed. Recent NLTK
# releases load "punkt_tab" inside sent_tokenize while older ones load
# "punkt", so checking only "punkt" can still end in a LookupError below.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

# Choose text source: prefer translated, else fall back to original
if 'reflection_answer_english' in df.columns and df['reflection_answer_english'].notna().any():
    reports = df['reflection_answer_english'].fillna("").astype(str).tolist()
    print("Using translated English text for sentence splitting.")
else:
    # NOTE(review): sent_tokenize is called with its default language here, so
    # the untranslated fallback is presumably split poorly - confirm before use.
    reports = df['reflection_answer'].fillna("").astype(str).tolist()
    print("Using original text for sentence splitting.")

print(f"Loaded {len(reports)} documents for sentence splitting.")

# One list of sentences per report (an empty report yields an empty list)
reports_sentences = [nltk.sent_tokenize(report) for report in reports]

# Sentence counts per report, kept as the report -> sentences mapping
sentences_per_report = [len(report) for report in reports_sentences]
total_sentences = sum(sentences_per_report)
print(f"Number of sentences in each report (mapping): {sentences_per_report}")
print(f"Total number of sentences: {total_sentences}")

# Flatten to a single list of sentences in report order
all_sentences = [sentence for report in reports_sentences for sentence in report]
print(f"Total number of sentences across all reports: {len(all_sentences)}")  # sanity check

# Cache the sentences for downstream steps (object array, so loading it
# back requires np.load(..., allow_pickle=True))
np.save(CACHE_DIR / "docs_sentences.npy", np.array(all_sentences, dtype=object))
print("Saved sentences →", CACHE_DIR / "docs_sentences.npy")



In [None]:
import matplotlib.pyplot as plt

# Summary statistics for the number of sentences per report
sentences_array = np.array(sentences_per_report)
mean_sentences = np.mean(sentences_array)
median_sentences = np.median(sentences_array)
std_sentences = np.std(sentences_array)
min_sentences = np.min(sentences_array)
max_sentences = np.max(sentences_array)

print(f"Mean sentences per report: {mean_sentences:.2f}")
print(f"Median sentences per report: {median_sentences}")
print(f"Standard deviation: {std_sentences:.2f}")
print(f"Minimum sentences in a report: {min_sentences}")
print(f"Maximum sentences in a report: {max_sentences}")


# Two panels side by side: histogram (left) and horizontal boxplot (right)
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(sentences_array, bins=30, color='skyblue', edgecolor='black')
ax_hist.set_title('Histogram of Sentences per Report')
ax_hist.set_xlabel('Number of Sentences')
ax_hist.set_ylabel('Frequency')

ax_box.boxplot(sentences_array, vert=False)
ax_box.set_title('Boxplot of Sentences per Report')
ax_box.set_xlabel('Number of Sentences')

fig.tight_layout()

# Save BEFORE showing: once plt.show() has rendered the figure, a later
# plt.savefig() would write an empty canvas instead of these plots.
fig_path = PREPROC_DIR / "sentences_per_report_stats.png"
fig.savefig(fig_path, dpi=300, bbox_inches="tight")
print("Saved figure →", fig_path)

plt.show()

In [None]:
# %%
# IQR rule: a report is an outlier when its sentence count lies more than
# 1.5 * IQR below the first quartile or above the third quartile
sentences_array = np.array(sentences_per_report)
Q1, Q3 = (np.percentile(sentences_array, q) for q in (25, 75))
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Positions of the reports outside the [lower_bound, upper_bound] range
is_outlier = (sentences_array < lower_bound) | (sentences_array > upper_bound)
outlier_indices = np.flatnonzero(is_outlier)

print(f"Number of outlier reports (by sentence count): {len(outlier_indices)}")

# Show the sentences of at most the first 20 outliers to keep the output short
for idx in outlier_indices[:20]:
    print(f"\nReport index: {idx}, Sentence count: {sentences_array[idx]}")
    print("Sentences:")
    for sent in reports_sentences[idx]:
        print(f"- {sent}")

# Keep the outlier positions for later steps
outlier_path = CACHE_DIR / "outlier_report_indices.npy"
np.save(outlier_path, outlier_indices)
print("Saved outlier indices →", outlier_path)
