In [1]:

import torch
import os
import numpy as np
import nltk
import sys
import pandas as pd
import nltk
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP,MaximalMarginalRelevance
import datamapplot
import random



# Add project root to path
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)  
sys.path.append(project_root)

# Import project modules
from src.preprocessor import split_sentences
from src.utils import get_params_grid, calculate_coherence,calculate_embedding_coherence
from src.model import setup_model, setup_umap, setup_hdbscan
from src.llama_CPP_custom import *
from configs.dreamachine2 import config


dataset = "DREAMACHINE"


print(f"Current working directory: {os.getcwd()}")
BOX_DIR = os.path.join(os.path.expanduser("~"), "Library", "CloudStorage", "Box-Box", "TMDATA")
print(f"Retrieving data from BOX, locally stored at: {BOX_DIR}")
DATA_DIR = os.path.join(BOX_DIR, dataset)
print(f"Data directory: {DATA_DIR}")
results_dir = os.path.join(project_root, "EVAL",dataset.lower())


Current working directory: /Users/rb666/Projects/MOSAIC/EVAL
Retrieving data from BOX, locally stored at: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA
Data directory: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE


In [14]:
condition = "DL"
reports_path_DL = os.path.join(DATA_DIR, f"{condition}_reflections_APIcleaned.csv")
print("Using data from:", reports_path_DL)

# load data and divide into sentences if needed
df_reports_dl = pd.read_csv(reports_path_DL)['cleaned_reflection'].dropna().reset_index(drop=True)
print(f"n = {len(df_reports_dl)} reports loaded from {reports_path_DL}")

#check if duplicated or empty reports, or reports with only one word
print("Checking for duplicated or empty reports, or reports with only one word...")
df_reports_dl = df_reports_dl[df_reports_dl.str.strip().astype(bool)]  # Remove empty or whitespace-only strings
df_reports_dl = df_reports_dl[df_reports_dl.str.split().str.len() > 1]  # Remove strings with only one word
df_reports_dl = df_reports_dl.drop_duplicates().reset_index(drop=True)  # Remove duplicates
print(f"n = {len(df_reports_dl)} reports remaining after cleaning.")

Using data from: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE/DL_reflections_APIcleaned.csv
n = 98 reports loaded from /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE/DL_reflections_APIcleaned.csv
Checking for duplicated or empty reports, or reports with only one word...
n = 92 reports remaining after cleaning.


In [18]:
df_reports_dl = split_sentences(df_reports_dl)[0]
print(f"\nSuccessfully loaded and processed {len(df_reports_dl)} sentences.")


#remove sentences defined as too short
min_words = 2 #threshold for minimum words in a sentence
#print all sentences that contain only min_words word
for i, sentence in enumerate(df_reports_dl):
    if len(sentence.split()) < min_words:
        print(sentence)

#print the amount of sentences that have less than min_words words
short_sentences = [sentence for sentence in df_reports_dl if len(sentence.split()) < min_words]
print(f"\nThere are {len(short_sentences)} sentences with less than {min_words} words.\n")

# Remove sentences with less than 2 words
df_reports_dl = [sentence for sentence in df_reports_dl if len(sentence.split()) >= min_words]
print(f"After removing short sentences, {len(df_reports_dl)} sentences remain.")

# Remove duplicate sentences if any
seen = set()
df_reports_dl = [s for s in df_reports_dl if not (s in seen or seen.add(s))]
print(f"After removing duplicates, {len(df_reports_dl)} remain.")


Successfully loaded and processed 199 sentences.
Blue.

There are 1 sentences with less than 2 words.

After removing short sentences, 198 sentences remain.
After removing duplicates, 198 remain.


In [16]:
condition = "HS"
reports_path_hs = os.path.join(DATA_DIR, f"{condition}_reflections_APIcleaned.csv")
print("Using data from:", reports_path_hs) 
# load data and divide into sentences if needed
df_reports_hs = pd.read_csv(reports_path_hs)['cleaned_reflection'].dropna().reset_index(drop=True)
print(f"n = {len(df_reports_hs)} reports loaded from {reports_path_hs}")

#check if duplicated or empty reports, or reports with only one word
print("Checking for duplicated or empty reports, or reports with only one word...")
df_reports_hs = df_reports_hs[df_reports_hs.str.strip().astype(bool)]  # Remove empty or whitespace-only strings
df_reports_hs = df_reports_hs[df_reports_hs.str.split().str.len() > 1]  # Remove strings with only one word
df_reports_hs = df_reports_hs.drop_duplicates().reset_index(drop=True)  # Remove duplicates
print(f"n = {len(df_reports_hs)} reports remaining after cleaning.")


Using data from: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE/HS_reflections_APIcleaned.csv
n = 333 reports loaded from /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE/HS_reflections_APIcleaned.csv
Checking for duplicated or empty reports, or reports with only one word...
n = 315 reports remaining after cleaning.


In [17]:


df_reports_hs = split_sentences(df_reports_hs)[0]
print(f"\nSuccessfully loaded and processed {len(df_reports_hs)} sentences.")
#remove sentences defined as too short
min_words = 2 #threshold for minimum words in a sentence
#print all sentences that contain only min_words words
for i, sentence in enumerate(df_reports_hs):
    if len(sentence.split()) < min_words:
        print(sentence)
#print the amount of sentences that have less than min_words words
short_sentences = [sentence for sentence in df_reports_hs if len(sentence.split()) < min_words]
print(f"\nThere are {len(short_sentences)} sentences with less than {min_words} words.\n")
# Remove sentences with less than 2 words
df_reports_hs = [sentence for sentence in df_reports_hs if len(sentence.split()) >= min_words]
print(f"After removing short sentences, {len(df_reports_hs)} sentences remain.")
# Remove duplicate sentences if any
seen = set()
df_reports_hs = [s for s in df_reports_hs if not (s in seen or seen.add(s))]
print(f"After removing duplicates, {len(df_reports_hs)} remain.")

df_reports_hs


Successfully loaded and processed 716 sentences.
Confusion.
Vast.
Immense.
Tessellations.
Hexagons.
Starfields.
Beautiful!
Thanks!
Bizarre.
Pareidolia.
Soothed.
Tired.
Joy.
Peace.
Sunlight.
True.

There are 16 sentences with less than 2 words.

After removing short sentences, 700 sentences remain.
After removing duplicates, 700 remain.


['Intense chaos.',
 'And then my mind checked out and my subconscious took over and started talking.',
 "What I imagine it's like looking back on life before you die.",
 'Good stuff.',
 'A pattern of red and white lights that flashed and became more intense when the lights flashed intensely.',
 'I also felt asleep for a bit and I was thinking of my partner, who I could visualize.',
 'I went back to many hard and mostly beautiful memories without prompting them at all.',
 'Hope as a color.',
 'Dreaming while awake—flashes of random places I have been.',
 "Maybe this is what it's like to be dead.",
 'Being: immersed; calm; and thrilled.',
 'Life after retired.',
 'I dreamt myself as a harbinger of the new planet, shaping the world with life and equity.',
 'I wish to belong there with someone I really loved and missed...',
 'Relaxation, calm, and curiosity.',
 'Personally, it was hard for me to find peace with the intensity of the bright light.',
 "My eyes wouldn't stop watering, so that 

In [6]:
##
# --- Descriptive Statistics of Reports ---
##

print("\n" + "="*50)
print("Calculating Descriptive Statistics for Reports")
print("="*50 + "\n")

# It's important to work with the original reports, not the split sentences.
# We'll reload the data to ensure we're looking at each report as a whole.
original_reports_dl = pd.read_csv(reports_path_DL)['cleaned_reflection'].dropna().reset_index(drop=True)
original_reports_hs = pd.read_csv(reports_path_hs)['cleaned_reflection'].dropna().reset_index(drop=True)

print(f"Analyzing {len(original_reports_dl)} original DL reports.")
print(f"Analyzing {len(original_reports_hs)} original HS reports.")


# --- Calculate Counts Per Report ---

# Ensure the NLTK sentence tokenizer is available
# The `quiet=True` flag prevents verbose output during download.
nltk.download('punkt', quiet=True)

# Calculate word counts for each report
word_counts_dl = original_reports_dl.apply(lambda text: len(str(text).split()))
word_counts_hs = original_reports_hs.apply(lambda text: len(str(text).split()))

# Calculate sentence counts for each report
sentence_counts_dl = original_reports_dl.apply(lambda text: len(nltk.sent_tokenize(str(text))))
sentence_counts_hs = original_reports_hs.apply(lambda text: len(nltk.sent_tokenize(str(text))))


# --- Assemble the Statistics Table ---

# Create a dictionary to hold all the calculated stats
stats = {
    "HS Word Count": {
        "Mean": word_counts_hs.mean(),
        "SD": word_counts_hs.std(),
        "Min": word_counts_hs.min(),
        "Max": word_counts_hs.max()
    },
    "HS Sentence Count": {
        "Mean": sentence_counts_hs.mean(),
        "SD": sentence_counts_hs.std(),
        "Min": sentence_counts_hs.min(),
        "Max": sentence_counts_hs.max()
    },
    "DL Word Count": {
        "Mean": word_counts_dl.mean(),
        "SD": word_counts_dl.std(),
        "Min": word_counts_dl.min(),
        "Max": word_counts_dl.max()
    },
    "DL Sentence Count": {
        "Mean": sentence_counts_dl.mean(),
        "SD": sentence_counts_dl.std(),
        "Min": sentence_counts_dl.min(),
        "Max": sentence_counts_dl.max()
    }
}

# Convert the stats dictionary to a pandas DataFrame for a clean table format
descriptive_stats_df = pd.DataFrame.from_dict(stats, orient='index')

# Format the numbers for better readability
# We cast Min/Max to integers as they are whole numbers
descriptive_stats_df['Mean'] = descriptive_stats_df['Mean'].round(2)
descriptive_stats_df['SD'] = descriptive_stats_df['SD'].round(2)
descriptive_stats_df['Min'] = descriptive_stats_df['Min'].astype(int)
descriptive_stats_df['Max'] = descriptive_stats_df['Max'].astype(int)


# --- Display the Final Table ---
print("\n--- Table 2: Descriptive statistics for the free-text reports ---\n")
print(descriptive_stats_df)
print("\nMean, standard deviation (SD), minimum (Min), and maximum (Max) values are shown.")


Calculating Descriptive Statistics for Reports

Analyzing 98 original DL reports.
Analyzing 333 original HS reports.

--- Table 2: Descriptive statistics for the free-text reports ---

                    Mean     SD  Min  Max
HS Word Count      25.36  26.31    1  153
HS Sentence Count   2.21   1.77    1   10
DL Word Count      24.76  26.49    1  161
DL Sentence Count   2.12   2.04    1   15

Mean, standard deviation (SD), minimum (Min), and maximum (Max) values are shown.


In [19]:
##
# --- Descriptive Statistics of Reports ---
##

print("\n" + "="*50)
print("Calculating Descriptive Statistics for Reports")
print("="*50 + "\n")

# It's important to work with the original reports, not the split sentences.
# We'll reload the data to ensure we're looking at each report as a whole.
original_reports_dl = pd.read_csv(reports_path_DL)['cleaned_reflection'].dropna().reset_index(drop=True)
original_reports_hs = pd.read_csv(reports_path_hs)['cleaned_reflection'].dropna().reset_index(drop=True)

print(f"Analyzing {len(original_reports_dl)} original DL reports.")
print(f"Analyzing {len(original_reports_hs)} original HS reports.")


# --- Calculate Counts Per Report ---

# Ensure the NLTK sentence tokenizer is available
# The `quiet=True` flag prevents verbose output during download.
nltk.download('punkt', quiet=True)

# Calculate word counts for each report
word_counts_dl = original_reports_dl.apply(lambda text: len(str(text).split()))
word_counts_hs = original_reports_hs.apply(lambda text: len(str(text).split()))

# Calculate sentence counts for each report
sentence_counts_dl = original_reports_dl.apply(lambda text: len(nltk.sent_tokenize(str(text))))
sentence_counts_hs = original_reports_hs.apply(lambda text: len(nltk.sent_tokenize(str(text))))


# --- Assemble the Statistics Table ---

# Create a dictionary to hold all the calculated stats
stats = {
    "HS Word Count": {
        "Mean": word_counts_hs.mean(),
        "SD": word_counts_hs.std(),
        "Min": word_counts_hs.min(),
        "Max": word_counts_hs.max()
    },
    "HS Sentence Count": {
        "Mean": sentence_counts_hs.mean(),
        "SD": sentence_counts_hs.std(),
        "Min": sentence_counts_hs.min(),
        "Max": sentence_counts_hs.max()
    },
    "DL Word Count": {
        "Mean": word_counts_dl.mean(),
        "SD": word_counts_dl.std(),
        "Min": word_counts_dl.min(),
        "Max": word_counts_dl.max()
    },
    "DL Sentence Count": {
        "Mean": sentence_counts_dl.mean(),
        "SD": sentence_counts_dl.std(),
        "Min": sentence_counts_dl.min(),
        "Max": sentence_counts_dl.max()
    }
}

# Convert the stats dictionary to a pandas DataFrame for a clean table format
descriptive_stats_df = pd.DataFrame.from_dict(stats, orient='index')

# Format the numbers for better readability
# We cast Min/Max to integers as they are whole numbers
descriptive_stats_df['Mean'] = descriptive_stats_df['Mean'].round(2)
descriptive_stats_df['SD'] = descriptive_stats_df['SD'].round(2)
descriptive_stats_df['Min'] = descriptive_stats_df['Min'].astype(int)
descriptive_stats_df['Max'] = descriptive_stats_df['Max'].astype(int)


# --- Display the Final Table ---
print("\n--- Table 2: Descriptive statistics for the free-text reports ---\n")
print(descriptive_stats_df)
print("\nMean, standard deviation (SD), minimum (Min), and maximum (Max) values are shown.")


Calculating Descriptive Statistics for Reports

Analyzing 98 original DL reports.
Analyzing 333 original HS reports.

--- Table 2: Descriptive statistics for the free-text reports ---

                    Mean     SD  Min  Max
HS Word Count      25.36  26.31    1  153
HS Sentence Count   2.21   1.77    1   10
DL Word Count      24.76  26.49    1  161
DL Sentence Count   2.12   2.04    1   15

Mean, standard deviation (SD), minimum (Min), and maximum (Max) values are shown.


In [10]:
##
# --- Descriptive Statistics of Preprocessed Sentences ---
##

print("\n" + "="*50)
print("Calculating Descriptive Statistics for Preprocessed Sentences")
print("="*50 + "\n")

# --- Calculate Word Counts Per Sentence ---

# Use a list comprehension to get the word count for each sentence
# This is a concise way to create a new list based on an existing one.
word_counts_per_sentence_dl = [len(sentence.split()) for sentence in df_reports_dl]
word_counts_per_sentence_hs = [len(sentence.split()) for sentence in df_reports_hs]

print(f"Analyzing {len(word_counts_per_sentence_dl)} preprocessed DL sentences.")
print(f"Analyzing {len(word_counts_per_sentence_hs)} preprocessed HS sentences.")

# Convert lists to pandas Series to easily calculate statistics
word_counts_dl_series = pd.Series(word_counts_per_sentence_dl)
word_counts_hs_series = pd.Series(word_counts_per_sentence_hs)


# --- Assemble the Statistics Table ---

# Create a dictionary to hold the stats for word count per sentence
stats_sentences = {
    "HS Sentences": {
        "Mean": word_counts_hs_series.mean(),
        "SD": word_counts_hs_series.std(),
        "Min": word_counts_hs_series.min(),
        "Max": word_counts_hs_series.max()
    },
    "DL Sentences": {
        "Mean": word_counts_dl_series.mean(),
        "SD": word_counts_dl_series.std(),
        "Min": word_counts_dl_series.min(),
        "Max": word_counts_dl_series.max()
    }
}

# Convert the dictionary to a pandas DataFrame
descriptive_stats_sentences_df = pd.DataFrame.from_dict(stats_sentences, orient='index')

# Format the numbers for readability
descriptive_stats_sentences_df['Mean'] = descriptive_stats_sentences_df['Mean'].round(2)
descriptive_stats_sentences_df['SD'] = descriptive_stats_sentences_df['SD'].round(2)
descriptive_stats_sentences_df['Min'] = descriptive_stats_sentences_df['Min'].astype(int)
descriptive_stats_sentences_df['Max'] = descriptive_stats_sentences_df['Max'].astype(int)


# --- Display the Final Table ---
print("\n--- Descriptive statistics for Word Count per Sentence (after preprocessing) ---\n")
print(descriptive_stats_sentences_df)


Calculating Descriptive Statistics for Preprocessed Sentences

Analyzing 198 preprocessed DL sentences.
Analyzing 700 preprocessed HS sentences.

--- Descriptive statistics for Word Count per Sentence (after preprocessing) ---

               Mean    SD  Min  Max
HS Sentences  11.94  8.38    2   63
DL Sentences  12.22  7.06    2   43
