<a href="https://colab.research.google.com/github/robertopassaro/tales-of-2-minds/blob/main/tale_of_2_minds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -----------------------------------------------------------------------------
# IMPORT LIBRARIES
# -----------------------------------------------------------------------------

# COLLECTING THE TEXTS
!pip install sentence_transformers


# PREPROCESSING EXISTING TEXT FILES
import spacy

# METRICS
import os
import glob
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
from nltk import word_tokenize

#STATS
from scipy.stats import shapiro, levene, probplot
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from scipy.stats import mannwhitneyu
import math

In [None]:
# LET'S CALL CHATGPT

import openai

# ----------------------------------------------------------------------------
# DIRECTORIES
# ----------------------------------------------------------------------------
# Where generated stories are written, and where the human corpus is read from.
GPT4_DIR = "gpt4.1"
HUMAN_DIR = "human_texts"

# The human corpus must already exist; the GPT output folder is created on demand.
if not os.path.isdir(HUMAN_DIR):
    raise FileNotFoundError(f"Expected folder '{HUMAN_DIR}' of human stories not found")
os.makedirs(GPT4_DIR, exist_ok=True)

# ----------------------------------------------------------------------------
# SETTINGS
# ----------------------------------------------------------------------------
# Placeholder for the OpenAI API key (meant to be filled in by hand on Colab).
API_KEY = None #insert your API key

# Chat model queried, number of stories per temperature, and sampling temperatures.
MODEL = "gpt-4.1"
N_COMPLETIONS = 21
TEMPERATURES = [0.001, 0.334, 0.667, 1.0, 1.334, 1.667, 2.0]

# System message: frames the model as a writer continuing a story.
SYSTEM_INSTRUCTION = (
    "\n"
    "Please read the following prompt carefully and adopt the perspective of a writer.\n"
    "Your task is to continue the story in any direction you choose.\n"
    "The story should be longer than 200 words.\n"
)

# User message: the story opening to be continued.
PROMPT_TEXT = (
    "\n"
    "Cameron, after that very stressful day, fell asleep in her bed as usual.\n"
    "The next day she opened her eyes and, to her great surprise, found herself...\n"
)

# Hand the key to the module-level OpenAI client.
openai.api_key = API_KEY

# ----------------------------------------------------------------------------
# GENERATE & SAVE GPT-4.1 COMPLETIONS AT DIFFERENT TEMPERATURES
# ----------------------------------------------------------------------------
for temperature in TEMPERATURES:
    # One output folder per temperature, e.g. 0.334 -> "temp_0_334".
    folder_name = "temp_" + str(temperature).replace('.', '_')
    out_dir = os.path.join(GPT4_DIR, folder_name)
    os.makedirs(out_dir, exist_ok=True)

    # Request stories in batches of at most 10 until N_COMPLETIONS are collected.
    stories = []
    remaining = N_COMPLETIONS
    while remaining > 0:
        response = openai.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system",  "content": SYSTEM_INSTRUCTION},
                {"role": "user",    "content": PROMPT_TEXT},
            ],
            temperature=temperature,
            top_p=0.95,
            n=min(10, remaining),
        )
        for choice in response.choices:
            stories.append(choice.message.content.strip())
        remaining = N_COMPLETIONS - len(stories)

    # Write each story to its own numbered file (numbering starts at 1).
    for number, story in enumerate(stories, start=1):
        path = os.path.join(out_dir, f"gpt4_{number}.txt")
        with open(path, "w", encoding="utf-8") as f:
            f.write(story)
        print(f"Generated Temp={temperature} {number}/{N_COMPLETIONS}: {path}")

In [None]:
# -----------------------------------------------------------------------------
# PREPROCESSING EXISTING TEXT FILES – Cleaning for gpt temperature folders AND Human stories
# -----------------------------------------------------------------------------

# Load spaCy model (do this once)
# The spaCy English pipeline is loaded once and reused by preprocess_text.
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text: str) -> str:
    """
    Reduce a text to its lowercased content-word lemmas.

    A token is kept only if spaCy tags it as NOUN, VERB, ADJ or ADV, it is
    not a stop word, and it is purely alphabetic. The surviving lemmas are
    returned as a single space-separated string.
    """
    content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    kept = []
    for token in nlp(text):
        if token.pos_ not in content_pos:
            continue
        if token.is_stop or not token.is_alpha:
            continue
        kept.append(token.lemma_.lower())
    return " ".join(kept)

# Root directories
# Root folders: GPT output (one subfolder per temperature) and the human corpus.
GPT_DIR, HUMAN_DIR = "gpt4.1", "human_texts"

# Helper to process a given raw directory

def clean_directory(raw_dir: str):
    """
    Preprocess every ``.txt`` file directly inside ``raw_dir``.

    Each file is read, passed through ``preprocess_text`` and written to
    ``<raw_dir>/preprocessed/`` with ``.txt`` replaced by ``_clean.txt`` in
    its name. Progress is printed per file.
    """
    clean_dir = os.path.join(raw_dir, "preprocessed")
    os.makedirs(clean_dir, exist_ok=True)
    print(f"\n Cleaning texts in '{raw_dir}' → '{clean_dir}'")

    txt_names = [name for name in sorted(os.listdir(raw_dir)) if name.endswith(".txt")]
    for filename in txt_names:
        with open(os.path.join(raw_dir, filename), "r", encoding="utf-8") as src:
            raw_text = src.read().strip()
        cleaned = preprocess_text(raw_text)

        clean_name = filename.replace(".txt", "_clean.txt")
        with open(os.path.join(clean_dir, clean_name), "w", encoding="utf-8") as dst:
            dst.write(cleaned)
        print(f"  • {filename} → {clean_name}")

# Process all temperature subfolders under GPT_DIR
# Clean every subfolder of GPT_DIR (one per temperature); plain files are skipped.
gpt_entries = [os.path.join(GPT_DIR, entry) for entry in sorted(os.listdir(GPT_DIR))]
for raw_temp_dir in gpt_entries:
    if not os.path.isdir(raw_temp_dir):
        continue
    clean_directory(raw_temp_dir)

# Clean the human_texts folder, which is required to exist.
if not os.path.isdir(HUMAN_DIR):
    raise FileNotFoundError(f"Expected folder '{HUMAN_DIR}' not found")
clean_directory(HUMAN_DIR)

print("\n Cleaning complete for all directories.")

In [None]:
# -----------------------------------------------------------------------------
# LEXICAL DIVERSITY - configuration
# -----------------------------------------------------------------------------
# ensure the tokenizer data is installed
# Download each NLTK tokenizer resource only if it is not already installed.
for pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg, quiet=True)

# Each entry is (preprocessed_dir, source_label, temperature_label).
PREPROC_DIRS = []
# Base directory for GPT-4.1 outputs
GPT_DIR = "gpt4.1"
# All temperature-specific preprocessed folders under GPT_DIR
for sub in sorted(os.listdir(GPT_DIR)):
    pre_dir = os.path.join(GPT_DIR, sub, "preprocessed")
    if os.path.isdir(pre_dir):
        # Recover the temperature from the folder name, e.g. 'temp_0_001' -> '0.001'
        temp_label = sub.replace('temp_', '').replace('_', '.')
        PREPROC_DIRS.append((pre_dir, "gpt-4.1", temp_label))
# Add the human preprocessed folder; "human" doubles as its temperature label
human_pre = os.path.join("human_texts", "preprocessed")
if os.path.isdir(human_pre):
    PREPROC_DIRS.append((human_pre, "human", "human"))

# Output files: per-file scores and the per-group summary
OUTPUT_SCORES   = "scores.csv"
OUTPUT_SUMMARY = "lexical_diversity_summary.csv"

# lexical_diversity function
def lexical_diversity(text: str, n: int = 1) -> float:
    """
    Return the ratio of distinct n-grams to total n-grams in ``text``.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``; with
    the default ``n=1`` this is the type-token ratio. Returns 0.0 when the
    text has fewer than ``n`` tokens, so no n-gram can be formed.
    """
    tokens = word_tokenize(text.lower())
    if len(tokens) < n:
        return 0.0
    ngrams_list = list(ngrams(tokens, n))
    return len(set(ngrams_list)) / len(ngrams_list)

# Compute lexical diversity for all files
# Compute lexical diversity (unigram level) for every preprocessed file.
results = []
for preproc_dir, source_label, temp_label in PREPROC_DIRS:
    # match all .txt files in the preprocessed folder
    pattern = os.path.join(preproc_dir, "*.txt")
    for path in sorted(glob.glob(pattern)):
        # Context manager so the handle is closed right after reading.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read().strip()
        score = lexical_diversity(text, n=1)
        results.append({
            "source":            source_label,
            "temperature":       temp_label,
            "filename":          os.path.basename(path),
            "lexical_diversity": score,
        })

# Fail loudly rather than writing an empty scores file.
if not results:
    raise RuntimeError("No preprocessed files found. Check directory structure.")

# One row per file
scores_df = pd.DataFrame(results)

# Print per-file scores
def format_label(row):
    """Build the display label ``<source>/temp=<temperature>/<filename>`` for a score row."""
    return "{}/temp={}/{}".format(row['source'], row['temperature'], row['filename'])

# Echo every per-file score.
for record in scores_df.to_dict("records"):
    print(f"{format_label(record)}: lexical_diversity = {record['lexical_diversity']:.4f}")

# Persist the per-file scores; only these four columns are written.
score_columns = ["source", "temperature", "filename", "lexical_diversity"]
scores_df.to_csv(OUTPUT_SCORES, index=False, columns=score_columns)
print(f"\nSaved per-file scores to {OUTPUT_SCORES}")

# Mean lexical diversity for each (source, temperature) group.
grouped_scores = scores_df.groupby(["source", "temperature"])["lexical_diversity"]
summary = grouped_scores.mean().reset_index()
summary.to_csv(OUTPUT_SUMMARY, index=False)
print(f"Saved summary to {OUTPUT_SUMMARY}\n")
print(summary)

In [None]:
# -----------------------------------------------------------------------------
# SEMANTIC DIVERSITY - configuration
# -----------------------------------------------------------------------------
# Inverse homogeneity: 1 − avg cosine similarity to all other docs in group
def inv_hom(doc_texts: list[str]) -> np.ndarray:
    """
    Inverse homogeneity of each document within its group.

    Documents are vectorized with TF-IDF; each document's score is
    1 minus its mean cosine similarity to all *other* documents.

    Parameters
    ----------
    doc_texts : list[str]
        The documents of one group.

    Returns
    -------
    np.ndarray
        One score per document, in input order. All zeros when the group
        has fewer than two documents (there is nothing to compare against).
    """
    n = len(doc_texts)
    if n < 2:
        return np.zeros(n, dtype=float)
    X = TfidfVectorizer().fit_transform(doc_texts)
    sim = cosine_similarity(X)
    # Drop each document's self-similarity using the actual diagonal rather
    # than a hard-coded 1.0: an empty document has an all-zero TF-IDF row,
    # so its self-similarity is 0 and subtracting 1.0 would bias its score.
    avg_sim = (sim.sum(axis=1) - sim.diagonal()) / (n - 1)
    return 1.0 - avg_sim

# Path to master CSV and load
# Load the master score table and add an empty semantic_diversity column.
SCORES_CSV = "scores.csv"
df_scores = pd.read_csv(SCORES_CSV)
df_scores['semantic_diversity'] = np.nan

# Score each (source, temperature) group separately.
group_keys = df_scores[['source', 'temperature']].drop_duplicates()
for source, temp in group_keys.itertuples(index=False, name=None):
    # Folder holding this group's preprocessed files.
    if source == 'human':
        pre_dir = os.path.join('human_texts', 'preprocessed')
    else:
        pre_dir = os.path.join('gpt4.1', "temp_" + str(temp).replace('.', '_'), 'preprocessed')

    # Filenames recorded for this group, in sorted order.
    in_group = (df_scores['source'] == source) & (df_scores['temperature'] == temp)
    group_files = sorted(df_scores.loc[in_group, 'filename'].tolist())

    # Read the files that exist on disk; missing ones are skipped.
    docs = []
    valid_files = []
    for fname in group_files:
        path = os.path.join(pre_dir, fname)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            docs.append(f.read().strip())
        valid_files.append(fname)

    # Nothing to score for this group.
    if not docs:
        continue

    # Write each document's score back to its row and report it.
    for fname, score in zip(valid_files, inv_hom(docs)):
        target_row = in_group & (df_scores['filename'] == fname)
        df_scores.loc[target_row, 'semantic_diversity'] = score
        print(f"{source}/temp={temp}/{fname}: semantic_diversity = {score:.4f}")

# Overwrite the CSV with the new column included.
df_scores.to_csv(SCORES_CSV, index=False)
print(f"\n Updated {SCORES_CSV} with semantic_diversity column")

In [None]:
# -----------------------------------------------------------------------------
# NOVELTY – configuration
# -----------------------------------------------------------------------------

# Path to your master CSV
# Load the master score table and add an empty novelty column.
SCORES_CSV = "scores.csv"
df_scores = pd.read_csv(SCORES_CSV)
df_scores['novelty'] = np.nan

# Sentence-embedding model, loaded once for all groups.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Novelty is computed within each (source, temperature) group.
group_keys = df_scores[['source', 'temperature']].drop_duplicates()
for source, temp in group_keys.itertuples(index=False, name=None):
    # Folder holding this group's preprocessed files.
    if source == 'human':
        pre_dir = os.path.join('human_texts', 'preprocessed')
    else:
        pre_dir = os.path.join('gpt4.1', "temp_" + str(temp).replace('.', '_'), 'preprocessed')

    # Filenames recorded for this group.
    in_group = (df_scores['source'] == source) & (df_scores['temperature'] == temp)
    filenames = df_scores.loc[in_group, 'filename'].tolist()
    if not filenames:
        continue

    # Read, in sorted filename order, the files that exist on disk.
    texts = []
    valid_files = []
    for fname in sorted(filenames):
        path = os.path.join(pre_dir, fname)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            texts.append(f.read().strip())
        valid_files.append(fname)

    N = len(texts)
    if N == 0:
        continue

    # Pairwise cosine distances between the document embeddings.
    embeddings = model.encode(texts, convert_to_numpy=True)
    dist_mat = cosine_distances(embeddings)
    if N > 1:
        # Per-document mean distance to the other documents, and the
        # group-wide mean over all ordered pairs.
        D_local = dist_mat.sum(axis=1) / (N - 1)
        D_global = dist_mat.sum() / (N * (N - 1))
    else:
        # A single document has nothing to be compared with.
        D_local = np.zeros(N)
        D_global = 0.0
    # Novelty: twice the absolute gap between local and global mean distance.
    novelty_scores = 2 * np.abs(D_local - D_global)

    # Write each score back to its row and report it.
    for fname, score in zip(valid_files, novelty_scores):
        target_row = in_group & (df_scores['filename'] == fname)
        df_scores.loc[target_row, 'novelty'] = score
        print(f"{source}/temp={temp}/{fname}: novelty = {score:.4f}")

# Overwrite the CSV with the new column included.
df_scores.to_csv(SCORES_CSV, index=False)
print(f"\n Updated {SCORES_CSV} with novelty column")

In [None]:
# -----------------------------------------------------------------------------
# SURPRISE – compute 2/(m-1)*Σ d(F_i, F_{i+1}) on RAW texts
# -----------------------------------------------------------------------------

# 1) Prep
# Make sure the sentence-tokenizer data is present. Both resource names are
# checked, matching the lexical-diversity cell, so this cell also works when
# run on its own; a download happens only when the resource is missing.
for pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg, quiet=True)
SCORES_CSV = "scores.csv"
df_scores = pd.read_csv(SCORES_CSV)

# Cast temperature to string so it matches temp_label later
df_scores['temperature'] = df_scores['temperature'].astype(str)

# (Re)initialize the column
df_scores['surprise'] = np.nan

# 2) Model: sentence-embedding model used by surprise_score
sur_model = SentenceTransformer("all-MiniLM-L6-v2")

def surprise_score(text: str) -> float:
    """
    Surprise of a text: twice the mean cosine distance between the
    embeddings of consecutive sentences.

    Sentences come from NLTK's ``sent_tokenize`` and are embedded with
    ``sur_model``. Returns 0.0 for texts with fewer than two sentences.
    """
    sentences = sent_tokenize(text)
    n_sents = len(sentences)
    if n_sents < 2:
        return 0.0

    vectors = sur_model.encode(sentences, convert_to_numpy=True)
    # Accumulate the distance between each sentence and the one after it.
    total = 0
    for current, following in zip(vectors[:-1], vectors[1:]):
        total += cosine_distances([current], [following])[0][0]
    return 2 * total / (n_sents - 1)

# 3) Gather raw‐text sources
# 3) Gather raw-text sources as (directory, source_label, temperature_label)
raw_sources = []
for sub in sorted(os.listdir("gpt4.1")):
    raw_dir = os.path.join("gpt4.1", sub)
    if os.path.isdir(raw_dir):
        # same string format used in df_scores['temperature']
        temp_label = sub.replace("temp_", "").replace("_", ".")
        raw_sources.append((raw_dir, "gpt-4.1", temp_label))

if os.path.isdir("human_texts"):
    raw_sources.append(("human_texts", "human", "human"))
else:
    raise FileNotFoundError("Expected 'human_texts' directory not found")

# 4) Compute the score on each RAW text and store it on the matching row
for raw_dir, src, temp_label in raw_sources:
    for path in sorted(glob.glob(os.path.join(raw_dir, "*.txt"))):
        # Context manager so the handle is closed right after reading.
        with open(path, encoding="utf-8") as f:
            text = f.read().strip()
        score = surprise_score(text)
        # Rows in scores.csv are keyed by the preprocessed file name.
        clean_name = os.path.basename(path).replace(".txt", "_clean.txt")

        # Both sides of the temperature comparison are strings here.
        mask = (
            (df_scores["source"]      == src) &
            (df_scores["temperature"] == temp_label) &
            (df_scores["filename"]    == clean_name)
        )
        df_scores.loc[mask, "surprise"] = score

# 5) Save back
df_scores.to_csv(SCORES_CSV, index=False)
print(f"Updated {SCORES_CSV} with surprise scores")

In [None]:
# -----------------------------------------------------------------------------
# Preliminary Checks
# -----------------------------------------------------------------------------

# Load data
# Per-file scores written by the metric cells.
df = pd.read_csv('scores.csv')

metrics = ['lexical_diversity', 'semantic_diversity', 'novelty', 'surprise']
sources = df['source'].unique()

# 1. Descriptive statistics: mean and standard deviation of each metric per source.
desc = df.groupby('source')[metrics].agg(['mean', 'std'])
desc = desc.round(4)
print("Descriptive Statistics (mean ± std):")
print(desc)

# 2. Shapiro–Wilk normality test for every (metric, source) pair.
print("\nShapiro–Wilk Normality Tests:")
for metric in metrics:
    for source in sources:
        data = df.loc[df['source'] == source, metric].dropna()
        stat, p = shapiro(data)
        print(f"{source} – {metric}: W={stat:.4f}, p={p:.4f}")

# 3. Levene's test for equal variances between the first two sources.
print("\nLevene’s Test for Equal Variances:")
for metric in metrics:
    g1 = df.loc[df['source'] == sources[0], metric].dropna()
    g2 = df.loc[df['source'] == sources[1], metric].dropna()
    stat, p = levene(g1, g2)
    print(f"{metric}: W={stat:.4f}, p={p:.4f}")

# 4. Pairwise correlations between the metrics.
corr = df[metrics].corr()
print("\nCorrelation Matrix:")
print(corr)

# 5. Q–Q Plots for Normality by Group
# 5. Q–Q plots against the normal distribution, one panel per source.
# NOTE(review): this download has no visible role in the plotting below;
# it is kept so the cell's side effects stay unchanged.
nltk.download('punkt', quiet=True)
for metric in metrics:
    plt.figure(figsize=(8, 4))
    for panel, source in enumerate(sources, start=1):
        plt.subplot(1, 2, panel)
        sample = df.loc[df['source'] == source, metric].dropna()
        probplot(sample, dist="norm", plot=plt)
        plt.title(f"Q–Q: {metric} ({source})")
    plt.tight_layout()
    plt.show()

# 6. One boxplot figure per metric, with one box per source.
for metric in metrics:
    plt.figure()
    data = []
    for s in sources:
        data.append(df.loc[df['source'] == s, metric].dropna())
    plt.boxplot(data, labels=sources)
    plt.title(f"Boxplot of {metric}")
    plt.ylabel(metric)
    plt.show()

# 7. Heatmap of the metric correlation matrix.
plt.figure(figsize=(5, 4))
im = plt.imshow(corr, interpolation='nearest')
plt.colorbar(im)
tick_positions = np.arange(len(metrics))
plt.xticks(tick_positions, metrics, rotation=45)
plt.yticks(tick_positions, metrics)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

In [None]:
# Column names used throughout the statistical comparison.
temperatures = 'temperature'
metric_columns = ['lexical_diversity', 'semantic_diversity', 'novelty', 'surprise']

# Per-file scores and the distinct sources found in them.
df = pd.read_csv("scores.csv")
sources = df['source'].unique()

# Folder receiving all figures produced below.
output_folder = "statplots"
os.makedirs(output_folder, exist_ok=True)


#Permutation Test Function:
def permutation_test(x, y, n_permutations=10000, seed=None):
    """
    Two-sided permutation test on the difference of group means.

    The pooled observations are shuffled ``n_permutations`` times and split
    into groups of the original sizes. The p-value is the fraction of
    shuffles whose absolute mean difference is at least as large as the
    observed one.

    Parameters
    ----------
    x, y : array-like
        The two samples to compare.
    n_permutations : int
        Number of random shuffles.
    seed : int or None
        When given, shuffling uses a dedicated generator seeded with this
        value, making the p-value reproducible. When None (default), NumPy's
        global random state is used, exactly as before.

    Returns
    -------
    tuple
        ``(observed_diff, p_val)`` where ``observed_diff`` is
        ``mean(x) - mean(y)``.
    """
    if seed is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.default_rng(seed).shuffle

    n_x = len(x)
    observed_diff = np.mean(x) - np.mean(y)
    threshold = abs(observed_diff)
    # np.concatenate returns a fresh array, so the in-place shuffles below
    # never touch the caller's data.
    combined = np.concatenate([x, y])

    count = 0
    for _ in range(n_permutations):
        shuffle(combined)
        diff = np.mean(combined[:n_x]) - np.mean(combined[n_x:])
        if abs(diff) >= threshold:
            count += 1
    p_val = count / n_permutations
    return observed_diff, p_val

sns.set(style="whitegrid")

# p-values per metric, one row per temperature. They are collected in the same
# pass that prints and plots the comparison, so each test runs once and the
# saved tables show exactly the values printed here. (The permutation test is
# random, so running it a second time would give different numbers.)
pval_tables = {metric: [] for metric in metric_columns}

for temperature in sorted(df[temperatures].dropna().unique()):
    # The human rows carry 'human' as their temperature label; they are the
    # reference group, not an LLM condition.
    if temperature == 'human':
        continue

    for metric in metric_columns:
        human_scores = df[df['source'] == 'human'][metric].dropna().values
        llm_scores = df[(df['source'] == 'gpt-4.1') & (df['temperature'] == temperature)][metric].dropna().values

        # Mann–Whitney U test (two-sided)
        u_stat, u_p = mannwhitneyu(human_scores, llm_scores, alternative='two-sided')

        # Permutation test on the difference of means
        perm_diff, perm_p = permutation_test(human_scores, llm_scores)

        pval_tables[metric].append({
            'Temperature': f'T={temperature}',
            'Mann–Whitney U p-value': u_p,
            'Permutation Test p-value': perm_p
        })

        print(f"\n===== {metric.upper()} at temperature level {temperature}=====")
        # .12f prints 12 decimal places, because some p-values are extremely small.
        print(f"Mann–Whitney U: U = {u_stat:.2f}, p = {u_p:.12f}")
        print(f"Human median: {np.median(human_scores):.3f}")
        print(f"LLM median: {np.median(llm_scores):.3f}")
        print(f"Permutation test: Mean diff = {perm_diff:.4f}, p = {perm_p:.12f}")

        # Plot: box plot with the individual points overlaid, human vs this temperature
        df_plot = pd.DataFrame({
            'source': ['Human'] * len(human_scores) + [f'LLM (T={temperature})'] * len(llm_scores),
            'score': np.concatenate([human_scores, llm_scores])
        })
        plt.figure(figsize=(6, 4))
        sns.boxplot(data=df_plot, x='source', y='score', palette='Set2')
        sns.stripplot(data=df_plot, x='source', y='score', color='black', alpha=0.5)
        plt.title(f'{metric.replace("_", " ").title()} Score\nHuman vs LLM (T={temperature})')
        plt.ylabel(f'{metric.replace("_", " ").title()}')
        plt.xlabel('Agent Type')
        plt.tight_layout()

        plot_filename = os.path.join(output_folder, f"{metric}_T{temperature}_comparison.png")
        plt.savefig(plot_filename)
        plt.close()

# Create and save tables as plots
# Render each metric's p-value table as an image.
for metric, rows in pval_tables.items():
    table_df = pd.DataFrame(rows)
    pretty_metric = metric.replace("_", " ").title()

    # Figure height grows with the number of rows.
    fig_height = 0.5 + 0.4 * len(table_df)
    fig, ax = plt.subplots(figsize=(8, fig_height))
    ax.axis('off')

    # The first column holds the row labels; the rest are p-values rounded to 12 decimals.
    cell_values = np.round(table_df.iloc[:, 1:].values, 12)
    table = plt.table(
        cellText=cell_values,
        rowLabels=table_df['Temperature'],
        colLabels=table_df.columns[1:],
        loc='center',
        cellLoc='center'
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.5)

    plt.title(f'{pretty_metric} – P-Values vs Human')
    plt.tight_layout()

    plt.savefig(os.path.join(output_folder, f"{metric}_pvalue_table.png"))
    plt.close()


#summary plots
# One overview figure per metric: every temperature plus the human group.
for metric in metric_columns:
    pretty_metric = metric.replace("_", " ").title()
    summary_data = []

    # GPT scores, grouped by temperature label. The 'human' label matches no
    # gpt-4.1 rows and therefore contributes nothing here.
    for temperature in sorted(df[temperatures].dropna().unique()):
        is_llm_at_temp = (df['source'] == 'gpt-4.1') & (df['temperature'] == temperature)
        llm_scores = df[is_llm_at_temp][metric].dropna().values
        summary_data += [(f'T={temperature}', value) for value in llm_scores]

    # Human scores go last.
    human_scores = df[df['source'] == 'human'][metric].dropna().values
    summary_data += [('Human', value) for value in human_scores]

    summary_df = pd.DataFrame(summary_data, columns=['Source', 'Score'])

    # Box plot without outlier markers, with all individual points overlaid.
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=summary_df, x='Source', y='Score', palette='Set2', showfliers=False)
    sns.stripplot(data=summary_df, x='Source', y='Score', color='black', alpha=0.5, jitter=True)
    plt.title(f'{pretty_metric} Score\nAcross Temperatures + Human')
    plt.ylabel(pretty_metric)
    plt.xlabel('Agent / Temperature')
    plt.tight_layout()

    plt.savefig(os.path.join(output_folder, f"{metric}_summary_plot.png"))
    plt.close()