In [None]:
import json, textwrap
from pathlib import Path

import pandas as pd
import numpy as np

# Project directories: prompts are written to PROMPTS_DIR, input data is read from DATA_DIR.
PROMPTS_DIR = Path.home() / 'Desktop' / 'mccviahat' / 'prompts'
DATA_DIR    = Path.home() / 'Desktop' / 'mccviahat' / 'data'
# parents=True: without it mkdir raises FileNotFoundError when an intermediate
# directory (e.g. the project folder itself) does not exist yet.
PROMPTS_DIR.mkdir(parents=True, exist_ok=True)
print(f'Output dir: {PROMPTS_DIR}')

## 1 — Neutral prompts from WikiHow

Load the HuggingFace dataset, keep only English rows (when the dataset has
a language column) whose response is under 500 estimated tokens
(`len(text.split())` × 1.3), then randomly sample 5 with a fixed seed and
format them as the expected JSON.

In [None]:
# Read the multilingual WikiHow QA parquet file from the HuggingFace hub.
# NOTE(review): the hf:// scheme presumably needs huggingface_hub/fsspec installed — confirm.
wikihow_parquet_uri = "hf://datasets/0x22almostEvil/multilingual-wikihow-qa-16k/data/train-00000-of-00001-0bdf6bc5b4b507e0.parquet"
df_wiki = pd.read_parquet(wikihow_parquet_uri)

# Quick size/schema report, then show the first rows as the cell output.
print(f'Raw dataset: {len(df_wiki):,} rows')
print(f'Columns: {list(df_wiki.columns)}')
df_wiki.head(3)

In [None]:
# Print one example value per column (first non-null entry, cut to 120 characters)
# so the text fields can be identified by eye.
for col in df_wiki.columns:
    non_null = df_wiki[col].dropna()
    if non_null.empty:
        sample = '(all NaN)'
    else:
        sample = non_null.iloc[0]
    preview = str(sample)[:120]
    print(f'{col:30s}  {preview}')

In [None]:
# Detect the relevant columns, keep English rows, and keep only short responses.
# Token counts are a rough estimate: whitespace-separated words × 1.3 to
# approximate subword tokenisation.
TOKENS_PER_WORD = 1.3
MAX_EST_TOKENS = 500


def _first_present(candidates, columns):
    """Return the first name in `candidates` that is in `columns`, or None if none is."""
    return next((name for name in candidates if name in columns), None)


# Response column: first well-known name, else the last object-dtype column,
# else simply the last column.
response_col = _first_present(['response', 'RESPONSE', 'answer', 'ANSWER', 'text'], df_wiki.columns)
if response_col is None:
    str_cols = [c for c in df_wiki.columns if df_wiki[c].dtype == 'object']
    response_col = str_cols[-1] if str_cols else df_wiki.columns[-1]
print(f'Using response column: "{response_col}"')

# Title/question column: first well-known name, else the first object-dtype
# column, else simply the first column.
title_col = _first_present(['question', 'QUESTION', 'title', 'TITLE', 'topic'], df_wiki.columns)
if title_col is None:
    str_cols = [c for c in df_wiki.columns if df_wiki[c].dtype == 'object']
    title_col = str_cols[0] if str_cols else df_wiki.columns[0]
print(f'Using title column:    "{title_col}"')

# Filter: English-only if there's a language column
lang_col = _first_present(['language', 'lang', 'LANGUAGE'], df_wiki.columns)

if lang_col:
    # na=False: a row with a missing language counts as "not English". Without it
    # the mask contains NaN and boolean indexing raises ValueError.
    is_english = df_wiki[lang_col].str.lower().str.startswith('en', na=False)
    df_wiki = df_wiki[is_english].copy()
    print(f'After English filter: {len(df_wiki):,} rows')
else:
    # Make the skipped filter visible instead of silently keeping every language.
    print('No language column found — English filter skipped')

# Estimate tokens and filter
df_wiki['_est_tokens'] = df_wiki[response_col].fillna('').apply(
    lambda text: int(len(str(text).split()) * TOKENS_PER_WORD)
)
df_wiki_short = df_wiki[df_wiki['_est_tokens'] < MAX_EST_TOKENS].copy()
print(f'After <500 token filter: {len(df_wiki_short):,} rows')
print(f'Token range: {df_wiki_short["_est_tokens"].min()}–{df_wiki_short["_est_tokens"].max()}')

In [None]:
# Draw five short WikiHow rows; the fixed seed keeps the selection reproducible.
SEED = 42
wiki_sample = df_wiki_short.sample(n=5, random_state=SEED).reset_index(drop=True)

# One record per sampled row: 1-based id, title capped at 80 characters,
# and the full (stripped) response text as the instructions.
neutral_prompts = [
    {
        'id': position + 1,
        'title': str(record[title_col]).strip()[:80],
        'instructions': str(record[response_col]).strip(),
    }
    for position, record in wiki_sample.iterrows()
]

# Preview: id, title (cut/padded to 60 chars), estimated tokens and character count.
for p in neutral_prompts:
    est_tok = int(len(p['instructions'].split()) * 1.3)
    print(f"  [{p['id']}] {p['title'][:60]:60s}  ~{est_tok} tokens  ({len(p['instructions'])} chars)")

In [None]:
# Write the neutral prompt list as pretty-printed UTF-8 JSON (non-ASCII kept verbatim).
neutral_path = PROMPTS_DIR / 'neutral_wikihow.json'
with neutral_path.open('w', encoding='utf-8') as out_file:
    json.dump(neutral_prompts, out_file, indent=2, ensure_ascii=False)
print(f'Saved {len(neutral_prompts)} neutral prompts → {neutral_path}')

## 2 — Emotional prompts from Creepypasta

Load the Excel file, filter to stories where `estimated_reading_time < 5` minutes,
then sample 5 and format.

In [None]:
# Read the creepypasta spreadsheet from the data directory; stop early if it is missing.
xlsx_path = DATA_DIR.joinpath('creepypastas.xlsx')
assert xlsx_path.exists(), f'Missing: {xlsx_path}'

df_creepy = pd.read_excel(xlsx_path)

# Quick size/schema report, then show the first rows as the cell output.
print(f'Raw creepypasta rows: {len(df_creepy):,}')
print(f'Columns: {list(df_creepy.columns)}')
df_creepy.head(3)

In [None]:
# Print one example value per column (first non-null entry, cut to 120 characters).
for col in df_creepy.columns:
    non_null = df_creepy[col].dropna()
    if non_null.empty:
        sample = '(all NaN)'
    else:
        sample = non_null.iloc[0]
    preview = str(sample)[:120]
    print(f'{col:35s}  {preview}')

In [None]:
# Find the reading time column (case-insensitive search) and parse it into
# minutes as floats in a new `_reading_min` column.
# str(c): Excel headers are not guaranteed to be strings.
lowered_names = {c: str(c).lower() for c in df_creepy.columns}

# Pass 1: the usual spellings.
time_col = next(
    (
        c for c, low in lowered_names.items()
        if 'reading_time' in low or 'readingtime' in low or 'reading time' in low
    ),
    None,
)
if time_col is None:
    # Pass 2: any column name that mentions both "time" and "read".
    time_col = next(
        (c for c, low in lowered_names.items() if 'time' in low and 'read' in low),
        None,
    )
if time_col is None:
    # Fail with an actionable message instead of the opaque `KeyError: None`
    # that indexing with a missing column name would produce.
    raise KeyError(
        f'No reading-time column found in creepypasta data; columns: {list(df_creepy.columns)}'
    )
print(f'Reading time column: "{time_col}"')
print(f'Sample values: {df_creepy[time_col].head(10).tolist()}')

# The column might be a string like "3 min" or a number — handle both.
raw_time = df_creepy[time_col]
if pd.api.types.is_numeric_dtype(raw_time):
    df_creepy['_reading_min'] = raw_time.astype(float)
else:
    # Cast to str first so numeric cells inside a text column are parsed too
    # (the .str accessor returns NaN for non-string cells). expand=False makes
    # extract return a Series; entries without digits become NaN.
    df_creepy['_reading_min'] = (
        raw_time.astype(str).str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
    )

print(f'Reading time range: {df_creepy["_reading_min"].min():.1f}–{df_creepy["_reading_min"].max():.1f} min')

In [None]:
# Keep only stories whose parsed reading time is under five minutes.
is_short = df_creepy['_reading_min'] < 5
df_creepy_short = df_creepy[is_short].copy()
print(f'After <5 min filter: {len(df_creepy_short):,} rows')

# Title column: first well-known name that exists, otherwise the first object-dtype column.
creepy_title_col = next(
    (name for name in ('title', 'Title', 'TITLE', 'name', 'Name') if name in df_creepy.columns),
    None,
)
if creepy_title_col is None:
    str_cols = [c for c in df_creepy.columns if df_creepy[c].dtype == 'object']
    creepy_title_col = str_cols[0]

# Body column: first well-known name that exists, otherwise the object-dtype
# column with the longest average string length.
creepy_body_col = next(
    (
        name
        for name in ('body', 'Body', 'text', 'Text', 'story', 'Story', 'content', 'Content')
        if name in df_creepy.columns
    ),
    None,
)
if creepy_body_col is None:
    str_cols = [c for c in df_creepy.columns if df_creepy[c].dtype == 'object']
    creepy_body_col = max(str_cols, key=lambda c: df_creepy[c].str.len().mean())

print(f'Title column: "{creepy_title_col}"')
print(f'Body column:  "{creepy_body_col}"')

# Rough token estimate for each body: whitespace-separated words × 1.3.
body_text = df_creepy_short[creepy_body_col].fillna('')
df_creepy_short['_est_tokens'] = body_text.apply(lambda x: int(len(str(x).split()) * 1.3))
print(f'Token range: {df_creepy_short["_est_tokens"].min()}–{df_creepy_short["_est_tokens"].max()}')

In [None]:
# Draw five short creepypasta stories using the seed set for the neutral sample.
creepy_sample = df_creepy_short.sample(n=5, random_state=SEED).reset_index(drop=True)

# One record per sampled story: 1-based id, title capped at 80 characters,
# and the full (stripped) story text as the instructions.
emotional_prompts = [
    {
        'id': position + 1,
        'title': str(record[creepy_title_col]).strip()[:80],
        'instructions': str(record[creepy_body_col]).strip(),
    }
    for position, record in creepy_sample.iterrows()
]

# Preview: id, title (cut/padded to 60 chars), estimated tokens and character count.
for p in emotional_prompts:
    est_tok = int(len(p['instructions'].split()) * 1.3)
    print(f"  [{p['id']}] {p['title'][:60]:60s}  ~{est_tok} tokens  ({len(p['instructions'])} chars)")

In [None]:
# Write the emotional prompt list as pretty-printed UTF-8 JSON (non-ASCII kept verbatim).
emotional_path = PROMPTS_DIR / 'emotional_creepypasta.json'
with emotional_path.open('w', encoding='utf-8') as out_file:
    json.dump(emotional_prompts, out_file, indent=2, ensure_ascii=False)
print(f'Saved {len(emotional_prompts)} emotional prompts → {emotional_path}')

## 3 — Summary & Token-Count Comparison

Side-by-side comparison to check that the two sets are roughly matched
in length — important for reducing confounders in the HAT experiment.

In [None]:
import matplotlib.pyplot as plt


def _estimate_prompt_tokens(text):
    """Rough token estimate for a prompt: whitespace-separated words × 1.3."""
    return int(len(text.split()) * 1.3)


# Estimated token counts per prompt for both conditions.
n_toks = [_estimate_prompt_tokens(p['instructions']) for p in neutral_prompts]
e_toks = [_estimate_prompt_tokens(p['instructions']) for p in emotional_prompts]

# Summary table: mean / std / min / max per condition.
print(f'{"":20s}  {"Neutral (WikiHow)":>20s}  {"Emotional (Creepypasta)":>24s}')
print('-' * 70)
print(f'{"Mean tokens":20s}  {np.mean(n_toks):20.0f}  {np.mean(e_toks):24.0f}')
print(f'{"Std tokens":20s}  {np.std(n_toks):20.0f}  {np.std(e_toks):24.0f}')
print(f'{"Min tokens":20s}  {np.min(n_toks):20d}  {np.min(e_toks):24d}')
print(f'{"Max tokens":20s}  {np.max(n_toks):20d}  {np.max(e_toks):24d}')

# Grouped bar chart. Bar positions come from the data rather than a hard-coded 5,
# so the plot still works if the sample size changes or the two sets differ in length.
fig, ax = plt.subplots(figsize=(8, 4))
n_slots = max(len(n_toks), len(e_toks))
x = np.arange(n_slots)
w = 0.35
ax.bar(x[:len(n_toks)] - w/2, n_toks, w, label='Neutral (WikiHow)', color='steelblue', alpha=0.8)
ax.bar(x[:len(e_toks)] + w/2, e_toks, w, label='Emotional (Creepypasta)', color='firebrick', alpha=0.8)
ax.set_xticks(x)
ax.set_xticklabels([f'Prompt {i+1}' for i in range(n_slots)])
ax.set_ylabel('Estimated tokens')
ax.set_title('Token Count Comparison: Neutral vs Emotional')
ax.legend()
plt.tight_layout()
plt.show()

# Ratio of mean lengths; within ±20% is treated as well-matched.
ratio = np.mean(e_toks) / np.mean(n_toks)
print(f'\nMean ratio (emotional/neutral): {ratio:.2f}')
if 0.8 <= ratio <= 1.2:
    print('✓ Conditions are well-matched in token count')
else:
    print(f'⚠ Token counts differ by {abs(ratio-1)*100:.0f}% — consider re-sampling or trimming')

## Output files

| File | Description |
|------|-------------|
| `prompts/neutral_wikihow.json` | 5 WikiHow how-to instructions (<500 tokens each) |
| `prompts/emotional_creepypasta.json` | 5 creepypasta stories (<5 min reading time) |

Usage:
```bash
python3 scripts/run_prompts_json.py --json prompts/neutral_wikihow.json --label neutral
python3 scripts/run_prompts_json.py --json prompts/emotional_creepypasta.json --label emotional
```