In [3]:
#############################
# Parameters – tweak freely #
#############################
# GENERATE gates the generation cell below, which runs main.py as a subprocess.
GENERATE = True   # ⇠ set False if you already have creative_writing_generations.jsonl
THREADS  = 80        # forwarded to main.py as --threads
MAX_PROMPTS = 1000   # forwarded to main.py as --max-prompts

OUT_JSONL     = 'creative_writing_generations.jsonl'   # forwarded to main.py as --output-jsonl
# NOTE(review): HUMAN_PROFILE is not referenced by the cells visible here; the analysis
# cell defines its own HUMAN_PROFILE_FILE pointing at the same path.
HUMAN_PROFILE = 'data/human_writing_profile.json'

import json, subprocess, sys, math, re, itertools, collections, os, pathlib
from pathlib import Path
from collections import Counter
import pandas as pd


In [4]:
# Optionally (re)generate the LLM outputs by running main.py as a child process.
# Skipped entirely when GENERATE is False.
if GENERATE:
    # Launch main.py with the interpreter that runs this kernel so the child process
    # sees the same environment/packages; a bare 'python3' may resolve to a different
    # installation (or not exist at all on some platforms). Fall back to 'python3'
    # only if the interpreter path is unavailable.
    cmd = [
        sys.executable or 'python3', 'main.py',
        '--output-jsonl', OUT_JSONL,
        '--input-hf-dataset', 'Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT',
        '--hf-dataset-split', 'train',
        '--threads', str(THREADS),
        '--max-prompts', str(MAX_PROMPTS),
        '--logging-level', 'INFO',
        #'--regex-blocklist-file', 'regex_not_x_but_y.json',
    ]
    print('Running:', ' '.join(cmd))
    # check=True raises CalledProcessError if main.py exits with a non-zero status.
    subprocess.run(cmd, check=True)


Running: python3 main.py --output-jsonl creative_writing_generations.jsonl --input-hf-dataset Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT --hf-dataset-split train --threads 80 --max-prompts 1000 --logging-level INFO
INFO mode: Progress bar and ban events will be printed. Most logs suppressed. Effective script level: INFO
Extracting HF prompts:   1%|          | 999/177477 [00:00<00:03, 48943.02prompt/s]
Preparing to process 1000 new prompts in this run.
Batch Generating: 100%|██████████| 1000/1000 [15:58<00:00,  1.04prompt/s, 1527.2 tok/s] 
Finished processing 1000 prompts in this run in 958.57s.
Overall average throughput for this run: 1527.21 tok/s.
Results appended to creative_writing_generations.jsonl


In [1]:
# ---------------------------------------------------------------------------
#  🔍 1.  LOAD MATERIALS
# ---------------------------------------------------------------------------
from pathlib import Path
import json
from collections import Counter, defaultdict
import math

import nltk
# Ensure the NLTK 'punkt' tokenizer data is available for nltk.word_tokenize.
# nltk.data.find() signals a missing resource with LookupError. nltk.downloader has no
# DownloadError attribute, so naming it in an except clause raised AttributeError at the
# moment the resource was missing and the download below was never attempted.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' rather than 'punkt' —
# confirm against the installed version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("NLTK 'punkt' tokenizer not found. Downloading...")
    # nltk.download returns a falsy value when the download fails, so only report
    # success when it actually succeeded.
    if nltk.download('punkt', quiet=True):
        print("'punkt' tokenizer downloaded.")
    else:
        print("Warning: Could not download NLTK 'punkt' tokenizer. "
              "Ensure it is installed for nltk.word_tokenize to function correctly.")
except Exception as e:
    # Best effort: any other failure while probing is reported but does not abort the cell.
    print(f"Warning: Could not automatically verify/download NLTK 'punkt' tokenizer: {e}. "
          "Ensure it is installed for nltk.word_tokenize to function correctly.")

from nltk import ngrams
from nltk.corpus import stopwords
from slop_forensics.utils import load_jsonl_file, normalize_text, extract_words
#from slop_forensics.analysis import STOP_WORDS          # already initialised in analysis.py

# Build STOP_WORDS from NLTK's English stop-word list. Both required resources are
# probed first; if either lookup fails the set is left empty, which disables
# stop-word filtering further down instead of aborting the cell.
try:
    for required_resource in ('tokenizers/punkt', 'corpora/stopwords'):
        nltk.data.find(required_resource)
    STOP_WORDS = set(stopwords.words('english'))
except LookupError:
    print("NLTK 'punkt' or 'stopwords' not found. Run nltk.download('punkt') and nltk.download('stopwords').")
    STOP_WORDS = set()
except ImportError:
    print("NLTK not installed. Stopword filtering will be skipped.")
    STOP_WORDS = set()
else:
    print(f"Loaded {len(STOP_WORDS)} NLTK stopwords for 'english'.")

# ---------------------------------------------------------------------------
#  🔧 2.  CONFIGURE PATHS / PARAMS
# ---------------------------------------------------------------------------
# Your freshly-generated LLM outputs (adjust if you changed the dir or name pattern).
# Read with load_jsonl_file; each row's "generation" string is what gets analysed.
GENERATED_FILE = "creative_writing_generations.jsonl"

# A “human baseline” frequency file (whatever name/location you saved it under).
# Its 'human-authored' entry supplies top_bigrams, top_trigrams, num_texts_analyzed
# and avg_length.
HUMAN_PROFILE_FILE = Path("data") / "human_writing_profile.json"

TOP_K_WORDS      = 2_000      # how many of the most frequent generated unigrams to keep (Counter.most_common)
TOP_K_BIGRAMS    = 1_000      # same cut-off for generated bigrams
TOP_K_TRIGRAMS   = 1_000      # same cut-off for generated trigrams
MIN_WORD_LEN     = 4          # minimum token length: passed to extract_words and used when filtering n-gram tokens
FREQ_NORM_DENOM  = 100_000    # chars → frequencies are reported “per 100 K characters”

# ---------------------------------------------------------------------------
#  📥 3.  PULL IN THE DATA
# ---------------------------------------------------------------------------
# Generated side: keep only rows whose "generation" field is a string.
gen_rows = load_jsonl_file(str(GENERATED_FILE))
gen_texts = []
for row in gen_rows:
    generation = row.get("generation")
    if isinstance(generation, str):
        gen_texts.append(generation)

if len(gen_texts) == 0:
    raise ValueError(f"No usable text in {GENERATED_FILE}")

# Human side: the profile JSON must carry a non-empty 'human-authored' entry.
with HUMAN_PROFILE_FILE.open("r", encoding="utf-8") as f:
    human_profile_full = json.load(f)

human_profile = human_profile_full.get('human-authored')
if not human_profile:
    raise ValueError(f"Key 'human-authored' not found in {HUMAN_PROFILE_FILE}")


# Helper function to convert the ngram list format from the JSON
# to the dictionary format {normalized_ngram_string: frequency}
# and normalize the ngram string to match LLM ngram processing.
def _convert_and_normalize_human_ngram_list(ngram_list_of_dicts, n_value: int):
    """Convert a human-profile n-gram list into ``{normalized_ngram: frequency}``.

    Each list item is expected to be a dict with an "ngram" string and a "frequency"
    value. The n-gram text is run through normalize_text, tokenized with
    nltk.word_tokenize, reduced to alphabetic tokens and lowercased; it is kept only
    if exactly ``n_value`` tokens remain. Frequencies of distinct source n-grams that
    normalize to the same key are summed.

    NOTE(review): the generated-text pipeline in this notebook additionally drops stop
    words and short tokens before forming n-grams; this function does not — confirm the
    human profile was built with matching filtering.

    Args:
        ngram_list_of_dicts: list of ``{"ngram": ..., "frequency": ...}`` items.
        n_value: required number of tokens per n-gram (2 for bigrams, 3 for trigrams).

    Returns:
        dict mapping the space-joined normalized n-gram to its integer frequency.
        An empty dict is returned (with a warning) when the input is not a list.
    """
    if not isinstance(ngram_list_of_dicts, list):
        print(f"Warning: Expected a list for human {n_value}-grams, got {type(ngram_list_of_dicts)}. Returning empty dict.")
        return {}

    converted_dict = {}
    skipped_count = 0   # items dropped entirely
    merged_count = 0    # items folded into a key that already existed
    original_count = len(ngram_list_of_dicts)

    for item in ngram_list_of_dicts:
        # Malformed entries are skipped rather than aborting the whole conversion.
        if not isinstance(item, dict):
            skipped_count += 1
            continue

        ngram_str = item.get("ngram")
        frequency = item.get("frequency")
        if ngram_str is None or frequency is None:
            skipped_count += 1
            continue

        try:
            frequency = int(frequency)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric / non-finite frequency: nothing meaningful to add.
            skipped_count += 1
            continue

        # Normalize the human ngram string in a way that mirrors LLM token processing:
        # same base text normalization, tokenize, keep alphabetic tokens, lowercase.
        normalized_text_for_human_ngram = normalize_text(str(ngram_str))
        tokens = [t.lower() for t in nltk.word_tokenize(normalized_text_for_human_ngram) if t.isalpha()]

        if len(tokens) != n_value:
            # Does not reduce to exactly n_value alphabetic tokens (e.g. contains
            # punctuation or digits), so it cannot match a generated n-gram key.
            skipped_count += 1
            continue

        processed_ngram_key = " ".join(tokens)
        if processed_ngram_key in converted_dict:
            merged_count += 1
        # Sum frequencies if different original ngrams normalize to the same key.
        converted_dict[processed_ngram_key] = converted_dict.get(processed_ngram_key, 0) + frequency

    if skipped_count > 0 or merged_count > 0:
        print(f"INFO: Normalizing human {n_value}-grams: Processed {original_count} items. "
              f"Resulted in {len(converted_dict)} unique normalized {n_value}-gram keys. "
              f"{skipped_count} original items were skipped "
              f"(e.g., non-alphabetic content, or length mismatch after tokenization) "
              f"and {merged_count} were merged into an already-seen key.")
    return converted_dict

# Pull the raw n-gram lists out of the human profile (missing keys become empty lists)
# and convert each to a {normalized_ngram: frequency} mapping.
human_bigrams_list = human_profile.get("top_bigrams", [])
human_trigrams_list = human_profile.get("top_trigrams", [])

human_bigrams = _convert_and_normalize_human_ngram_list(human_bigrams_list, 2)
human_trigrams = _convert_and_normalize_human_ngram_list(human_trigrams_list, 3)

# Both fields are needed to estimate the human corpus size in characters;
# fail on the first one that is absent.
required_keys = ["num_texts_analyzed", "avg_length"]
missing_keys = [name for name in required_keys if name not in human_profile]
if missing_keys:
    raise KeyError(
        f"Human profile JSON (under 'human-authored') is missing the required key: '{missing_keys[0]}'. "
        f"File: {HUMAN_PROFILE_FILE}"
    )

# Estimated total characters in the human corpus = number of texts x average length.
h_chars_total = human_profile["num_texts_analyzed"] * human_profile["avg_length"]
if h_chars_total == 0:
    print("Warning: Total characters for human data (h_chars_total) is 0. Frequencies per 100k will be 0 or infinite.")


# ---------------------------------------------------------------------------
#  🏗️ 4.  BUILD WORD COUNTS & N-GRAM COUNTS (LLM OUTPUT)
# ---------------------------------------------------------------------------
# Single pass over the generated texts: each text is normalised once and feeds the
# unigram, bigram and trigram counters (previously two loops normalised every text twice).
# Assumes normalize_text returns the same result for the same input.
word_counter    = Counter()
bigram_counter  = Counter()
trigram_counter = Counter()
total_chars     = 0

for txt in gen_texts:
    # The normalisation denominator is the raw (un-normalised) character count.
    total_chars += len(txt)
    normalized_llm_text = normalize_text(txt)

    # ---- 4-a  unigrams ------------------------------------------------------
    word_counter.update(
        w for w in extract_words(normalized_llm_text, MIN_WORD_LEN)
        if w not in STOP_WORDS
    )

    # ---- 4-b  bigrams / trigrams --------------------------------------------
    # Alphabetic tokens only, lowercased.
    tokens_all = [t.lower() for t in nltk.word_tokenize(normalized_llm_text) if t.isalpha()]

    # Drop stop-words and very short tokens before forming n-grams. Because every
    # token already passed isalpha(), contractions such as "it's" can never appear
    # here, so no special case for them is needed.
    tokens = [
        tok for tok in tokens_all
        if tok not in STOP_WORDS and len(tok) >= MIN_WORD_LEN
    ]

    bigram_counter.update(" ".join(bg) for bg in ngrams(tokens, 2))
    trigram_counter.update(" ".join(tg) for tg in ngrams(tokens, 3))


# ---------------------------------------------------------------------------
#  📊 5.  NORMALISE “PER 100 000 CHARS”
# ---------------------------------------------------------------------------
def norm_per_100k(raw_count: int, char_total: float) -> float:
    """Scale a raw count to a rate per FREQ_NORM_DENOM characters.

    When ``char_total`` is 0 no rate can be computed: a zero count maps to 0.0
    and any other count maps to ``math.inf``.
    """
    if char_total != 0:
        return (raw_count / char_total) * FREQ_NORM_DENOM
    if raw_count == 0:
        return 0.0
    return math.inf

def build_norm_dict(counter: Counter, char_total: float, top_k: int):
    """Return raw and normalised frequencies for the ``top_k`` most common terms.

    The result maps each term to ``{"gen_count", "gen_freq_per_100k"}`` in
    ``Counter.most_common`` order; terms outside the top ``top_k`` are omitted.
    """
    normalised = {}
    for term, count in counter.most_common(top_k):
        normalised[term] = {
            "gen_count": count,
            "gen_freq_per_100k": norm_per_100k(count, char_total),
        }
    return normalised

# Normalise the most frequent generated unigrams / bigrams / trigrams. All three use the
# same denominator: the total raw character count of the generated texts.
gen_words_norm = build_norm_dict(
    counter=word_counter, char_total=float(total_chars), top_k=TOP_K_WORDS
)
gen_bigrams_norm = build_norm_dict(
    counter=bigram_counter, char_total=float(total_chars), top_k=TOP_K_BIGRAMS
)
gen_trigrams_norm = build_norm_dict(
    counter=trigram_counter, char_total=float(total_chars), top_k=TOP_K_TRIGRAMS
)

# ---------------------------------------------------------------------------
#  🔗 6.  MERGE WITH HUMAN PROFILE ➜ DICTIONARY VS NON-DICTIONARY SPLIT
# ---------------------------------------------------------------------------
def compare_to_human(gen_norm: dict, human_counts: dict, human_total_chars: float):
    """Split generated terms by whether they also occur in the human counts.

    Args:
        gen_norm: term -> stats dict containing at least "gen_freq_per_100k".
        human_counts: term -> raw count in the human profile.
        human_total_chars: character total used to normalise the human counts.

    Returns:
        ``(both, gen_only)``. Each maps a term to a copy of its generated stats extended
        with "human_count", "human_freq_per_100k" and "freq_ratio_gen/hu". The ratio is
        generated/human frequency when the human frequency is positive, 1.0 when both
        frequencies are zero, and ``math.inf`` otherwise (including every gen-only term).
    """
    shared_terms = {}
    generated_only_terms = {}

    for term, gen_stats in gen_norm.items():
        entry = dict(gen_stats)  # copy so the caller's stats are left untouched

        if term not in human_counts:
            # Present in the generated output but absent from the human profile.
            entry.update({
                "human_count": 0,
                "human_freq_per_100k": 0.0,
                "freq_ratio_gen/hu": math.inf,
            })
            generated_only_terms[term] = entry
            continue

        human_raw = human_counts[term]
        human_freq = norm_per_100k(human_raw, human_total_chars)
        gen_freq = gen_stats["gen_freq_per_100k"]

        if human_freq > 0:
            ratio = gen_freq / human_freq
        elif gen_freq == 0 and human_freq == 0:
            # Equally (non-)frequent on both sides.
            ratio = 1.0
        else:
            ratio = math.inf

        entry.update({
            "human_count": human_raw,
            "human_freq_per_100k": human_freq,
            "freq_ratio_gen/hu": ratio,
        })
        shared_terms[term] = entry

    return shared_terms, generated_only_terms

# Split the top generated n-grams into those found in the human profile ("dictionary")
# and those absent from it ("non-dictionary"); human counts are normalised with h_chars_total.
bigrams_dict, bigrams_nondict = compare_to_human(
    gen_bigrams_norm, human_bigrams, h_chars_total
)
trigrams_dict, trigrams_nondict = compare_to_human(
    gen_trigrams_norm, human_trigrams, h_chars_total
)

# ---------------------------------------------------------------------------
#  📤 7.  TIDY RESULTS → DataFrames (easy to inspect / export)
# ---------------------------------------------------------------------------
import pandas as pd

# One row per n-gram, indexed by the n-gram string.
df_bi_dict    = pd.DataFrame.from_dict(bigrams_dict, orient="index")
df_bi_nondct  = pd.DataFrame.from_dict(bigrams_nondict, orient="index")
df_tri_dict   = pd.DataFrame.from_dict(trigrams_dict, orient="index")
df_tri_nondct = pd.DataFrame.from_dict(trigrams_nondict, orient="index")

# Rank the "dictionary" frames by how over-represented each n-gram is in the generated
# text. An empty frame has no ratio column, so it is left as is.
if "freq_ratio_gen/hu" in df_bi_dict.columns and not df_bi_dict.empty:
    df_bi_dict = df_bi_dict.sort_values(by="freq_ratio_gen/hu", ascending=False)
if "freq_ratio_gen/hu" in df_tri_dict.columns and not df_tri_dict.empty:
    df_tri_dict = df_tri_dict.sort_values(by="freq_ratio_gen/hu", ascending=False)

# OPTIONAL:  save to disk
out_dir = Path("results") / "antislop_analysis"
out_dir.mkdir(parents=True, exist_ok=True)

for csv_name, frame in (
    ("bigrams__dictionary_sorted.csv", df_bi_dict),
    ("bigrams__non_dictionary.csv", df_bi_nondct),
    ("trigrams__dictionary_sorted.csv", df_tri_dict),
    ("trigrams__non_dictionary.csv", df_tri_nondct),
):
    frame.to_csv(out_dir / csv_name)

print(f"‚úîÔ∏è  Finished.  CSVs written to {out_dir.resolve()}")

Loaded 198 NLTK stopwords for 'english'.
‚úîÔ∏è  Finished.  CSVs written to /home/sam/code/ai/antislop-api/results/antislop_analysis
