# CRS Data Checks
Quick stats for CSVs and folders of cleaned CRS data.

- Tokens in a single CSV (sum; uses tiktoken if available, else words)
- Number of reports (rows) in a folder of clean_*.csv
- Number above a minimum word threshold
- Total token estimate across the folder


In [4]:
import os, csv, glob, json
from typing import List

# Paths and knobs — edit these:
csv_path = '../data/firstN/ai_first_10000__gemini-2.5-flash-lite.csv'  # a single CSV
folder = '../data/1'                                 # folder with clean_*.csv files
min_words = 3000                                   # threshold for 'above min'
tokenizer_hint = 'gpt-4o-mini'                    # tiktoken model hint if installed

# Try to import tiktoken (optional)
try:
    import tiktoken  # type: ignore
except Exception:
    tiktoken = None

def get_tokenizer(model_hint: str):
    if tiktoken is not None:
        try:
            enc = tiktoken.encoding_for_model(model_hint)
            return lambda s: len(enc.encode(s or ''))
        except Exception:
            try:
                enc = tiktoken.get_encoding('cl100k_base')
                return lambda s: len(enc.encode(s or ''))
            except Exception:
                pass
    return lambda s: len((s or '').split())

tok = get_tokenizer(tokenizer_hint)


## Tokens in a single CSV

In [5]:
single_tokens = 0
single_rows = 0
if os.path.isfile(csv_path):
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            single_rows += 1
            text = row.get('text') or ''
            # if token_est available, prefer it; else compute
            token_est = row.get('token_est')
            try:
                single_tokens += int(token_est) if token_est is not None else tok(text)
            except Exception:
                single_tokens += tok(text)
else:
    print(f'CSV not found: {csv_path}')

print(json.dumps({'csv_path': csv_path, 'rows': single_rows, 'tokens': int(single_tokens)}, indent=2))


{
  "csv_path": "../data/firstN/ai_first_10000__gemini-2.5-flash-lite.csv",
  "rows": 1,
  "tokens": 10000
}


## Folder stats (clean_*.csv)

In [6]:
folder_rows = 0
folder_above = 3000
folder_tokens = 0
file_breakdown: List[dict] = []
paths = sorted(glob.glob(os.path.join(folder, 'clean_*.csv')))
for p in paths:
    rows = 0
    toks = 0
    above = 0
    with open(p, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            rows += 1
            text = row.get('text') or ''
            wc = len(text.split())
            if wc >= int(min_words):
                above += 1
            token_est = row.get('token_est')
            try:
                toks += int(token_est) if token_est is not None else tok(text)
            except Exception:
                toks += tok(text)
    folder_rows += rows
    folder_above += above
    folder_tokens += toks
    file_breakdown.append({'file': p, 'rows': rows, 'above_min': above, 'tokens': int(toks)})

summary = {
    'folder': folder,
    'files': len(paths),
    'rows': folder_rows,
    'above_min': folder_above,
    'tokens': int(folder_tokens),
}
print(json.dumps(summary, indent=2))
print('Per-file breakdown (first 10):')
for rec in file_breakdown[:10]:
    print(json.dumps(rec, indent=2))


{
  "folder": "../data/1",
  "files": 8,
  "rows": 263,
  "above_min": 3216,
  "tokens": 1518830
}
Per-file breakdown (first 10):
{
  "file": "../data/1/clean_0.csv",
  "rows": 2,
  "above_min": 2,
  "tokens": 17783
}
{
  "file": "../data/1/clean_1.csv",
  "rows": 50,
  "above_min": 3,
  "tokens": 14499
}
{
  "file": "../data/1/clean_17.csv",
  "rows": 26,
  "above_min": 26,
  "tokens": 189842
}
{
  "file": "../data/1/clean_30.csv",
  "rows": 47,
  "above_min": 47,
  "tokens": 339773
}
{
  "file": "../data/1/clean_31.csv",
  "rows": 50,
  "above_min": 50,
  "tokens": 365087
}
{
  "file": "../data/1/clean_41.csv",
  "rows": 15,
  "above_min": 15,
  "tokens": 100675
}
{
  "file": "../data/1/clean_50.csv",
  "rows": 30,
  "above_min": 30,
  "tokens": 200225
}
{
  "file": "../data/1/clean_57.csv",
  "rows": 43,
  "above_min": 43,
  "tokens": 290946
}
