# Anthropic Interviewer dataset

Quickstart notebook to pull the transcripts from the Hugging Face dataset and do light inspection.
            


**Dataset:** `Anthropic/AnthropicInterviewer` on Hugging Face.
- Interview transcripts from 1,250 professionals (workforce=1,000, creatives=125, scientists=125).
- Data is CC-BY; code MIT. Public dataset, so no auth token needed for reading.

Run the install cell once per environment, then execute the rest.
            


In [None]:
%pip install -q pandas huggingface_hub openai scikit-learn


In [None]:
import pandas as pd
from pathlib import Path

# Prefix that is prepended to a split's relative CSV path before it is handed
# to pd.read_csv.
BASE_PATH = "hf://datasets/Anthropic/AnthropicInterviewer/"

# Split name -> CSV path relative to BASE_PATH. Insertion order is the order
# in which the splits are loaded and reported.
SPLITS = dict(
    workforce="interview_transcripts/workforce_transcripts.csv",
    creatives="interview_transcripts/creatives_transcripts.csv",
    scientists="interview_transcripts/scientists_transcripts.csv",
)

def load_split(name: str) -> pd.DataFrame:
    """Read one split's transcript CSV and tag every row with the split name.

    Args:
        name: Key into ``SPLITS`` selecting which CSV to read.

    Returns:
        The CSV contents as a DataFrame with a ``split`` column set to ``name``.

    Raises:
        KeyError: If ``name`` is not a key of ``SPLITS``.
    """
    frame = pd.read_csv(f"{BASE_PATH}{SPLITS[name]}")
    return frame.assign(split=name)

# Load every split, keyed by split name, in the order SPLITS declares them.
dfs = dict((name, load_split(name)) for name in SPLITS)

# One summary line per split: name, row count, and the column names.
for name, df in dfs.items():
    cols = ", ".join(df.columns)
    print(f"{name:10} {len(df):4} rows | columns: {cols}")
            


In [None]:
# Preview the first five rows of the workforce split.
dfs["workforce"].head(n=5)
            


In [None]:
# Stack all splits into one frame with a fresh 0..n-1 index, then show five
# rows drawn with a fixed seed so the preview is reproducible.
all_df = pd.concat(list(dfs.values()), ignore_index=True)
preview_cols = ["transcript_id", "split", "text"]
all_df.sample(n=5, random_state=42)[preview_cols]
            


In [None]:
# Character count of each transcript, summarised per split.
# assign() returns a new frame, so the previously concatenated one is not mutated.
all_df = all_df.assign(text_length=all_df["text"].str.len())
length_stats = all_df.groupby("split")["text_length"].describe()
length_stats[["count", "mean", "min", "max"]]
            


### Per-split descriptive stats
Add word-level and character-level summaries to see distribution differences per group.
    


In [None]:
# Rebuild the combined frame from the per-split frames, then attach two size
# measures per transcript: whitespace-separated token count and character count.
all_df = pd.concat(list(dfs.values()), ignore_index=True)
all_df["word_count"] = all_df["text"].str.split().str.len()
all_df["char_count"] = all_df["text"].str.len()

# Per-split count / mean / median / min / max for both measures, rounded to 2 dp.
stat_names = ["count", "mean", "median", "min", "max"]
summary = all_df.groupby("split")[["word_count", "char_count"]].agg(stat_names).round(2)
summary
    


### Top keywords per group (TF-IDF)
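Rough sense of the highest-weighted vocabulary within each group. The vectorizer is fitted separately on each split, so scores rank terms inside a group and are not directly comparable across groups. Adjust `top_n` or stop words as needed.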
    


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

top_n = 15  # number of highest-scoring terms to keep per split
results = {}
for split, df in dfs.items():
    # A fresh vectorizer is fitted on each split, so the vocabulary and IDF
    # weights are local to that split.
    vec = TfidfVectorizer(stop_words="english", max_features=5000)
    matrix = vec.fit_transform(df["text"])
    terms = vec.get_feature_names_out()
    # Sum each term's TF-IDF weight over all transcripts in the split.
    scores = matrix.sum(axis=0).A1
    # Indices of the top_n largest totals, highest first.
    order = scores.argsort()[::-1][:top_n]
    results[split] = list(zip(terms[order], scores[order].tolist()))

# Report: one header per split followed by its ranked (term, score) pairs.
for split, items in results.items():
    print(f"\n{split.title()} top {top_n} tf-idf terms:")
    for term, score in items:
        print(f"  {term:20s} {score:.2f}")


### LLM themes per group
Set `OPENAI_API_KEY` in your environment before running. The cell below samples transcripts from each split (8 by default, with a fixed seed) and asks an OpenAI chat model (default `gpt-4o`) for 5 themes with supporting evidence.
            


In [None]:
from openai import OpenAI
import os

# Fail fast with a clear message when the key is unset or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Set OPENAI_API_KEY in your environment before running this cell.")

# Shared client, plus the delimiter placed between transcripts in the prompt.
client = OpenAI(api_key=api_key)
separator = "\n\n---\n\n"

def summarize_split(split: str, sample_size: int = 8, model: str = "gpt-4o") -> str:
    """Ask a chat model for five themes found in a sample of one split's transcripts.

    Args:
        split: Key into ``dfs`` naming the group to analyse.
        sample_size: Maximum number of transcripts to include in the prompt.
            Clamped to the number of rows available, because
            ``DataFrame.sample`` raises ``ValueError`` when asked for more
            rows than exist (sampling is without replacement).
        model: Model name passed to the chat completions endpoint.

    Returns:
        The model's markdown reply, or an empty string if the reply carries
        no text content.

    Raises:
        KeyError: If ``split`` is not a key of ``dfs``.
    """
    frame = dfs[split]
    n_rows = min(sample_size, len(frame))
    # Fixed seed keeps the sampled transcripts stable across runs.
    subset = frame.sample(n_rows, random_state=42)["text"].tolist()
    prompt = f"""
You are analyzing qualitative interview transcripts from the {split} group.
Extract 5 themes. For each theme, provide a short label and 1-2 bullet examples grounded in the text.
Return concise markdown.

Transcripts (each separated by ---):
{separator.join(subset)}
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    # The message content is optional in the response; normalise a missing
    # value to "" so callers can safely print or str.join the result.
    return response.choices[0].message.content or ""

# Iterate the loaded splits instead of a second hard-coded list of names, so
# this loop cannot drift out of sync with SPLITS. Each iteration issues one
# LLM request via summarize_split.
for split in dfs:
    print(f"\n### {split.title()} themes\n")
    print(summarize_split(split))


### Save per-group LLM themes to markdown
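Regenerates the themes for each split (this calls the LLM again, so the text may differ from the output printed above) and writes them to `analysis/llm_group_analysis.md` for easy reference.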


In [None]:
from pathlib import Path

output_dir = Path("analysis")
output_dir.mkdir(exist_ok=True)  # exist_ok makes re-running the cell safe
out_path = output_dir / "llm_group_analysis.md"

# NOTE: summarize_split is called again here, so this issues a new LLM request
# per split rather than reusing any previously printed output.
# Iterating dfs keeps the set of splits in sync with what was actually loaded.
sections = []
for split in dfs:
    sections.append(f"## {split.title()} themes\n")
    sections.append(summarize_split(split))

content = "\n\n".join(sections)
# Explicit UTF-8: without it write_text uses the locale's default encoding,
# which can fail on non-ASCII characters in the model output.
out_path.write_text(content, encoding="utf-8")
print(f"Wrote {out_path} ({len(content)} chars)")


In [None]:
# Optional: save each loaded split as data/<name>_transcripts.csv.
# The frames are written as held in dfs (including the added "split" column),
# without the DataFrame index.
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)
for name in dfs:
    df = dfs[name]
    dest = output_dir.joinpath(f"{name}_transcripts.csv")
    df.to_csv(dest, index=False)
    print(f"Wrote {dest}")
            
