# Anthropic Interviewer dataset

This quickstart notebook pulls the interview transcripts from the Hugging Face dataset and does some light inspection.
            


**Dataset:** `Anthropic/AnthropicInterviewer` on Hugging Face.
- Interview transcripts from 1,250 professionals (workforce=1,000, creatives=125, scientists=125).
- The data is licensed under CC-BY and the code under MIT. The dataset is public, so no auth token is needed to read it.

Run the install cell once per environment, then execute the rest.
            


In [None]:
# pandas reads the CSVs; huggingface_hub is needed for the hf:// paths used below
%pip install -q pandas huggingface_hub


In [None]:
import pandas as pd
from pathlib import Path

# Repository-relative CSV path for each split, keyed by split name.
# Every file follows the pattern interview_transcripts/<split>_transcripts.csv.
SPLITS = {
    split: f"interview_transcripts/{split}_transcripts.csv"
    for split in ("workforce", "creatives", "scientists")
}
# URL prefix of the dataset repository; a SPLITS value is appended to it.
BASE_PATH = "hf://datasets/Anthropic/AnthropicInterviewer/"

def load_split(name: str) -> pd.DataFrame:
    """Read one split's transcript CSV and tag its rows with the split name.

    Args:
        name: Key into ``SPLITS`` selecting which CSV to read.

    Returns:
        The CSV contents as a DataFrame, with a ``split`` column set to
        ``name`` on every row.

    Raises:
        KeyError: If ``name`` is not a key of ``SPLITS``.
    """
    frame = pd.read_csv(f"{BASE_PATH}{SPLITS[name]}")
    return frame.assign(split=name)

# Load every split up front, then print a one-line summary for each
dfs = {split_name: load_split(split_name) for split_name in SPLITS}
for split_name, frame in dfs.items():
    column_list = ", ".join(frame.columns)
    print(f"{split_name:10} {len(frame):4} rows | columns: {column_list}")
            


In [None]:
# Preview the first five rows of the workforce split
dfs["workforce"].head(n=5)
            


In [None]:
# Stack the splits into one frame, then show a reproducible five-row sample
all_df = pd.concat(list(dfs.values()), ignore_index=True)
all_df.sample(n=5, random_state=42).loc[:, ["transcript_id", "split", "text"]]
            


In [None]:
# Per-split summary of transcript length, measured in characters
all_df = all_df.assign(text_length=all_df["text"].str.len())
(
    all_df.groupby("split")["text_length"]
    .describe()
    .loc[:, ["count", "mean", "min", "max"]]
)
            


In [None]:
# Optional: save each split as data/<split>_transcripts.csv
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)
for split_name, frame in dfs.items():
    dest = output_dir.joinpath(f"{split_name}_transcripts.csv")
    frame.to_csv(dest, index=False)
    print(f"Wrote {dest}")
            
