In [15]:
from anthropic import Anthropic
import json
import re
from pprint import pprint
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List
from collections import defaultdict

In [16]:
load_dotenv()
client = Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],
)

class StoryArc(BaseModel):
    label: str
    description: str
    characters: List[str]
    themes: List[str]

class TranscriptAnalysis(BaseModel):
    story_arcs: List[StoryArc] = Field(..., alias='story_arcs')

def extract_json(response):
    json_start = response.index("[")
    json_end = response.rfind("]")
    return json.loads(response[json_start:json_end + 1])

In [17]:
all_story_arcs = []

transcripts_dir = 'transcripts'
files = os.listdir(transcripts_dir)
files.sort()
max_retries = 3
for file_name in files:
    with open(os.path.join(transcripts_dir, file_name), 'r') as file:
        EPISODE_TRANSCRIPT = file.read()
    
    retries = 0
    while retries < max_retries:
        try:

            message = client.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=1024,
            temperature=0,
            system="You are good at extracting story arcs from transcript of TV episodes. You will be a given a transcript of an episode. Your task is to extract the main story arcs from this episode. A story arc is a narrative thread that runs through part or all of the episode, involving certain characters and exploring certain themes. An arc has a beginning, middle and end. It will often center around a conflict or challenge that the characters need to overcome. \n\nFor example, in an episode of Friends, one story arc might be:\n- Joey and Chandler centers around a humorous fight over a chair. The conflict begins when Chandler leaves the room for a moment and Joey takes his seat. When Chandler returns and demands the seat back, Joey refuses, leading to a series of tit-for-tat pranks between the two. The situation escalates comically when Joey decides to make a point about not sharing by wearing all of Chandler's clothes at once, declaring, \"Look at me, I'm Chandler. Could I BE wearing any more clothes?\" This is done as a form of exaggerated revenge for Chandler hiding Joey's underwear, and it leads to a memorable and funny moment involving physical comedy and slapstick humor.\n\nPlease carefully read through the transcript and identify the main story arcs. For each one, think about:\n<scratchpad>\n- What is the key conflict or challenge driving this arc? How is it set up and how is it resolved?\n- Which characters are most central to this arc? What role do they each play?\n- What deeper themes, if any, are explored through this storyline? \n</scratchpad>\n\nThen, output the story arcs in the following JSON format:\n\n<example>\n[\n  {\n    \"label\": \"Joey and Chandler's Fight\",\n    \"description\": \"Joey and Chandler bicker over a chair leading to escalating pranks including Joey wearing all of Chandler's clothes as an act of revenge.\",\n    \"characters\": [\"Joey\", \"Chandler\"],\n    \"themes\": [\"Friendship\", \"Conflict\", \"Petty revenge\"]\n  },\n  ...\n]\n</example>\n\n \n\nMake sure the story arc objects are printed in valid JSON format. Only include the valid JSON and do not include any other text. I just want the JSON from you.",
            messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"\n\n<episode_transcript>\n{EPISODE_TRANSCRIPT}\n</episode_transcript>\n\n"
                            }
                        ]
                    }
                ]
            ).content[0].text
            # print(message)

            json_output = extract_json(message)
            if json_output is not None:
                print("JSON Output:")
                pprint(json_output)
                all_story_arcs.extend(json_output)
                break
            else:
                retries += 1
                print(f"Retrying API call... Attempt {retries}/{max_retries}")
        except Exception as e:
            retries += 1
            print(f"An error occurred: {e}. Retrying... Attempt {retries}/{max_retries}")

if all_story_arcs:
    print("First Story Arc:")
    pprint(all_story_arcs[0])
else:
    print("No story arcs were extracted.")

JSON Output:
[{'characters': ['Josh', 'Mary Marsh', 'Al Caldwell', 'Toby', 'C.J.'],
  'description': 'Josh makes controversial remarks about Christian '
                 'conservatives on a TV talk show, angering figures like Mary '
                 'Marsh and Al Caldwell. This leads to a confrontation where '
                 'Josh is pressured to apologize and make concessions, but he '
                 'resists until the President intervenes.',
  'label': "Josh's Controversial TV Appearance",
  'themes': ['Political conflict', 'Freedom of speech', 'Moral values']},
 {'characters': ['President Bartlet', 'Leo', 'Mrs. Landingham'],
  'description': 'The President is injured in a bicycle accident, which '
                 'becomes a source of embarrassment and media attention for '
                 "the White House. This incident highlights the President's "
                 'tendency to act impulsively when upset.',
  'label': "The President's Bicycle Accident",
  'themes': ['Leadershi

In [18]:
transcript_analysis = TranscriptAnalysis(story_arcs=all_story_arcs)
for arc in transcript_analysis.story_arcs:
    print(arc.label)

Josh's Controversial TV Appearance
The President's Bicycle Accident
Sam's Mistaken Identity
Mandy's New Job
Bartlet's Joke Causes Backlash
Sam's Encounter with a Call Girl
Mandy Returns to the White House
Tragedy Strikes as Morris Tolliver Dies
The President's Struggle with Proportional Response
Sam's Relationship with a Call Girl
Charlie Young Joins the White House Staff
Passing the Gun Control Bill
Toby's Stock Market Windfall
Leo's Marital Troubles
Bartlet's Basketball Game and Rivalry with Toby
Josh's Distress Over the NSC Card
Bartlet's Chili Night and Appreciation for His Staff
The President's Daughter Zoey is Threatened
The White House Staff Plays Poker
The Census Amendment Debate
The White House Staff Goes Out for Drinks
Leo's Marital Troubles


In [19]:
for arc in transcript_analysis.story_arcs:
    print(arc.themes)

['Political conflict', 'Freedom of speech', 'Moral values']
['Leadership', 'Vulnerability', 'Impulse control']
['Moral ambiguity', 'Incompetence', 'Fatherhood']
['Changing allegiances', 'Personal/professional conflicts']
['Presidential leadership', 'Media relations', 'Unintended consequences']
['Ethics', 'Morality', 'Redemption']
['Teamwork', 'Interpersonal dynamics', 'Staffing decisions']
['Tragedy', 'Foreign policy', 'Presidential decision-making']
['Use of military force', 'Proportionality in warfare', 'Presidential decision-making']
['Ethics and morality in government', 'Reputation and public perception']
['Opportunity and mentorship', 'Diversity in government']
['Gun control', 'Legislative process', 'Compromise', 'Power dynamics']
['Ethics', 'Perception vs. reality', 'Accountability']
['Work-life balance', 'Marriage', 'Addiction']
['Friendship', 'Rivalry', 'Competitiveness']
['Loyalty', 'Friendship', 'Mortality']
['Teamwork', 'Appreciation', 'Camaraderie']
['Fatherhood', 'Security

In [20]:
class ThemeCluster(BaseModel):
    theme_cluster_name: str
    themes: List[str]

class ThemeClustersAnalysis(BaseModel):
    theme_clusters: List[ThemeCluster]


In [21]:
THEMES = []

for arc in transcript_analysis.story_arcs:
    THEMES.extend(arc.themes)

# Remove duplicates
THEMES = list(set(THEMES))

print(THEMES)

['Impulse control', 'Perception vs. reality', 'Incompetence', 'Compromise', 'Use of military force', 'Reputation and public perception', 'Power dynamics', 'Fatherhood', 'Competitiveness', 'Camaraderie', 'Gun control', 'Presidential leadership', 'Ethics', 'Moral values', 'Marriage', 'Rivalry', 'Freedom of speech', 'Privilege', 'Leadership', 'Personal/professional conflicts', 'Ethics and morality in government', 'Accountability', 'Moral ambiguity', 'Interpersonal dynamics', 'Legislative process', 'Diversity in government', 'Friendship', 'Opportunity and mentorship', 'Media relations', 'Security', 'Policymaking', 'Presidential decision-making', 'Bipartisanship', 'Addiction', 'Responsibility', 'Redemption', 'Teamwork', 'Morality', 'Tragedy', 'Loyalty', 'Foreign policy', 'Proportionality in warfare', 'Political conflict', 'Appreciation', 'Changing allegiances', 'Vulnerability', 'Mortality', 'Work-life balance', 'Constitutional interpretation', 'Staffing decisions', 'Unintended consequences'

In [23]:
clustering_response = client.messages.create(
    model="claude-3-haiku-20240307",
    max_tokens=1000,
    temperature=0,
    system="Your task is to group these themes into clusters based on their semantic similarity. Themes that are closely related in meaning should be grouped together.\n\n<scratchpad>\nTo cluster the themes:\n1. Read through the full list of themes to get an overall sense of the topics covered\n2. Identify major thematic categories that multiple themes fall under\n3. Go through the list and assign each theme to the most relevant category \n4. For any themes that don't fit well into the main categories, consider if they warrant their own category or are outliers\n5. Review the categories and their assigned themes to assess if the groupings make sense\n6. Come up with concise, descriptive names for each finalized category\n</scratchpad>\n\nPlease return your output as a JSON array, with each object in the array containing:\n- theme_cluster_name (string): A descriptive name for the cluster of themes \n- themes (list of strings): The themes that have been grouped into that cluster\n\nEnclose your final JSON output inside <result> tags.",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Here is a list of themes extracted from story arcs of TV series episodes:\n\n<themes>\n{THEMES}\n</themes>"
                }
            ]
        }
    ]
).content[0].text
clustering_json = extract_json(clustering_response)

In [24]:
pprint(clustering_json)
theme_clusters_analysis = ThemeClustersAnalysis(theme_clusters=clustering_json)

for cluster in theme_clusters_analysis.theme_clusters:
    print(cluster.theme_cluster_name)

[{'theme_cluster_name': 'Government and Politics',
  'themes': ['Use of military force',
             'Reputation and public perception',
             'Power dynamics',
             'Presidential leadership',
             'Ethics and morality in government',
             'Accountability',
             'Legislative process',
             'Diversity in government',
             'Media relations',
             'Policymaking',
             'Presidential decision-making',
             'Bipartisanship',
             'Foreign policy',
             'Proportionality in warfare',
             'Political conflict',
             'Constitutional interpretation',
             'Staffing decisions']},
 {'theme_cluster_name': 'Interpersonal Dynamics',
  'themes': ['Compromise',
             'Camaraderie',
             'Competitiveness',
             'Rivalry',
             'Interpersonal dynamics',
             'Friendship',
             'Opportunity and mentorship',
             'Teamwork',
          

In [25]:
# Step 1: Map each theme to its corresponding cluster
theme_to_cluster = {}
for cluster in theme_clusters_analysis.theme_clusters:
    for theme in cluster.themes:
        theme_to_cluster[theme] = cluster.theme_cluster_name

# Step 2: Create a mapping of characters to the themes present in the arcs they are involved in
character_to_themes = defaultdict(list)
for arc in transcript_analysis.story_arcs:
    for character in arc.characters:
        character_to_themes[character].extend(arc.themes)

# Step 3: Determine the most common theme cluster for each character
character_to_theme_cluster = {}
for character, themes in character_to_themes.items():
    # Count the occurrences of each cluster for the current character
    cluster_count = defaultdict(int)
    for theme in themes:
        if theme in theme_to_cluster:
            cluster_count[theme_to_cluster[theme]] += 1

    # Find the most common cluster for the current character
    if cluster_count:
        character_to_theme_cluster[character] = max(cluster_count, key=cluster_count.get)

for character, cluster_name in character_to_theme_cluster.items():
    print(f"{character}: {cluster_name}")

Josh: Interpersonal Dynamics
Mary Marsh: Government and Politics
Al Caldwell: Government and Politics
Toby: Government and Politics
C.J.: Interpersonal Dynamics
President Bartlet: Government and Politics
Leo: Interpersonal Dynamics
Mrs. Landingham: Interpersonal Dynamics
Sam: Ethics and Morality
Laurie: Personal and Professional Challenges
Mallory: Ethics and Morality
Mandy: Interpersonal Dynamics
Lloyd Russell: Interpersonal Dynamics
Bartlet: Interpersonal Dynamics
Laurie/Brittany: Ethics and Morality
Leo McGarry: Government and Politics
Admiral Fitzwallace: Government and Politics
Sam Seaborn: Government and Politics
C.J. Cregg: Government and Politics
Charlie Young: Interpersonal Dynamics
Josh Lyman: Interpersonal Dynamics
Hoynes: Government and Politics
Katzenmoyer: Government and Politics
Wick: Government and Politics
Tillinghouse: Government and Politics
Leela: Ethics and Morality
Jenny: Personal and Professional Challenges
Charlie: Interpersonal Dynamics
Zoey: Interpersonal Dyna