In [18]:
from langgraph.graph import START, END, StateGraph
from langgraph.types import Send
from typing import TypedDict
import subprocess
import textwrap
from langchain.chat_models import init_chat_model
from typing_extensions import Annotated
import operator
import base64


import os
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any
# client below is created.
load_dotenv()
# Chat model shared by the summarization and thumbnail-prompt nodes.
llm = init_chat_model("openai:gpt-4o-mini")

# Read once here and passed explicitly to the OpenAI client in transcribe_audio.
api_key = os.getenv("OPENAI_API_KEY")


class State(TypedDict):
    """Shared state dictionary passed between the nodes of the graph.

    Each node returns a partial dict whose keys are a subset of these fields.
    """
    # Path of the input video; the only key supplied to graph.invoke().
    video_file: str
    # Path of the audio file produced by extract_audio.
    audio_file: str
    # Plain-text transcript produced by transcribe_audio.
    transcription: str
    # The three list fields below carry operator.add as Annotated metadata.
    # NOTE(review): LangGraph is expected to use it as the reducer so that
    # lists returned by parallel nodes are concatenated rather than
    # overwritten - confirm against the installed LangGraph version.
    # One summary string per transcript chunk (from summarize_chunk).
    summaries: Annotated[list[str], operator.add]
    # Image-generation prompts, one per thumbnail concept (from generate_thumbnail).
    thumbnail_prompts: Annotated[list[str], operator.add]
    # File names of the written thumbnail images (from generate_thumbnail).
    thumbnail_sketches: Annotated[list[str], operator.add]
    # Combined summary of all chunk summaries (from finalize_summary).
    final_summary: str

In [19]:
def extract_audio(state: State):
    """Extract the audio track of the input video into an MP3 file via ffmpeg.

    The output path is the input path with its final extension replaced by
    ``.mp3``. The ``atempo=2.0`` audio filter doubles the playback tempo
    (presumably to shorten the audio sent for transcription - confirm).

    Args:
        state: Graph state; only ``state["video_file"]`` is read.

    Returns:
        A partial state update ``{"audio_file": <path of the MP3 file>}``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    video_file = state["video_file"]

    # Swap only the trailing extension. A plain str.replace("mp4", "mp3")
    # would also rewrite "mp4" inside directory or file names, and would leave
    # a non-mp4 input untouched so that the output path equals the input path.
    stem, _extension = os.path.splitext(video_file)
    output_file = f"{stem}.mp3"

    command = [
        "ffmpeg",
        # Global option (overwrite existing output); ffmpeg expects global
        # options before the input/output files, not trailing after them.
        "-y",
        "-i",
        video_file,
        "-filter:a",
        "atempo=2.0",
        output_file,
    ]
    # check=True surfaces an ffmpeg failure here instead of letting the next
    # node fail later on a missing or stale audio file.
    subprocess.run(command, check=True)
    return {
        "audio_file": output_file,
    }


def transcribe_audio(state: State):
    """Transcribe the extracted audio file with the ``whisper-1`` model.

    Reads ``state["audio_file"]``, uploads it in binary mode and requests an
    English, plain-text transcription.

    Returns:
        A partial state update ``{"transcription": <transcript>}``.
    """
    whisper_client = OpenAI(api_key=api_key)

    with open(state["audio_file"], "rb") as audio_stream:
        transcript = whisper_client.audio.transcriptions.create(
            file=audio_stream,
            model="whisper-1",
            language="en",
            response_format="text",
            prompt="",
        )

    return {"transcription": transcript}


def dispatch_summarizer(state: State):
    """Fan the transcript out to one ``summarize_chunk`` run per text chunk.

    The transcript is wrapped with ``textwrap.wrap`` at a width of 500
    characters, and every piece is paired with a 1-based id.

    Returns:
        A list of ``Send`` objects targeting the ``"summarize_chunk"`` node,
        each carrying ``{"id": ..., "chunk": ...}``.
    """
    pieces = textwrap.wrap(state["transcription"], 500)
    return [
        Send("summarize_chunk", {"id": number, "chunk": piece})
        for number, piece in enumerate(pieces, start=1)
    ]


def summarize_chunk(chunk):
    """Summarize one transcript chunk with the shared chat model.

    Args:
        chunk: Payload built by ``dispatch_summarizer``: a dict with the
            1-based ``"id"`` and the ``"chunk"`` text.

    Returns:
        A partial state update ``{"summaries": [<labelled summary>]}``; the
        label embeds the chunk id.
    """
    number, text = chunk["id"], chunk["chunk"]

    # The prompt text (Korean: "Summarize the following text.") is sent to
    # the model verbatim, including its indentation.
    request = f"""
        다음의 텍스트를 요약하세요.
        
        Text: {text}
        """
    reply = llm.invoke(request)

    return {"summaries": [f"[Chunk {number} Summary]: {reply.content}"]}


def finalize_summary(state: State):
    """Merge all per-chunk summaries into a single overall summary.

    Joins ``state["summaries"]`` with newlines, embeds them in a prompt and
    asks the shared chat model for a combined summary.

    Returns:
        A partial state update ``{"final_summary": <model reply text>}``.
    """
    joined_summaries = "\n".join(state["summaries"])

    # Prompt (Korean) asks for a comprehensive summary that combines the key
    # points of the individual chunk summaries.
    reply = llm.invoke(
        f"""
    당신은 하나의 영상에서 나온 텍스트의 여러 청크들로 만든 요약본을 받았습니다.
    
    모든 핵심 포인트를 결합한 종합 요약본을 작성하세요.
    
    Individual Summaries:
    {joined_summaries}
    """
    )

    return {"final_summary": reply.content}


def dispatch_artists(state: State):
    """Fan the final summary out to three ``generate_thumbnail`` runs.

    Returns:
        A list of three ``Send`` objects targeting ``"generate_thumbnail"``,
        with ids 1, 2 and 3 and the same ``"summary"`` text.
    """
    jobs = []
    for concept_id in range(1, 4):
        payload = {"id": concept_id, "summary": state["final_summary"]}
        jobs.append(Send("generate_thumbnail", payload))
    return jobs


def generate_thumbnail(args):
    """Create one thumbnail image for the video summary and save it to disk.

    First asks the shared chat model to write a detailed image-generation
    prompt from the summary, then sends that prompt to the ``gpt-image-1``
    image model and writes the decoded result to ``thumbnail_<id>.png`` in
    the current working directory.

    Args:
        args: Payload built by ``dispatch_artists``: a dict with the concept
            ``"id"`` and the ``"summary"`` text.

    Returns:
        A partial state update with the generated prompt under
        ``"thumbnail_prompts"`` and the written file name under
        ``"thumbnail_sketches"``, each wrapped in a one-element list.
    """
    concept_id, summary = args["id"], args["summary"]

    # Prompt (Korean) asks for a detailed YouTube-thumbnail prompt covering
    # main visual elements, colour scheme, text overlay and composition.
    prompt_request = f"""
    이 영상 요약본을 기반으로, 유튜브 썸네일을 만드는 상세한 비주얼 프롬프트를 작성하세요.
    
    시청자들을 끌어들일 수 있도록 다음 내용을 포함시키는 상세한 썸네일 이미지 생성 프롬프트를 작성하세요. :
    - 메인 시각 요소
    - 색상 조합
    - 텍스트 오버레이
    - 전반적인 구도
    
    영상 요약본: {summary}
    """
    thumbnail_prompt = llm.invoke(prompt_request).content

    image_client = OpenAI()
    generation = image_client.images.generate(
        model="gpt-image-1",
        prompt=thumbnail_prompt,
        quality="low",
        moderation="low",
        size="auto",
    )

    # The image is read from the response as base64 text; decode to raw bytes.
    encoded_image = generation.data[0].b64_json
    decoded_image = base64.b64decode(encoded_image)

    filename = f"thumbnail_{concept_id}.png"
    with open(filename, "wb") as image_file:
        image_file.write(decoded_image)

    return {
        "thumbnail_prompts": [thumbnail_prompt],
        "thumbnail_sketches": [filename],
    }

In [20]:
# Assemble the pipeline: video -> audio -> transcript -> chunk summaries ->
# final summary -> thumbnails.
graph_builder = StateGraph(State)

# Register each processing step under the name that the edges and the Send()
# targets in the dispatch functions refer to.
graph_builder.add_node("extract_audio", extract_audio)
graph_builder.add_node("transcribe_audio", transcribe_audio)
graph_builder.add_node("summarize_chunk", summarize_chunk)
graph_builder.add_node("finalize_summary", finalize_summary)
graph_builder.add_node("generate_thumbnail", generate_thumbnail)

# Linear prefix: START -> extract_audio -> transcribe_audio.
graph_builder.add_edge(START, "extract_audio")
graph_builder.add_edge("extract_audio", "transcribe_audio")
# Fan-out: dispatch_summarizer returns one Send per transcript chunk, each
# targeting summarize_chunk.
graph_builder.add_conditional_edges(
    "transcribe_audio", dispatch_summarizer, ["summarize_chunk"]
)
# Fan-in: the chunk summaries feed finalize_summary.
graph_builder.add_edge("summarize_chunk", "finalize_summary")
# Second fan-out: dispatch_artists returns three Sends targeting
# generate_thumbnail.
graph_builder.add_conditional_edges(
    "finalize_summary", dispatch_artists, ["generate_thumbnail"]
)
graph_builder.add_edge("generate_thumbnail", END)

# Compile the builder into the runnable graph object used below.
graph = graph_builder.compile()


In [21]:
# Run the whole pipeline on a local video file; only "video_file" is supplied
# and every other state key is filled in by the nodes.
graph.invoke(
    {"video_file": "video.mp4"},
)

{'video_file': 'video.mp4',
 'audio_file': 'video.mp3',
 'transcription': "Running a game requires RAM, which is where the CPU needs to store the game's data for calculation. And of course, graphic assets need to be loaded, so modern games can consume a huge amount of memory. However, there was a time when such sloppy execution wasn't acceptable. There was a masterpiece called Pokemon Red, but the Game Boy it ran on was a paltry device. The CPU performance was so poor that a mistake in coding could cause an explosion. With only 8 kilobits of RAM, uploading data wasn't easy. Creating a single word file today and writing just one sentence would probably exceed 8 kilobits. So, developers at the time had to employ incredible optimization skills when coding. The Pokemon game had a feature called Pokedex, which remembered and counted the Pokemon you already seen. So in the first generation, there were 151 Pokemon, and whether or not you encountered them all had to be recorded in memory. How 