In [None]:
# Install/upgrade the third-party packages this notebook imports
# (sentence_transformers and langchain_core).
# NOTE(review): `-U` with no version pin means a fresh run may resolve to
# different releases than the ones this notebook was written against —
# consider pinning exact versions.
%pip install -U sentence-transformers
%pip install -U langchain_core

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [12]:
from pydantic import BaseModel
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer

In [13]:
# One transcript segment. The field names mirror the keys requested in the
# "OUTPUT FORMAT" section of the prompt built by Chunker.CreatePrompt.
class Chunk(BaseModel):
    # Short summary of the segment (the prompt asks for 2-5 words).
    title: str
    # Text of the segment (the prompt asks for the exact, consecutive sentences).
    content: str
    # Important terms for the segment (the prompt asks for 3-5).
    keywords: list[str]
    # Names mentioned in the segment.
    named_entities: list[str]
    # Estimated time span as free text; the prompt's example is "00:00-01:30".
    # Declared as a plain str, so no format is enforced here.
    timestamp_range: str

# Top-level structured-output schema: Chunker.ChunkTranscript passes this class
# to `llm.with_structured_output(...)` and returns the `chunks` list.
class ChunkResponse(BaseModel):
    # Segments produced for the transcript.
    # NOTE(review): presumably in transcript order — the prompt asks for
    # consecutive sentences per chunk but does not state an ordering.
    chunks: list[Chunk]

In [14]:
class Chunker:
  """LLM-driven transcript chunker.

  Reads a transcript file, formats it into a chunking prompt and asks a chat
  model for structured output matching ``ChunkResponse``.

  Attributes:
    embedder: ``SentenceTransformer`` built from ``embed_model``.
    max_tokens: Constructor argument, stored as-is.
    sim_threshold: ``similarity_threshold`` constructor argument, stored as-is.
    coh_threshold: ``coherence_threshold`` constructor argument, stored as-is.
    transcript: Text of the most recently loaded transcript; ``""`` if nothing
      has been loaded yet or the last load failed.

  NOTE(review): none of the methods defined here read ``embedder``,
  ``max_tokens``, ``sim_threshold`` or ``coh_threshold`` — presumably they are
  reserved for a later embedding-based step; confirm.
  """

  def __init__(
    self,
    embed_model: str = "all-MiniLM-L6-v2",
    max_tokens: int = 2_000,
    similarity_threshold: float = 0.82,
    coherence_threshold: float = 0.75,
  ):
    """Store the configuration and instantiate the embedding model.

    Args:
      embed_model: Model name handed to ``SentenceTransformer``.
      max_tokens: Stored on the instance as ``max_tokens``.
      similarity_threshold: Stored on the instance as ``sim_threshold``.
      coherence_threshold: Stored on the instance as ``coh_threshold``.
    """
    self.embedder = SentenceTransformer(embed_model)
    self.max_tokens = max_tokens
    self.sim_threshold = similarity_threshold
    self.coh_threshold = coherence_threshold
    # Always present, so code that reads self.transcript never hits an
    # AttributeError before the first load.
    self.transcript = ""

  def TranscriptTextMaker(self, file_path):
    """Read the transcript at ``file_path`` as UTF-8 text.

    The text is also kept on ``self.transcript``.

    Args:
      file_path: Path of the transcript file to read.

    Returns:
      The file contents, or ``""`` when the file is missing or cannot be read
      (the problem is printed, not raised).
    """
    # Bound up front so the failure paths below still have something to return.
    transcript = ""
    try:
      with open(file_path, "r", encoding="utf-8") as f:
        transcript = f.read()
      print(f"Transcript loaded ({len(transcript)} characters)")
    except FileNotFoundError:
      # Report the path that was actually requested.
      print(f"Error: transcript file not found: {file_path}")
    except Exception as e:
      print(f"Error loading transcript: {str(e)}")

    self.transcript = transcript
    return transcript

  def CreatePrompt(self):
    """Build the chat prompt that instructs the model how to chunk.

    Returns:
      A ``ChatPromptTemplate`` with a single ``{input_text}`` placeholder for
      the transcript. The doubled braces in the OUTPUT FORMAT section are
      escaped literal braces, not template variables.
    """
    return ChatPromptTemplate.from_template("""
    **SYSTEM PROMPT**
    You are a transcript processing expert. The following transcript needs to be chunked very ingelligently and logically. Ensure sensible segments and structure to be later provided as context to answer questions.

    **INSTRUCTIONS**
    1. Create as many or as few chunks as needed
    2. Each chunk should contain consecutive sentences
    3. For each chunk provide:
      - title: 2-5 word summary
      - content: exact sentences
      - keywords: 3-5 important terms
      - named_entities: any mentioned names
      - timestamp_range: estimate like "00:00-01:30"

    **TRANSCRIPT**
    {input_text}

    **OUTPUT FORMAT**
    {{
      "chunks": [
        {{
          "title": "Summary",
          "content": "Actual sentences",
          "keywords": ["term1", "term2"],
          "named_entities": ["Name"],
          "timestamp_range": "00:00-01:30"
        }}
      ]
    }}
    """)

  def ChunkTranscript(self, llm, file_path, output_result : bool = False) -> list[Chunk]:
    """Load a transcript and have ``llm`` split it into chunks.

    Args:
      llm: Chat model object exposing ``with_structured_output``.
      file_path: Path of the transcript file to chunk.
      output_result: When true, print every generated chunk.

    Returns:
      The chunks from the model's ``ChunkResponse``; an empty list when the
      transcript could not be loaded, is blank, or the model call fails
      (the failure is printed, not raised).
    """
    transcript = self.TranscriptTextMaker(file_path)
    if not transcript.strip():
      # Nothing to send; avoid a pointless model call.
      print("No transcript text available; skipping chunking")
      return []

    try:
      prompt = self.CreatePrompt()
      structured_llm = llm.with_structured_output(ChunkResponse)
      chain = prompt | structured_llm
      response = chain.invoke({"input_text": transcript})
      # Read inside the try: an unexpected response object (no .chunks)
      # is treated as a failed call too.
      chunks = response.chunks
    except Exception as e:
      print(f"structured output failed: {str(e)}")
      return []

    print(f"Generated {len(chunks)} chunks")
    if output_result:
      for i, chunk in enumerate(chunks, 1):
        print(f"  Chunk {i}: {chunk.title} ({chunk.timestamp_range}), {chunk.content}")

    return chunks


