In [1]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv

In [2]:
import os
import re
from getpass import getpass

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

In [3]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

video_id = "bMgfsdYoEEo"
ytt_api = YouTubeTranscriptApi()

# Clear any earlier result first: if the fetch below fails, later cells must not
# silently reuse a transcript left in the kernel by a previous run (possibly for a
# different video_id). With None they fail at their first use instead.
transcript_data = None
transcript = None

try:
    # `languages` is a priority list: prefer the en-US track and fall back to
    # generic English, so "No English transcript found." is only reported when
    # neither track exists.
    transcript_data = ytt_api.fetch(video_id, languages=['en-US', 'en'])
    # Flatten the timed snippets into one string for saving and chunking later.
    transcript = " ".join(chunk.text for chunk in transcript_data)
    print(transcript)
except TranscriptsDisabled:
    print("No captions available for this video.")
except NoTranscriptFound:
    print("No English transcript found.")


- They're both hiding something. - The Devil
has come to Pennsylvania. - Jack and Janet Smurl
claim that an evil presence has found its way
into their home. - Eight people
live in this house. Eight people
have experienced this. Eight people are not crazy. - So, when did this start? - Uh, Heather's confirmation.
- Seventh of September. - There's something
in the attic. - Ed. There's an evil here. Something I've felt before. [haunting music] [thunder crashing] This thing in your house
is a demon. It's the first one
that we've ever encountered. We were young. We were scared. We ran away. And after all these years... [haunting music] it wasn't done with our family. [tapping] - [gasps]
- [tapping] - St. Michael, defend us. Be our safeguard
against the Devil. In nomine Patris, et Filii, et Spiritus Sanctus. [screams] By the power
of our Lord Jesus Christ! - [screaming]
- [screaming] - I don't know
how to protect you this time. - [evil laughter]
- [whimpers] Lorraine! - Ed, get down here! - L

In [4]:
# Inspect the raw fetched object: a sequence of snippets, each carrying its text,
# start time and duration.
transcript_data

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text="- They're both hiding something.", start=1.401, duration=1.535), FetchedTranscriptSnippet(text='- The Devil\nhas come to Pennsylvania.', start=3.103, duration=3.036), FetchedTranscriptSnippet(text='- Jack and Janet Smurl\nclaim that an evil presence', start=7.841, duration=2.269), FetchedTranscriptSnippet(text='has found its way\ninto their home.', start=10.277, duration=1.335), FetchedTranscriptSnippet(text='- Eight people\nlive in this house.', start=11.778, duration=2.47), FetchedTranscriptSnippet(text='Eight people\nhave experienced this.', start=14.848, duration=2.336), FetchedTranscriptSnippet(text='Eight people are not crazy.', start=18.385, duration=2.336), FetchedTranscriptSnippet(text='- So, when did this start?', start=26.46, duration=2.035), FetchedTranscriptSnippet(text="- Uh, Heather's confirmation.\n- Seventh of September.", start=28.662, duration=2.336), FetchedTranscriptSnippet(text="- There's something\nin the

In [5]:
# Save transcript as plain text (.txt) file
with open(f"{video_id}_transcript.txt", "w", encoding="utf-8") as txt_file:
    txt_file.write(transcript)

# Save transcript as a PDF (.pdf) file using FPDF
try:
    from fpdf import FPDF
except ImportError:
    import sys
    !{sys.executable} -m pip install fpdf
    from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.set_font("Arial", size=12)

# Optionally, split long transcript into lines for clean PDF formatting
for line in transcript.split('\n'):
    pdf.multi_cell(0, 10, line)
pdf.output(f"{video_id}_transcript.pdf")

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=89990a7ee8ce6363605fb509281c742e7da81defaca70c3405fdc749e0f91e0c
  Stored in directory: /root/.cache/pip/wheels/6e/62/11/dc73d78e40a218ad52e7451f30166e94491be013a7850b5d75
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


''

In [6]:
def search_transcript_with_timestamps(transcript_chunks, query, window=30):
    """
    Search transcript chunks for a keyword/phrase and return snippets with timestamps.

    The match is case-insensitive and literal (the query is never interpreted as a
    regular expression). Each chunk is searched on its own, so a phrase that
    straddles two chunks is not found, and at most one result — the first
    occurrence — is reported per chunk.

    Args:
      transcript_chunks: iterable of transcript chunks, each with .text, .start (seconds)
      query: search keyword or phrase (string); an empty query matches nothing
      window: number of characters before and after match to include in snippet

    Returns:
      List of dictionaries with 'timestamp' (in seconds) and 'snippet' (str with **highlighted** query),
      in the order the chunks were supplied.
    """
    # An empty string is "found" in every chunk; treat it as no search at all
    # instead of returning one "****" hit per chunk.
    if not query:
        return []

    # Match against the ORIGINAL text so the offsets used for slicing are always
    # valid for it. (Offsets taken from text.lower() can be shifted, because
    # lower-casing may change the length of some Unicode strings.)
    pattern = re.compile(re.escape(query), re.IGNORECASE)

    results = []
    for chunk in transcript_chunks:
        text = chunk.text
        match = pattern.search(text)
        if match is None:
            continue
        match_start, match_end = match.span()
        snippet_start = max(0, match_start - window)
        snippet_end = min(len(text), match_end + window)
        snippet = (text[snippet_start:match_start] + "**" +
                   text[match_start:match_end] + "**" +
                   text[match_end:snippet_end])
        results.append({
            "timestamp": chunk.start,
            "snippet": snippet
        })
    return results


In [7]:
# Demo: look a phrase up in the fetched transcript and list each hit with its time.
search_query = "They're both hiding"  # Replace with any user input

search_results = search_transcript_with_timestamps(transcript_data, search_query)

if search_results:
    print(f"Found {len(search_results)} matches for '{search_query}':\n")
    for match_number, hit in enumerate(search_results, start=1):
        # Render the offset (seconds from the start of the video) as m:ss.
        whole_minutes, leftover_seconds = divmod(hit['timestamp'], 60)
        print(f"Match {match_number} at {int(whole_minutes)}:{int(leftover_seconds):02d} -> {hit['snippet']}\n")
else:
    print(f"No matches found for '{search_query}'.")


Found 1 matches for 'They're both hiding':

Match 1 at 0:01 -> - **They're both hiding** something.



In [8]:
# Split the transcript into overlapping pieces for embedding: chunk_size=1000 caps
# the size of each piece and chunk_overlap=200 lets neighbouring pieces share text,
# so a sentence cut at a boundary still appears whole in one of them.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# create_documents wraps each piece in a Document (text in .page_content).
chunks = splitter.create_documents([transcript])

In [9]:
# How many chunks the splitter produced.
len(chunks)

2

In [10]:
# Peek at the second chunk to check its content and the overlap with the first one.
chunks[1]

Document(metadata={}, page_content="of our Lord Jesus Christ! - [screaming]\n- [screaming] - I don't know\nhow to protect you this time. - [evil laughter]\n- [whimpers] Lorraine! - Ed, get down here! - Lorraine!\n- We need your help! - Lorraine! [lullaby melody]")

In [15]:
# OpenAIEmbeddings (and ChatOpenAI further down) read the API key from the
# OPENAI_API_KEY environment variable when none is passed explicitly. No visible
# cell in this notebook sets it, so on a fresh kernel ask for it once with hidden
# input instead of failing; the key is never written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embed every chunk and build an in-memory FAISS index over the resulting vectors.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(chunks, embeddings)

In [16]:
# Show the mapping from FAISS index position to docstore id (one entry per indexed chunk).
vector_store.index_to_docstore_id

{0: '894cdada-5e68-4f78-9d0c-e21dbdbf1214',
 1: 'f8545db8-2784-402f-8aff-c34b93173b58'}

In [17]:
# Expose the vector store as a similarity retriever.
# k=4 asks for up to four chunks per query; the index built above holds only two
# entries, so at most two can come back.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [18]:
# Display the retriever's configuration (backing store and search kwargs).
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f66297ae510>, search_kwargs={'k': 4})

In [19]:
# Chat model that writes the final answer from the retrieved context.
# NOTE(review): temperature=0.2 is presumably kept low for more repeatable answers — confirm.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

In [20]:
# Prompt for transcript-grounded Q&A. {context} receives the retrieved transcript
# text and {question} the user's question; the instructions restrict the model to
# the transcript and fix the exact wording of the "not available" reply.
# NOTE(review): the recorded answer to the summary question ends with the
# "only mentions ... does not provide a full answer" sentence even though a summary
# was given — the partial-answer instruction may need tightening.
prompt = PromptTemplate(
    template="""
You are a precise and factual assistant. Your job is to answer questions strictly based on the video transcript provided.

Transcript:
{context}

User Question:
{question}

Instructions:
- Only use the transcript above to answer.
- If the transcript does not contain the answer, respond exactly with:
  "I'm sorry, the answer is not available in the video."
- If the transcript only partially answers, say:
  "The transcript only mentions XYZ but does not provide a full answer."
  (Replace XYZ with the relevant part mentioned in the transcript.)
- Do not add information from outside the transcript.
- Keep the response clear, concise, and directly relevant to the question.
""",
    input_variables=['context', 'question']
)



In [21]:
# First query: ask for a summary and fetch the transcript chunks most similar to it.
question1          = "What is this video about? Can you give me a brief summary of the transcript?"
retrieved_docs    = retriever.invoke(question1)

In [22]:
# Stitch the retrieved chunks into a single context string, one blank line between
# chunks, and display it for inspection.
context_text = "\n\n".join(document.page_content for document in retrieved_docs)
context_text

"- They're both hiding something. - The Devil\nhas come to Pennsylvania. - Jack and Janet Smurl\nclaim that an evil presence has found its way\ninto their home. - Eight people\nlive in this house. Eight people\nhave experienced this. Eight people are not crazy. - So, when did this start? - Uh, Heather's confirmation.\n- Seventh of September. - There's something\nin the attic. - Ed. There's an evil here. Something I've felt before. [haunting music] [thunder crashing] This thing in your house\nis a demon. It's the first one\nthat we've ever encountered. We were young. We were scared. We ran away. And after all these years... [haunting music] it wasn't done with our family. [tapping] - [gasps]\n- [tapping] - St. Michael, defend us. Be our safeguard\nagainst the Devil. In nomine Patris, et Filii, et Spiritus Sanctus. [screams] By the power\nof our Lord Jesus Christ! - [screaming]\n- [screaming] - I don't know\nhow to protect you this time. - [evil laughter]\n\nof our Lord Jesus Christ! - [

In [23]:
# Fill the template with the retrieved context and the first question.
final_prompt = prompt.invoke({"context": context_text, "question": question1})

In [24]:
# Send the filled-in prompt to the chat model and print only the text of its reply.
answer = llm.invoke(final_prompt)
print(answer.content)

The transcript mentions a couple, Jack and Janet Smurl, who claim that an evil presence, described as a demon, has invaded their home in Pennsylvania. It discusses the experiences of eight people living in the house, indicating that they are not crazy. The haunting began during Heather's confirmation on September 7th, and there are references to attempts to protect themselves through prayers and the invocation of St. Michael. The situation appears to be dire, with characters expressing fear and a sense of urgency for help. 

The transcript only mentions the experiences of Jack and Janet Smurl and their family's encounters with the evil presence but does not provide a full answer.


In [25]:
# Second query: a question unrelated to the video, used to exercise the prompt's
# "not available" reply. Note that retrieved_docs from the first question is overwritten.
question2        = "What is the RAG?"
retrieved_docs    = retriever.invoke(question2)

In [26]:
# Rebuild the context string from the chunks retrieved for the second question
# (blank line between chunks) and display it; this replaces the earlier context_text.
context_text = "\n\n".join(document.page_content for document in retrieved_docs)
context_text

"of our Lord Jesus Christ! - [screaming]\n- [screaming] - I don't know\nhow to protect you this time. - [evil laughter]\n- [whimpers] Lorraine! - Ed, get down here! - Lorraine!\n- We need your help! - Lorraine! [lullaby melody]\n\n- They're both hiding something. - The Devil\nhas come to Pennsylvania. - Jack and Janet Smurl\nclaim that an evil presence has found its way\ninto their home. - Eight people\nlive in this house. Eight people\nhave experienced this. Eight people are not crazy. - So, when did this start? - Uh, Heather's confirmation.\n- Seventh of September. - There's something\nin the attic. - Ed. There's an evil here. Something I've felt before. [haunting music] [thunder crashing] This thing in your house\nis a demon. It's the first one\nthat we've ever encountered. We were young. We were scared. We ran away. And after all these years... [haunting music] it wasn't done with our family. [tapping] - [gasps]\n- [tapping] - St. Michael, defend us. Be our safeguard\nagainst the D

In [27]:
# Fill the template with the new context and the second question.
final_prompt = prompt.invoke({"context": context_text, "question": question2})

In [28]:
# Ask the model the second question and print the text of its reply.
answer = llm.invoke(final_prompt)
print(answer.content)

I'm sorry, the answer is not available in the video.
