In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import random
from dotenv import load_dotenv
import google.generativeai as genai
from google.generativeai import GenerationConfig
import json
import os
import time

# created a schema/format to give to the llm ki aise format ki json me data return kar
import typing_extensions as typing

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def pre_processing(video_id):
    t=YouTubeTranscriptApi.get_transcript(video_id)

    timestamps_prefix_sum = []
    running_total  = 0
    for segment in t:
        sentence = segment["text"]
        curr_words = segment["text"].split()
        running_total += len(curr_words)
        # potential problem here.. the 2nd value of this tuple is sometimes exceeding the length of the vid (in seconds).
        # timestamps_prefix_sum.append((running_total, segment["start"] + segment["duration"], sentence))

        # best solution according to me: use segment["start"] instead of segment["start"] + segment["duration"]
        timestamps_prefix_sum.append((running_total, segment["start"], sentence))

    total_words = timestamps_prefix_sum[-1][0]
    return timestamps_prefix_sum, total_words

pre_processing("7mLWqY7lzSM")
def mod_binary_search(target, timestamps_prefix_sum):
    left = 0
    right = len(timestamps_prefix_sum) - 1
    while left < right:
        mid = (left + right) // 2
        if timestamps_prefix_sum[mid][0] < target:
            left = mid + 1
        else:
            right = mid
    return left 


def split_text_into_chunks(video_id):
    num_questions = random.randint(8, 10) # you can change the number of questions you want to generate
    print(f"Generating {num_questions} questions from the transcript...")
    prefix_sum, total_words = pre_processing(video_id)
    max_words = total_words // num_questions
    parsed = 0
    chunks = []
    while (parsed < total_words):
        if (parsed + 2 * max_words > total_words):
            parsed = total_words
        else:
            parsed = parsed + max_words
        target_index = mod_binary_search(parsed, prefix_sum)
        timestamp = prefix_sum[target_index][1]
        chunk = ' '.join(sentence for _, _, sentence in prefix_sum[:target_index])
        chunks.append((chunk, timestamp))
        prefix_sum = prefix_sum[target_index:]
    return chunks

In [3]:
class QA_data(typing.TypedDict):
    question: str
    answers: list[str]
    correct_answer: str


# Load environment variables, particularly GEMINI_KEY
load_dotenv()

# Configure Gemini API
genai.configure(api_key=os.getenv("GEMINI_KEY"))
config = GenerationConfig(temperature=0.9, response_mime_type="application/json", response_schema=QA_data)

In [4]:
def generate_questions_and_options(chunk):
    try:
        prompt = f'''Generate a question from the following text chunk:\n\n{chunk}\n\nProvide 4 options, with only 1 correct option.
                    Format the output in a dictionary like such'''
        
        # Call the Gemini API
        response = genai.GenerativeModel("gemini-1.5-flash",
                                         system_instruction="You are an expert question maker and quizzer and need to parse some transcript chunks to generate the best questions possible",
                                         generation_config=config)
        result = response.generate_content(prompt)
        
        # parsed the response object
        dict_to_return = json.loads(result.parts[0].text)
        return dict_to_return

    except Exception as e:
        print(f"Error during API request: {e}")
        return "Error generating question and options.", [], ""

In [11]:
import re
def extract_video_id(youtube_url):
    pattern = r'(?:https?://)?(?:www\.)?(?:youtube\.com/(?:v|embed|watch\?v=)|youtu\.be/)([\w-]{11})'
    match = re.search(pattern, youtube_url)
    if match:
        return match.group(1)
    else:
        print("Invalid YouTube URL or no video ID found.")
        return None
youtube_url="https://www.youtube.com/watch?v=ktP8QsPzKfs"
video_id=extract_video_id(youtube_url)

In [12]:
# Sample text transcript
# video_id = "eD16g9RRKtw"

# Split text into chunks

chunks = split_text_into_chunks(video_id)
questions_data = []

# Process each chunk
for i, (chunk, timestamp) in enumerate(chunks):
    print(f"Chunk {i + 1}:\n{chunk}\n")
    question_data = generate_questions_and_options(chunk)

    # wrote the index of the chunk as well before appending
    question_data["timestamp"] = timestamp
    questions_data.append(question_data)
    # break
# Save to JSON file
with open("questions_data.json", "w") as json_file:
    json.dump(questions_data, json_file, indent=4)

print("Collected data has been saved to questions_data.json")

Generating 9 questions from the transcript...


MemoryError: 

: 