# Transcript extraction

## Audio Transcript

### Video Segmentation

In [75]:
import cv2

def segment_video(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get frames per second
    segments = []
    last_frame = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))

        # Detect slide changes using frame difference or another method
        if current_frame % 100 == 0:  # Example condition to segment every 100 frames
            if last_frame != 0:
                start_time = last_frame / fps
                end_time = current_frame / fps
                segments.append((start_time, end_time))
            last_frame = current_frame

    cap.release()
    return segments

# Example usage
video_path = '/content/1.mp4'
segments = segment_video(video_path)
print(segments)


[(3.336666666666667, 6.673333333333334), (6.673333333333334, 10.01), (10.01, 13.346666666666668), (13.346666666666668, 16.683333333333334), (16.683333333333334, 20.02), (20.02, 23.356666666666666), (23.356666666666666, 26.693333333333335), (26.693333333333335, 30.03), (30.03, 33.36666666666667), (33.36666666666667, 36.70333333333333), (36.70333333333333, 40.04), (40.04, 43.376666666666665), (43.376666666666665, 46.71333333333333), (46.71333333333333, 50.050000000000004), (50.050000000000004, 53.38666666666667), (53.38666666666667, 56.723333333333336), (56.723333333333336, 60.06), (60.06, 63.39666666666667), (63.39666666666667, 66.73333333333333), (66.73333333333333, 70.07000000000001), (70.07000000000001, 73.40666666666667), (73.40666666666667, 76.74333333333334), (76.74333333333334, 80.08), (80.08, 83.41666666666667), (83.41666666666667, 86.75333333333333), (86.75333333333333, 90.09), (90.09, 93.42666666666666), (93.42666666666666, 96.76333333333334), (96.76333333333334, 100.100000000

### Using Speech Recognition

In [76]:
# !pip install SpeechRecognition

In [77]:
# import os
# import speech_recognition as sr
# import moviepy.editor as mp

# def extract_transcript(video_path, segments):
#     recognizer = sr.Recognizer()
#     transcript = ""

#     # Create directory for audio segments if it doesn't exist
#     audio_dir = "/content/audio_segment/"
#     os.makedirs(audio_dir, exist_ok=True)

#     for segment in segments:
#         if isinstance(segment, tuple) and len(segment) == 2:
#             # Extract audio from video segment
#             audio_path = os.path.join(audio_dir, f"audio_segment_{int(segment[0])}_{int(segment[1])}.wav")
#             video_clip = mp.VideoFileClip(video_path).subclip(segment[0], segment[1])
#             video_clip.audio.write_audiofile(audio_path)

#             # Recognize speech from audio
#             with sr.AudioFile(audio_path) as source:
#                 audio = recognizer.record(source)
#                 try:
#                     segment_text = recognizer.recognize_google(audio)
#                     transcript += segment_text + " "
#                 except sr.UnknownValueError:
#                     transcript += "[Unrecognized Audio] "
#                 except sr.RequestError:
#                     transcript += "[Error Fetching Transcript] "
#         else:
#             print(f"Invalid segment format: {segment}. Expected a tuple with (start_time, end_time).")

#     return transcript

# # Example usage
# # transcript = extract_transcript(video_path, segments)
# # print(transcript)


### Using Whisper

In [78]:
!pip install -U openai-whisper



### Audio extraction

In [79]:
from moviepy.editor import VideoFileClip

def extract_audio(input_file, output_file):
    try:
        video = VideoFileClip(input_file)
        audio = video.audio
        audio.write_audiofile(output_file, codec='libmp3lame')  # Specify the codec for MP3 format
        print("Audio extracted successfully!")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    # input_file = "/content/1.mp4"  # Change this to your input video file
    input_file = "/content/1.mp4"  # Change this to your input video file
    output_file = "/content/1.mp3"  # Change this to the desired output audio file

    extract_audio(input_file, output_file)


MoviePy - Writing audio in /content/1.mp3


                                                                       

MoviePy - Done.
Audio extracted successfully!




In [80]:
import whisper
import warnings
import math
warnings.simplefilter("ignore")

def create_timestamps(segments):
    timestamps = []
    for segment in segments:
        start_time = segment['start']
        end_time = segment['end']
        timestamps.append((start_time, end_time))
    return timestamps

def format_timestamp(seconds):
    minutes = math.floor(seconds / 60)
    seconds = math.floor(seconds % 60)
    return f"{minutes:02}:{seconds:02}"

model = whisper.load_model("tiny")
# Transcribe audio
result = model.transcribe(audio="/content/1.mp3", task='transcription')

# Check the keys in the result dictionary
print("Keys in result:", result.keys())

# Extract segments from the result
segments = result['segments']
print("Segments:", segments)

# Create timestamps from segments
timestamps = create_timestamps(segments)
print("Timestamps:", timestamps)

# Initialize a variable to hold the transcript
audio_transcript = ""

# Print transcribed text with formatted timestamps and save it into audio_transcript
for i, (segment, (start_time, end_time)) in enumerate(zip(segments, timestamps)):
    text = segment['text']
    start_formatted = format_timestamp(start_time)
    end_formatted = format_timestamp(end_time)
    audio_transcript += f"Timestamps ({start_formatted} - {end_formatted}): {text}\n"

# Print the complete audio transcript
print("Audio Transcript:\n", audio_transcript)


Keys in result: dict_keys(['text', 'segments', 'language'])
Segments: [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 4.0, 'text': ' Hello everybody, this is the Racing Rashput welcome to Magnet Bains.', 'tokens': [50364, 2425, 2201, 11, 341, 307, 264, 38832, 46298, 2582, 2928, 281, 19664, 302, 363, 2315, 13, 50564], 'temperature': 0.0, 'avg_logprob': -0.6969391419006897, 'compression_ratio': 1.8284518828451883, 'no_speech_prob': 0.14133398234844208}, {'id': 1, 'seek': 0, 'start': 4.0, 'end': 7.0, 'text': ' This video is going to start from a new chapter.', 'tokens': [50564, 639, 960, 307, 516, 281, 722, 490, 257, 777, 7187, 13, 50714], 'temperature': 0.0, 'avg_logprob': -0.6969391419006897, 'compression_ratio': 1.8284518828451883, 'no_speech_prob': 0.14133398234844208}, {'id': 2, 'seek': 0, 'start': 7.0, 'end': 12.0, 'text': ' Class 8 history chapter, chapter 1, ruling the countryside.', 'tokens': [50714, 9471, 1649, 2503, 7187, 11, 7187, 502, 11, 21437, 264, 28252, 13, 50964], 'temperatur

## Transcript extracted from Image Capturing

### Extracting Text from Image

In [81]:
import cv2

def extract_frames(video_path, start_time, frame_interval=30):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frames = []
    frame_count = 0

    # Total frames to skip for 30-second intervals
    frame_skip = frame_interval * fps

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Start processing frames after the specified start time (in seconds)
        if frame_count >= start_time * fps and frame_count % frame_skip == 0:
            frames.append(frame)

        frame_count += 1

    cap.release()
    return frames


In [82]:
!pip install ultralytics



In [83]:
from ultralytics import YOLO

def detect_teacher_in_frame(model, frame):
    results = model(frame)
    for r in results:
        for box in r.boxes:
            label = box.cls[0]
            if label == 'person':  # Assuming 'person' class is indexed as the teacher
                return box.xyxy[0].tolist()  # Return bounding box of the teacher
    return None


In [84]:
def select_keyframes(frames, model):
    """
    Selects keyframes for each slide, picking frames where the teacher covers the least amount of text.

    Args:
    - frames: A list of frames extracted from the video.
    - model: YOLO model used for teacher detection.

    Returns:
    - keyframes: A list of selected keyframes, one for each detected slide.
    """
    keyframes = []
    current_slide_frames = []
    min_teacher_area = float('inf')
    best_frame = None

    for frame in frames:
        teacher_box = detect_teacher_in_frame(model, frame)

        if teacher_box:
            # Calculate teacher area in the frame
            x1, y1, x2, y2 = teacher_box
            area = (x2 - x1) * (y2 - y1)

            if area < min_teacher_area:
                min_teacher_area = area
                best_frame = frame

            # If the frame belongs to the same slide, accumulate it
            current_slide_frames.append(frame)
        else:
            # If no teacher detected, assume a slide change
            if best_frame:
                keyframes.append(best_frame)  # Save the best frame for the current slide
            else:
                keyframes.append(frame)  # Save the frame if no teacher detected

            # Reset for the next slide
            current_slide_frames = []
            min_teacher_area = float('inf')
            best_frame = None

    # Add the last slide's keyframe if not already added
    if best_frame:
        keyframes.append(best_frame)

    return keyframes


In [85]:
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [86]:
!pip install pytesseract



In [87]:
import re
import numpy as np
import pytesseract

def clean_ocr_text(text):
    """
    Cleans the OCR text by removing unwanted characters, extra spaces,
    random numbers, and correcting any known OCR errors.
    """
    # Remove any standalone numbers, special characters, and unwanted symbols
    text = re.sub(r'\b\d+\b', '', text)  # Remove isolated numbers
    text = re.sub(r'[^\w\s.,!?-]', '', text)  # Keep only words, spaces, and basic punctuation
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces or newlines with a single space
    text = text.strip()  # Remove leading/trailing spaces

    return text

def format_text_with_bullets(text):
    """
    Formats the cleaned text by splitting it into sentences and adding bullet points,
    with multiple line spaces between each sentence.
    """
    # Clean the text before formatting
    text = clean_ocr_text(text)

    # Split text into sentences using punctuation as delimiters
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Add bullet points and separate sentences with three line breaks
    formatted_text = "\n\n\n".join([f"- {sentence.strip()}" for sentence in sentences if sentence.strip()])

    return formatted_text

def extract_text_from_frame(frame):
    """
    Extracts text from the given frame using pytesseract OCR, then cleans
    and formats it by adding bullet points and separating sentences.
    """

    if frame is None or not isinstance(frame, np.ndarray):
        print("Error: Invalid frame.")
        return ""

    # Convert the frame to grayscale for better OCR performance
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Apply OCR to extract text from the frame
    text = pytesseract.image_to_string(gray)

    # Format the OCR text by adding bullet points and multiple line spaces
    formatted_text = format_text_with_bullets(text)

    return formatted_text


In [88]:
def process_video(video_path, start_time=30, frame_interval=5):
    # Load the YOLO model
    model = YOLO("yolov8n.pt")

    # Step 1: Extract frames after 30 seconds
    frames = extract_frames(video_path, start_time, frame_interval)

    # Step 2: Detect and select keyframes with the least obstruction using the YOLO model
    keyframes = select_keyframes(frames, model)

    # Step 3: Extract and combine text from multiple keyframes using OCR
    extracted_texts = []

    for keyframe in keyframes:
        if keyframe is not None:
            text = extract_text_from_frame(keyframe)
            if text:
                extracted_texts.append(text)

    # Combine all the extracted texts
    combined_text = "\n\n\n".join(extracted_texts)

    return combined_text

# Example Usage:
video_path = "/content/1.mp4"
# video_path = "/content/2.mp4"
extracted_text = process_video(video_path)
print(extracted_text)



0: 384x640 5 persons, 418.2ms
Speed: 2.2ms preprocess, 418.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 209.4ms
Speed: 2.0ms preprocess, 209.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 241.1ms
Speed: 2.0ms preprocess, 241.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 remote, 213.0ms
Speed: 1.9ms preprocess, 213.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 290.4ms
Speed: 3.0ms preprocess, 290.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 217.2ms
Speed: 3.9ms preprocess, 217.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 250.5ms
Speed: 9.0ms preprocess, 250.5ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 222.9ms
Speed: 2.5ms preprocess, 222.9ms inference, 1.6ms postproces

In [89]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    # Clean text of extra spaces
    text = " ".join(text.split())  # Normalize spaces
    return text

def extract_key_points(text):
    # Apply NLP processing
    doc = nlp(text)

    key_points = []

    # Loop through sentences
    for sent in doc.sents:
        # Check for entities or key words (nouns/verbs)
        entities = [ent.text for ent in sent.ents]
        key_words = [token.lemma_ for token in sent if token.pos_ in ['NOUN', 'VERB', 'PROPN']]

        # If the sentence contains entities or key words, consider it important
        if entities or key_words:
            key_points.append(sent.text.strip())

    return key_points

def remove_duplicate_sentences(sentences, threshold=0.8):
    # Vectorize the sentences
    vectorizer = TfidfVectorizer().fit_transform(sentences)
    vectors = vectorizer.toarray()

    # Calculate similarity between each pair of sentences
    similarity_matrix = cosine_similarity(vectors)

    # List to store unique sentences
    unique_sentences = []

    # Track whether a sentence is too similar to an earlier one
    for i, sentence in enumerate(sentences):
        if not any(similarity_matrix[i][j] > threshold for j in range(i)):
            unique_sentences.append(sentence)

    return unique_sentences

def format_output(points):
    # Format the extracted points as bullet points
    formatted_points = [f"- {point}" for point in points]
    return "\n".join(formatted_points)

# Example raw text
raw_text = extracted_text

# Process the text
cleaned_text = clean_text(raw_text)
key_points = extract_key_points(cleaned_text)
unique_points = remove_duplicate_sentences(key_points, threshold=0.8)
formatted_output = format_output(unique_points)

print(formatted_output)


- - PUN Topas eeel iil Introduction Relate the story with previous chapter.
- - Battle of Plassey, Bauxar Connect with ruling the countryside - oN fORY - CHAPTER - RULING THE COUNTRYSIDE Introduction Relate the story with previous chapter.
- - Battle of Plassey, Bauxar Connect with ruling the cou pal ruler in - oN fORY - CHAPTER - RULING
- THE COUNTRYSIDE Introduction Relate the story with previous chapter.
- - Battle of Plassey, Bauxar Jo the Mughal ruler in - PUR cpio lls Introduction Relate the story with previous chapter.- Battle of Plassey, Bauxar Connect with ruling t sa from .
- - ruler in , Robert Clive accepting the Di - oN fORY - CHAPTER - RULING THE COUNTRYSIDE Introduction Relate the story with previous chapter.
- - Battle of Plassey, Bauxar Connect with intryside Robert Clive accept Bn the Mughal ulin - PUR cutee ils Introduction Relate the story with previous chapter.
- - Battle of Plassey, Bauxar Connect with ruling th Robert Clive accepting the Diwani of Ey ye Mughal ru

In [90]:
!pip install transformers sentence-transformers langchain langchain-community langchain-openai faiss-cpu



In [91]:
import os

from langchain_openai import ChatOpenAI # pass the model to use
from langchain.document_loaders import UnstructuredFileLoader
from langchain_community.vectorstores import FAISS # perform similarity search on vector embeddings
from langchain.embeddings import HuggingFaceEmbeddings # convert text into vector embeddings
from langchain.text_splitter import CharacterTextSplitter # split text or doc in to smaller chunks
from langchain.chains import RetrievalQA # retrieveing information from the embeddings

In [92]:
os.environ["OPENAI_API_KEY"]=""

In [93]:
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [94]:
!pip install --upgrade nltk



In [107]:
# Assuming the input text is stored in a variable called `input_text`

# Input text (replace this with your desired string)
input_text = formatted_output

# Split the text into chunks
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=20)
documents = [Document(page_content=input_text)]  # Using Document object to simulate document structure
texts = text_splitter.split_documents(documents)

# Loading vector embedding model
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings()
kb = FAISS.from_documents(texts, embeddings)

# Initialize OpenAI LLM for context-based processing
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Function to use LLM to correct spellings, remove duplicates, and format output
def process_text_with_llm(input_text):
    prompt = (
        "You are given the following text extracted from an image. "
        "Please correct any spelling mistakes, remove duplicate points, and provide a neat and concise formatted version of the text:\n\n"
        f"{input_text}\n\n"
        "Return the corrected and formatted text."
    )
    response = llm([HumanMessage(content=prompt)])
    return response.content

# Process the input text
formatted_result = process_text_with_llm(input_text)
print(formatted_result)


- Introduction: Relate the story with previous chapter.
- Battle of Plassey, Bauxar: Connect with ruling the countryside
- Chapter: Ruling the Countryside
- Battle of Plassey, Bauxar: Connect with ruling the local ruler
- Battle of Plassey, Bauxar: Connect with ruling the Mughal ruler
- Robert Clive accepting the Diwani of the Mughal ruler
- Robert Clive accepting the Diwani of Bengal
- Robert Clive accepting the Diwani of Bengal, Bihar and Orissa from the Mughal ruler
- What we are going to study in this chapter?
- The company becomes a Diwan
- Revenue for the company
- Agriculture
- Problems Solutions: Mahalwari and Ryotwari system
- Crops for Europe: Indigo
- The Blue Rebellion and after
- The company becomes a Diwan
- Revenue for the company
- Problems Solutions: Mahalwari and Ryotwari system
- Crops for Europe: Indigo
- The Blue Rebellion and after
- The company becomes a Diwan
- Revenue for the company
- Problems Solutions: Mahalwari and Ryotwari system
- Crops for Europe: Indigo

In [108]:
def remove_duplicates(input_text):
    lines = input_text.split('\n')
    unique_lines = []
    seen = set()

    for line in lines:
        line_cleaned = line.strip()
        if line_cleaned and line_cleaned not in seen:
            unique_lines.append(line)
            seen.add(line_cleaned)

    return '\n'.join(unique_lines)

# Removing duplicates
formatted_result = remove_duplicates(formatted_result)

# Output the unique text
print("Unique Text:\n", formatted_result)

Unique Text:
 - Introduction: Relate the story with previous chapter.
- Battle of Plassey, Bauxar: Connect with ruling the countryside
- Chapter: Ruling the Countryside
- Battle of Plassey, Bauxar: Connect with ruling the local ruler
- Battle of Plassey, Bauxar: Connect with ruling the Mughal ruler
- Robert Clive accepting the Diwani of the Mughal ruler
- Robert Clive accepting the Diwani of Bengal
- Robert Clive accepting the Diwani of Bengal, Bihar and Orissa from the Mughal ruler
- What we are going to study in this chapter?
- The company becomes a Diwan
- Revenue for the company
- Agriculture
- Problems Solutions: Mahalwari and Ryotwari system
- Crops for Europe: Indigo
- The Blue Rebellion and after
- The Blue


## Combining Audio Transcript and Image Transcript

In [96]:
# Function to clean and combine transcripts from audio and images
def combine_transcripts(audio_transcript, image_transcript):
    # Cleaning up transcripts by removing extra spaces and line breaks
    audio_transcript_clean = re.sub(r'\s+', ' ', audio_transcript).strip()
    image_transcript_clean = re.sub(r'\s+', ' ', image_transcript).strip()

    # Combining transcripts
    combined_transcript = f"Audio Transcript: {audio_transcript_clean}\n\nImage Transcript: {image_transcript_clean}"
    return combined_transcript

# Example usage with sample transcripts
# audio_transcript = "This is an example audio transcript extracted using Whisper."
image_transcript = formatted_result
combined_transcript = combine_transcripts(audio_transcript, image_transcript)

In [127]:
print(combined_transcript)

Audio Transcript: Timestamps (00:00 - 00:04): Hello everybody, this is the Racing Rashput welcome to Magnet Bains. Timestamps (00:04 - 00:07): This video is going to start from a new chapter. Timestamps (00:07 - 00:12): Class 8 history chapter, chapter 1, ruling the countryside. Timestamps (00:12 - 00:14): The interesting chapter is going to be, Timestamps (00:14 - 00:18): now we have to connect the class 8 history in the class 8 history. Timestamps (00:18 - 00:22): We have seen a basic outline of history in our history of modern history. Timestamps (00:22 - 00:26): When the British are here, but the modern one is not seen in the modern world, Timestamps (00:26 - 00:30): this is theings to say the Christian history well. Timestamps (00:30 - 00:33): Especially, until British has hadís experience, Timestamps (00:33 - 00:37): but from that straight to the history, Timestamps (00:37 - 00:41): the grass from dirt and waste have been settled in a Triccup territory. Timestamps (00:41 - 00:45)

# Summarisation/RAG Implementation

In [97]:
!pip install transformers sentence-transformers langchain langchain-community langchain-openai faiss-cpu unstructured unstructured[pdf]



In [98]:
import os
import re
import json
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import ChatOpenAI

In [99]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [100]:
# Splitting the combined transcript into smaller chunks for RAG
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_text(combined_transcript)



In [101]:
# Creating a FAISS vector store for retrieval
embeddings = HuggingFaceEmbeddings()
vector_store = FAISS.from_texts(documents, embeddings)

In [102]:
# Setting up the RAG model
retriever = vector_store.as_retriever()
rag_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [103]:
# Generating a summary of the video
query = "Provide a summary with appropriate headers and detailed content under each header."
response = rag_chain.invoke({'query': query})['result']

# Output the summary
print("Summary:\n", response)

Summary:
 **Chapter Summary: Ruling the Countryside**

**Introduction and Historical Context:**
The chapter delves into the British East India Company's control and administration of Bengal, Bihar, and Odisha after the Battle of Plassey and Buxar. This period marked a significant shift in power dynamics.

**The Company Becomes a Diwan:**
In 1765, Robert Clive accepted the Diwani rights from the Mughal ruler, allowing the British East India Company to collect revenue in the three provinces. This move consolidated the Company's authority and financial power.

**Revenue Generation for the Company:**
The Company's primary focus was the collection of revenue, which was largely extracted from the agricultural sector. This revenue stream was crucial for the Company's operations and growth in India.

**Problems and Solutions: Mahalwari and Ryotwari System:**
The Company faced challenges in revenue collection, leading to the implementation of the Mahalwari and Ryotwari systems. These systems ai

## Summary with Timestamp

In [133]:
import re
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import ChatOpenAI

# Extracting timestamps and content from combined_transcript
lines = combined_transcript.splitlines()
print("Transcript Lines:", lines)  # Debugging output to inspect transcript lines
documents_with_timestamps = []

Transcript Lines: ['Audio Transcript: Timestamps (00:00 - 00:04): Hello everybody, this is the Racing Rashput welcome to Magnet Bains. Timestamps (00:04 - 00:07): This video is going to start from a new chapter. Timestamps (00:07 - 00:12): Class 8 history chapter, chapter 1, ruling the countryside. Timestamps (00:12 - 00:14): The interesting chapter is going to be, Timestamps (00:14 - 00:18): now we have to connect the class 8 history in the class 8 history. Timestamps (00:18 - 00:22): We have seen a basic outline of history in our history of modern history. Timestamps (00:22 - 00:26): When the British are here, but the modern one is not seen in the modern world, Timestamps (00:26 - 00:30): this is theings to say the Christian history well. Timestamps (00:30 - 00:33): Especially, until British has hadís experience, Timestamps (00:33 - 00:37): but from that straight to the history, Timestamps (00:37 - 00:41): the grass from dirt and waste have been settled in a Triccup territory. Timest

In [136]:
import re

# Assuming 'lines' contains all the transcript lines including the image transcripts
documents_with_timestamps = []

for line in lines:
    # Split each line by the "Timestamps" keyword to handle multiple timestamps per line
    segments = re.split(r"(?=Timestamps \(\d{2}:\d{2} - \d{2}:\d{2}\):)", line)
    for segment in segments:
        # Handle audio transcripts with timestamps
        match = re.match(r"Timestamps\s*\((\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})\):\s*(.*)", segment)
        if match:
            start_time, end_time, content = match.groups()
            content_with_timestamp = f"({start_time} - {end_time}): {content}"
            documents_with_timestamps.append(content_with_timestamp)
        # Handle image transcripts
        elif "Image Transcript" in segment:
            # Add the entire segment for image transcripts
            image_content = segment.replace("Image Transcript: ", "").strip()
            if image_content:
                documents_with_timestamps.append(image_content)
        else:
            # Debugging output for segments that don't match any expected format
            if segment.strip():
                print(f"Segment not matching: {segment}")

# Combine documents with timestamps into a single string
combined_content = "\n".join(documents_with_timestamps)

# Check if combined content is empty
if not combined_content.strip():
    raise ValueError("The combined transcript is empty or contains only whitespace.")
else:
    print("Combined Transcript Content:\n", combined_content)


Segment not matching: Audio Transcript: 
Combined Transcript Content:
 (00:00 - 00:04): Hello everybody, this is the Racing Rashput welcome to Magnet Bains. 
(00:04 - 00:07): This video is going to start from a new chapter. 
(00:07 - 00:12): Class 8 history chapter, chapter 1, ruling the countryside. 
(00:12 - 00:14): The interesting chapter is going to be, 
(00:14 - 00:18): now we have to connect the class 8 history in the class 8 history. 
(00:18 - 00:22): We have seen a basic outline of history in our history of modern history. 
(00:22 - 00:26): When the British are here, but the modern one is not seen in the modern world, 
(00:26 - 00:30): this is theings to say the Christian history well. 
(00:30 - 00:33): Especially, until British has hadís experience, 
(00:33 - 00:37): but from that straight to the history, 
(00:37 - 00:41): the grass from dirt and waste have been settled in a Triccup territory. 
(00:41 - 00:45): Second a wider space and a higher steel space requires less Scott.

In [135]:
# Combine documents with timestamps into a single string
combined_content = "\n".join(documents_with_timestamps)

# Check if combined content is empty
if not combined_content.strip():
    raise ValueError("The combined transcript is empty or contains only whitespace.")
else:
    print("Combined Transcript Content:\n", combined_content)

Combined Transcript Content:
 (00:00 - 00:04): Hello everybody, this is the Racing Rashput welcome to Magnet Bains. 
(00:04 - 00:07): This video is going to start from a new chapter. 
(00:07 - 00:12): Class 8 history chapter, chapter 1, ruling the countryside. 
(00:12 - 00:14): The interesting chapter is going to be, 
(00:14 - 00:18): now we have to connect the class 8 history in the class 8 history. 
(00:18 - 00:22): We have seen a basic outline of history in our history of modern history. 
(00:22 - 00:26): When the British are here, but the modern one is not seen in the modern world, 
(00:26 - 00:30): this is theings to say the Christian history well. 
(00:30 - 00:33): Especially, until British has hadís experience, 
(00:33 - 00:37): but from that straight to the history, 
(00:37 - 00:41): the grass from dirt and waste have been settled in a Triccup territory. 
(00:41 - 00:45): Second a wider space and a higher steel space requires less Scott. 
(00:45 - 00:47): Second, this process b

In [137]:
# Split documents into chunks for RAG
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Adjusted chunk size and overlap
documents = text_splitter.split_text(combined_content)

# Check if documents list is empty
if len(documents) == 0:
    raise ValueError("Documents list is empty, cannot proceed with embedding.")
else:
    print("Documents After Splitting:", documents)

Documents After Splitting: ['(00:00 - 00:04): Hello everybody, this is the Racing Rashput welcome to Magnet Bains. \n(00:04 - 00:07): This video is going to start from a new chapter. \n(00:07 - 00:12): Class 8 history chapter, chapter 1, ruling the countryside. \n(00:12 - 00:14): The interesting chapter is going to be, \n(00:14 - 00:18): now we have to connect the class 8 history in the class 8 history. \n(00:18 - 00:22): We have seen a basic outline of history in our history of modern history. \n(00:22 - 00:26): When the British are here, but the modern one is not seen in the modern world, \n(00:26 - 00:30): this is theings to say the Christian history well. \n(00:30 - 00:33): Especially, until British has hadís experience, \n(00:33 - 00:37): but from that straight to the history, \n(00:37 - 00:41): the grass from dirt and waste have been settled in a Triccup territory. \n(00:41 - 00:45): Second a wider space and a higher steel space requires less Scott. \n(00:45 - 00:47): Second, thi

In [138]:
# Creating FAISS vector store for retrieval
embeddings = HuggingFaceEmbeddings()

# Generate embeddings for the documents
vector_store = FAISS.from_texts(documents, embeddings)


In [139]:
# Setting up the RAG model
retriever = vector_store.as_retriever()
rag_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [140]:
# Generating a summary with timestamps embedded
query = "Provide a summary of the content, including the relevant timestamps where each key concept is descriptievely mentioned."
response = rag_chain.invoke({'query': query})['result']

# Output the summary with timestamps
print("Summary with Timestamps:\n", response)

Summary with Timestamps:
 The video discusses the history chapter "Ruling the Countryside" from Class 8. Here is a summary of the content with relevant timestamps:

1. **Introduction to the Chapter**:
   - The video starts with an introduction to the new chapter in Class 8 history, "Ruling the Countryside" (00:07 - 00:12).
   - It connects the previous chapter with the current one (01:36 - 01:38).

2. **Battle of Plassey and Buxar**:
   - The video mentions the Battle of Plassey and Buxar, connecting it with ruling the countryside (02:21 - 02:23).
   - It talks about Robert Clive accepting the Diwani of Bengal, Bihar, and Orissa from the Mughal ruler (03:36 - 03:43).

3. **Company Becomes a Diwan**:
   - The video discusses how the company becomes a Diwan and the revenue for the company (05:18 - 05:20).

4. **Problems and Solutions**:
   - It covers the problems and solutions related to the Mahalwari and Ryotwari systems (06:02 - 06:06).

5. **Crops for Europe**:
   - The video mention

# Translation

In [104]:
from transformers import MarianMTModel, MarianTokenizer
import re

# Function to perform translation using MarianMT
def translate_text(text, src_lang, tgt_lang):
    # Define the model name based on source and target languages
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'

    # Load tokenizer and model
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Split text into individual lines for preserving formatting
    lines = text.split('\n')
    translated_lines = []

    for line in lines:
        # Translate non-empty lines
        if line.strip():
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True)
            translated = model.generate(**inputs)
            translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
            translated_lines.append(translated_text)
        else:
            translated_lines.append('')

    # Combine translated lines to preserve formatting
    translated_text = '\n'.join(translated_lines)
    return translated_text

In [105]:
# Example usage with summary text
summary_text = response

src_lang = "en"  # Source language (English)
tgt_lang = "hi"  # Target language (Hindi)

# Translate the summary
translated_summary = translate_text(summary_text, src_lang, tgt_lang)

# Output the translated summary
print("Translated Summary:\n", translated_summary)


Translated Summary:
 ** सारांश: देश की ओर ले जा रहा है

** शब्दकोश तथा ऐतिहासिक संदर्भ:
यह अध्याय ब्रिटिश पूर्व इंडिया कंपनी के नियंत्रण और बैंगनी, ब्के, और ब्वना के युद्ध के बाद है। इस अवधि में विशाल शक्ति में एक महत्वपूर्ण बदलाव चिह्नित किया गया।

** कंपनी एक डिकमन बन जाती है:
सन्‌ 1765 में, रॉबर्ट सी.

कंपनी के लिए ** स्वचालित बनाने वाला:
कंपनी का प्राथमिक फोकस राजस्व का संग्रह था, जो मुख्यतः कृषि क्षेत्र से निकाला गया था। इस राजस्व धारा कंपनी के संचालन और विकास के लिए महत्वपूर्ण था।

** ट्रामम तथा समाधान: महली तथा रयोटाई सिस्टम:
कंपनी ने राज - घर के निर्माण काम में आनेवाली चुनौतियों का सामना किया, जो कि महायुद्धी और रयोटिक व्यवस्थाओं के कार्यान्वयन तक चलती हैं ।

** यूरोप के लिए स्विच: इंडीगो:
इस क्षेत्र में विकसित होनेवाली एक प्रमुख फसल थी, जो यूरोप में उच्च माँग में थी ।

** ब्लू रीबेशन तथा इसके बाद: वैकल्पिक
किसानों और किसानों का शोषण, और अत्याचार राजस्व संग्रह अभ्यासों के साथ, ब्लू रिबेशन की ओर ले गया. इस विद्रोह ने कंपनी के शासन के खिलाफ निराशा और विरोध को विशिष्ट किया.

** गण