In [1]:
import os
import wget
import gradio as gr

# To recognize src as a package
import sys
sys.path.append(os.path.abspath('../src'))
from RAG.system import RAGSystem

from hyperparameters import light
from dotenv import load_dotenv
from tqdm import tqdm

GPU detected: mps


In [2]:
# Load the variables defined in the local .env file before reading DATA_DIR.
load_dotenv()

# Root directory for everything this notebook reads and writes: it is handed to
# the RAG system and used as the parent of the video folders further down.
DATA_DIR = os.getenv("DATA_DIR")
# Truthiness check rather than `is not None`: an empty `DATA_DIR=` entry would
# otherwise pass and make every later os.path.join(DATA_DIR, ...) resolve
# relative to the current working directory.
assert DATA_DIR, "Define DATA_DIR in .env file!"

In [3]:
# Copy the imported `light` hyperparameters and override only the generator model.
custom_hyper = dict(
    light,
    generator_llm_model_name='Mistral 7B CapybaraHermes-2.5 Q4_K_M',
)

In [4]:
# Build the RAG system used by the rest of the notebook, configured with the
# customised hyperparameters and the data directory read from the environment.
rag = RAGSystem(
    hypers=custom_hyper,
    data_dir=DATA_DIR,
    random_seed=17,  # fixed seed value passed to the system on every run
)

Seed value set: 17
Collection name: lecture-videos-M4mtwJE_M8Ru4jtEgoGa-sx1hr11b3xtx4SYVtECaFM


<All keys matched successfully>


Initalized embedding model: nomic-ai/nomic-embed-text-v1
Initializing ASR and OCR cache...


In [5]:
# Download manifest for the 24 lectures of the MIT9.00SCF11 item on archive.org.
# Every entry follows the same pattern and differs only in the lecture number,
# so the list is generated instead of being written out by hand:
#   - 'url':       zero-padded lecture number inside the remote file name
#   - 'directory': sub-folder (below the videos directory) the file is stored in
#   - 'filename':  local file name ("NN - Lec N _ <course title>.mp4")
videos_to_download = [
    {
        'url': f'http://www.archive.org/download/MIT9.00SCF11/MIT9_00SCF11_lec{lecture_no:02d}_300k.mp4',
        'directory': 'MIT9.00SCF11',
        'filename': f'{lecture_no:02d} - Lec {lecture_no} _ MIT 9.00SC Introduction to Psychology, Spring 2011.mp4',
    }
    for lecture_no in range(1, 25)  # lectures 1..24 inclusive
]

In [6]:
# Name of the sub-directory of DATA_DIR in which the video folders are created
# (download step) and scanned (preprocessing step).
VIDEO_DIR_NAME = 'videos'

In [7]:
# Download every entry of `videos_to_download` that is not on disk yet.
for video in (pbar := tqdm(videos_to_download)):
    video_folder = os.path.join(DATA_DIR, VIDEO_DIR_NAME, video['directory'])
    # exist_ok=True replaces the separate exists() check: the folder is created
    # only when missing, with no gap between the check and the creation.
    os.makedirs(video_folder, exist_ok=True)

    # Build the destination path once; it is used for both the skip check and
    # the download target.
    target_path = os.path.join(video_folder, video['filename'])
    if not os.path.exists(target_path):
        pbar.set_description(f"Downloading {video['filename']} to {video_folder}")
        wget.download(video['url'], out=target_path)

100%|██████████| 24/24 [00:00<00:00, 26701.14it/s]


In [8]:
# Collect every .mp4 file located exactly one folder below DATA_DIR/VIDEO_DIR_NAME.
# Each entry records the absolute-or-relative path to the file ('video_path') and
# a folder-qualified name ("<folder>/<file>", 'unique_video_name').
video_root = os.path.join(DATA_DIR, VIDEO_DIR_NAME)

videos_to_preprocess = []
# os.listdir() returns entries in arbitrary order; sorting both levels keeps the
# resulting list (and therefore the preprocessing order) identical across runs.
for folder in sorted(os.listdir(video_root)):
    folder_path = os.path.join(video_root, folder)
    if not os.path.isdir(folder_path):
        # Skip stray files sitting directly in the videos directory.
        continue

    for video in sorted(os.listdir(folder_path)):
        if not video.endswith('.mp4'):
            continue

        videos_to_preprocess.append({
            'unique_video_name': os.path.join(folder, video),
            'video_path': os.path.join(folder_path, video),
        })

print(f"Videos to preprocess: {len(videos_to_preprocess)}\n")

Videos to preprocess: 84



In [9]:
# Run the RAG system's preprocessing on every collected video, showing the
# name of the video currently being handled in the progress bar.
pbar = tqdm(videos_to_preprocess)
for video in pbar:
    unique_name = video['unique_video_name']
    pbar.set_description(f"Preprocessing {unique_name}")
    rag.preprocess_video(unique_name, video['video_path'])

Preprocessing MIT9.00SCF11/21 - Lec 21 _ MIT 9.00SC Introduction to Psychology, Spring 2011.mp4: 100%|██████████| 84/84 [00:00<00:00, 241.56it/s]                       


In [10]:
def call_rag(question, force_answer=False, distance_threshold=0.5):
    """Stream the RAG system's answer to `question` as a growing string.

    The question is forwarded to `rag.ask` in streaming mode together with
    `force_answer` and `distance_threshold`. After every token received, the
    text accumulated so far is yielded, so a consumer always gets the full
    partial answer rather than the individual token.

    Args:
        question: The question text to ask.
        force_answer: Forwarded unchanged to `rag.ask`.
        distance_threshold: Forwarded unchanged to `rag.ask`.

    Yields:
        The concatenation of all tokens received up to that point.
    """
    token_stream = rag.ask(
        question=question,
        force_answer=force_answer,
        distance_threshold=distance_threshold,
        stream=True,
        context_detail_top_k=3,
    )

    answer_so_far = ""
    for piece in token_stream:
        answer_so_far = answer_so_far + piece
        yield answer_so_far

# Example rows offered in the UI. Each row is [question, force answer, distance
# threshold], in the same order as the input components of the interface.
example_rows = [
    ["Who was Freud?", False, 0.5],
    ["""Experiments have shown that which manipulations can enhance one's feelings or positive actions towards another person?
A) holding a warm drink relative to a cold drink just before evaluating another person
B) not thinking about money
C) rotating from table to table at a speed dating event
D) all of the above""", False, 0.5],
    ["""The number of items that can be held in short-term memory is typically conceptualized as:
A) 3 plus or minus 2
B) 5 plus or minus 2
C) 7 plus or minus 2
D) 9 plus or minus 2""", False, 0.5],
    ["What is thermodynamics?", False, 0.5],
    ["What is Ovomaltine?", False, 0.5],
]

# Web UI around `call_rag`: the three inputs map positionally onto its
# parameters, and the single text box shows the streamed answer.
qa_interface = gr.Interface(
    fn=call_rag,
    inputs=[
        gr.Textbox(lines=2, placeholder="Type your question here...", autofocus=True, label="Question"),
        gr.Checkbox(label="Force Answer"),
        gr.Number(value=0.5, label="Distance Threshold"),
    ],
    outputs=[
        gr.Textbox(lines=10, label="Answer"),
    ],
    title="Lecture Video Question Answering using Retrieval-Augmented Generation",
    allow_flagging='never',
    examples=example_rows,
)

# Serve locally only (no public share link) and do not embed the UI in the notebook.
qa_interface.launch(
    share=False,
    inline=False,
)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
# from qa_datasets import tuning_dataset
# import json

# psychology_course_questions = list(filter(lambda qa: qa['course'] ==
#                                           'MIT 9.00SC Introduction to Psychology, Fall 2011', tuning_dataset))

# print(f"Psychology course questions: {len(psychology_course_questions)}\n")
# question = psychology_course_questions[4]
# print(f"Sample question: {json.dumps(question, indent=2)}")

# for question in psychology_course_questions:
#     print(f"Question: {question['question']}\n")
#     print(f"Ground truth: {question['ground_truth']}\n")
#     for token in rag.ask(question['question'], stream=True):
#         print(token, end='', flush=True)
#     print("\n" + "-" * 100 + "\n")