In [None]:
#videos obtained using yt-dlp (https://github.com/yt-dlp/yt-dlp#output-template-examples) in Ubuntu cmd line:
# Download YouTube playlist videos in separate directory indexed by video order in a playlist
#yt-dlp -o "%PhysicsVideos/%(playlist_index)s%(ext)s" "https://www.youtube.com/playlist?list=PL6i60qoDQhQGaGbbg-4aSwXJvxOqO6o5e"

In [3]:
import whisper
import os
import numpy as np
import itertools
import json
from pathlib import Path
from dotenv import load_dotenv
import pinecone
from collections import defaultdict
from sentence_transformers import SentenceTransformer

load_dotenv()

  def backtrace(trace: np.ndarray):
  from tqdm.autonotebook import tqdm


True

In [17]:
video_meta=defaultdict(dict)
with open("video_meta.txt","r") as f:
    for line in f:
        data = line.strip().split("\\")
        video_meta[data[0]] = {"title":data[1],"url":data[2]}

defaultdict(<class 'dict'>, {'001': {'title': 'seperLecture 1 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seperpyX8kQ-JzHI'}, '002': {'title': 'seperLecture 2 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seperh96SW0PfQcg'}, '003': {'title': 'seperLecture 3 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seper3YARPNZrcIY'}, '004': {'title': 'seperLecture 4 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seperFZDy_Dccv4s'}, '005': {'title': 'seperLecture 5 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seperZpRpI0D7P7Y'}, '006': {'title': 'seperLecture 6 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seper14Yhzbn96Bc'}, '007': {'title': 'seperLecture 7 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'seper0mcxUD53rrM'}, '008': {'title': 'seperLecture 8 ｜ Modern Physics： Classical Mechanics (Stanford)', 'url': 'sepergUUbl444r74'}, '009': {'title': 'seperLecture 9 ｜ Modern Physics： Classical Mechanics (Sta

In [2]:
#Change path to videos directory
videoFolder = Path(os.path.dirname(os.path.abspath('__file__'))).parent.parent / "PhysicsVideos"
model = whisper.load_model("small")

100%|███████████████████████████████████████| 461M/461M [00:41<00:00, 11.7MiB/s]


In [3]:
result = model.transcribe(whisper.load_audio(videoFolder / "001.webm"))

In [5]:
#write to json file as initial save
filename = "whisper_output.json"

with open(filename, 'w') as f:
    json.dump(result, f, indent=4)

In [6]:
segments = result["segments"]
segInfo = [{"start": int(segment['start']),"end": int(segment['end']), "text": segment['text'].split()} for segment in segments]
#remove headers
segInfo = segInfo[2:-2]

#interpolate start-end time per word
for idx,dict in enumerate(segInfo):
    segInfo[idx]["timeStamp"] = list(np.linspace(dict["start"],dict["end"],len(dict["text"]),dtype=int))

#convert to single list of words and timestamps
segInfo = list(map(lambda x:[x["text"],x["timeStamp"]],segInfo))
segInfo = list(zip(*segInfo))
segInfo = list(map(lambda x:list(itertools.chain.from_iterable(x)),segInfo))


In [7]:
#transfrom single list of words into token inputs for sentence transformer
SENTENCE_TRANSFORMER_INPUT_TOKEN_LENGTH=128

def endIdx(startIdx:int) -> int:
    return min(startIdx + SENTENCE_TRANSFORMER_INPUT_TOKEN_LENGTH,len(segInfo[0]))

def wordsToToken(idx:int) -> str:
    return " ".join(segInfo[0][idx:endIdx(idx)])

tokenInput = [{"text": wordsToToken(i), "start": segInfo[1][i], "end": segInfo[1][endIdx(i)-1]} for i in range(0, len(segInfo[0]), SENTENCE_TRANSFORMER_INPUT_TOKEN_LENGTH)]


[{'text': "Classical mechanics is the basis for all of physics. It's the basis of all of physics, not only because it describes the motion of objects like particles and mechanical systems and so forth, but because the basic framework, the basic structure of all of physics is based on the principles of classical mechanics. The conservation of energy, the conservation of momentum, the principles by which all systems evolve in nature is the same set of rules, essentially exactly the same set of rules in a more abstract and a more general setting than the rules which govern how a simple particle moves, for example, under the influence of gravity. But in order to understand it, we have to understand the principles in a fairly general context. Let's begin", 'start': 13, 'end': 70}, {'text': "with the very, very simplest kinds of systems that we can think of. Systems that are so simple that, in fact, they're simpler than any real systems in nature. Laws of nature, let's imagine laws of nature

In [7]:
#load in sentence transformer (transforms video text transcripts to vector embeddings)
retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')
embed_dim = retriever.get_sentence_embedding_dimension()

In [4]:
#requires a pinecone account (https://www.pinecone.io/) and API key to run (e.g. create .env file in root directory with KEY and ENV variables)
pinecone.init(api_key=os.getenv("KEY"),
                environment=os.getenv("ENV"))

#create the index (only run once)
# pinecone.create_index(
#     "youtube-search",
#     dimension=embed_dim,
#     metric="cosine"
# )

#connect to the index
index = pinecone.Index("youtube-search")

In [23]:
BATCH_SIZE = 64
vid_id="001"
Total_package,filename = [],"pinecone_upserts.json"

#process single list of words and timestamps into batches of token inputs
for i in range(0,len(tokenInput),BATCH_SIZE):
    batch = tokenInput[i:min(i+BATCH_SIZE,len(tokenInput))]

    ids = list(map(lambda x:str(vid_id)+"-"+str(i)+"-"+str(x["start"]), batch))
    #get embeddings using the sentence transformer
    embeddings = retriever.encode(list(map(lambda x:x["text"],batch))).tolist()
    metadata = list(map(lambda x:{"text":x["text"],"title":video_meta[vid_id]["title"],"url":video_meta[vid_id]["url"],"start":int(x["start"]),"end":int(x["end"])},batch))

    package = list(zip(ids,embeddings,metadata))
    package = list(map(lambda x:{'id':x[0],'values':x[1],'metadata':x[2]},package))

    #insert embeddings together with metadata into pinecone index
    index.upsert(vectors=package)
    Total_package.append(package)

with open(filename, 'w') as f:
    json.dump(Total_package, f, indent=4)

index.describe_index_stats()
    

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 54}},
 'total_vector_count': 54}

In [18]:
query = "Who discovered the Higgs Boson?"

xq = retriever.encode(query).tolist()

In [19]:
xc = index.query(xq, top_k=5,
                 include_metadata=True)
for context in xc['matches']:
    print(context['metadata'], end="\n---\n")

{'end': 3643.0, 'start': 3602.0, 'text': "them in the laboratory, we want to read the diagram from right to left, and we want to say this is a process whereby a pair of electrons can come together and make a higgs boson. We've been colliding electrons and positrons for a long, long time, almost as long as I've been a physicist, not quite. We've been colliding electrons and positrons together, and nobody was ever able to discover the higgs. Now, one reason in the early days is it turns out that the higgs is a fairly heavy particle. I will tell you what its mass is, but it's a fairly heavy particle. And unless you have enough energy, you don't have enough energy to make the higgs boson. But there's a more important reason.", 'title': 'Demystifying the Higgs Boson with Leonard Susskind', 'url': 'JqNg819PiZY'}
---
{'end': 3809.0, 'start': 3765.0, 'text': "the mass of a proton, basically, which is heavy. Top and anti-top. Top quarks and anti-quark. So you say, well, look, now it's easy to m

In [5]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 14124}},
 'total_vector_count': 14124}