In [10]:
import yt_dlp

def get_video_ids_from_playlist(playlist_url):
    """Return id/title/watch-URL records for the videos of a playlist.

    The playlist is resolved with yt-dlp using the ``extract_flat`` option and
    ``download=False``, so only metadata is requested.

    Args:
        playlist_url: URL of the playlist to enumerate.

    Returns:
        A list of dicts with the keys ``'id'``, ``'title'`` and ``'url'``
        (inserted in that order), one per playlist entry that carries a video
        id. ``'title'`` is ``None`` when the entry has no title. An empty list
        is returned, after printing a notice, when the extraction result is
        empty or has no ``'entries'`` key.
    """
    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(playlist_url, download=False)

        if not result or 'entries' not in result:
            print("Not a valid playlist URL.")
            return []

        video_info = []
        for entry in result['entries']:
            # Skip placeholder entries (None / no id) instead of crashing the
            # whole listing on a single unusable item.
            if not entry or not entry.get('id'):
                continue
            video_info.append({
                'id': entry['id'],
                # .get(): a missing title must not abort the enumeration.
                'title': entry.get('title'),
                'url': f"https://www.youtube.com/watch?v={entry['id']}",
            })
        return video_info

# Playlist to index. The list of video records returned for it is kept in
# `video_info`.
playlist_url = "https://www.youtube.com/playlist?list=PLYxtGyYUCbEGk0TkTw6iLZrYObQ-OGqCR"
video_info = get_video_ids_from_playlist(playlist_url)

In [30]:
import re
from tqdm import tqdm
from langchain.schema import Document
from youtube_transcript_api import YouTubeTranscriptApi

# Ids of videos whose transcript fetch raised; download_transcript appends to it.
failed = []

def clean_text(text):
    """Strip everything except ASCII letters, digits and whitespace.

    The text is first round-tripped through UTF-8 with errors ignored, which
    drops anything that cannot be encoded; the remaining string then has every
    character outside ``a-z``, ``A-Z``, ``0-9`` and whitespace removed.

    Args:
        text: The string to sanitise.

    Returns:
        The sanitised string.
    """
    utf8_safe = text.encode("utf-8", "ignore").decode("utf-8")
    return re.sub(r"[^a-zA-Z0-9\s]", "", utf8_safe)

def download_transcript(video_id):
    """Fetch a video's transcript and return it as one cleaned string.

    Args:
        video_id: Id of the video whose transcript should be fetched.

    Returns:
        The ``"text"`` fields of the transcript segments joined with single
        spaces and passed through ``clean_text``.

    Raises:
        Exception: Whatever ``YouTubeTranscriptApi.get_transcript`` raised.
            The id is appended to the module-level ``failed`` list before the
            error is re-raised.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # Record the failure and propagate the real error. Falling through
        # here would reach the join below with `transcript` unbound and mask
        # the cause behind an UnboundLocalError. Catching Exception rather
        # than using a bare except keeps KeyboardInterrupt/SystemExit working.
        failed.append(video_id)
        raise
    text = " ".join(segment["text"] for segment in transcript)
    return clean_text(text)

print("Downloading and converting transcripts to documents: ")

# One Document per video whose transcript could be fetched; the video title and
# URL are kept as metadata.
documents = []
for vi in tqdm(video_info):
    # Read the fields by key instead of unpacking vi.values(), so this loop
    # does not depend on the insertion order of the record's keys.
    vid, title, url = vi["id"], vi["title"], vi["url"]
    try:
        transcript = download_transcript(vid)
    except Exception:
        # download_transcript appends the id to `failed`; skip this video.
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # reaching this handler are not swallowed by the loop.
        continue
    metadata = {"source": title, "url": url}
    documents.append(Document(page_content=transcript, metadata=metadata))

print("Done!")
print("Failed video ids: ", failed)

Downloading and converting transcripts to documents: 


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 159/159 [02:47<00:00,  1.05s/it]

Done!
Failed video ids:  ['eA2uA0D8w_I', 'VVFhdR0e4AE', '7-23_WfZ4Lk', 'UHw75Nu-BUQ', 'RkXOnq10G-s', 'ASh_9dRnmv4']





In [50]:
import faiss
from uuid import uuid4
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model; it is handed to the FAISS store as its embedding function.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Flat L2 index whose dimensionality is the length of a probe embedding.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
# The store starts empty: fresh in-memory docstore and no index -> id mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Splitter used to cut each transcript into chunks before embedding:
# chunk size 1000 with an overlap of 200, both measured with len().
# NOTE(review): is_separator_regex=False presumably means the separators are
# matched as literal strings - confirm against the splitter documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len,
    is_separator_regex=False
)

print("Splitting and storing documents: ")

for doc in tqdm(documents):
    chunks = text_splitter.split_text(doc.page_content)
    # Build the chunk documents without rebinding the loop variable `doc`, and
    # give every chunk its own copy of the metadata so that editing one chunk's
    # metadata later cannot affect its siblings or the source document.
    batch = [
        Document(page_content=chunk, metadata=dict(doc.metadata))
        for chunk in chunks
    ]
    if not batch:
        # The splitter produced no chunks for this document; nothing to embed.
        continue

    # Ids are passed as strings, the type the vector store API declares.
    ids = [str(uuid4()) for _ in batch]
    vector_store.add_documents(batch, ids=ids)

print("done")

Splitting and storing documents: 


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 153/153 [2:30:03<00:00, 58.85s/it]

done





In [51]:
# Save the populated vector store locally under "faiss_index".
vector_store.save_local("faiss_index")