In [1]:
!pip install -q langchain-text-splitters==0.3.5
!pip install -qU langchain-openai
!pip install -qU "langchain-chroma>=0.1.2"

## 1.Get playlist Subtitle

In [2]:
from ytkit import GetPlaylistSubtitle

we are using this playlist: https://www.youtube.com/watch?v=YHBVjv4MYXE&list=PL8dPuuaLjXtNamNKW5qlS-nKgA0on7Qze

In [8]:
get_playlist_subtlte = GetPlaylistSubtitle()
get_playlist_subtlte.get(playlist_id="PL8dPuuaLjXtNamNKW5qlS-nKgA0on7Qze",
                        save_to_disk="/home/rahul/projects/ytkit/data",
                        language="en")

100%|██████████| 18/18 [00:36<00:00,  2.05s/it]


## 2. Splitting-Text

In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Specify the directory
directory = "/home/rahul/projects/ytkit/data"

# Get all files from the directory
files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
texts=[]
for file in files:
    # Load example document
    with open(f"{directory}/{file}") as f:
        text = f.read()
        texts.append(text)

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents(texts)

In [2]:
texts[:3]


[Document(metadata={}, page_content='You know what business people really like\nto talk about? Money. Profit, revenue, income, assets, cash flow\n-- all these words mean money, but they all have specific uses. In business, money is important to us and\nwe want to describe it as accurately as possible. That can make it confusing for new entrepreneurs'),
 Document(metadata={}, page_content='to talk about the money flowing into their business, and it seems like we need a translator\nfor all the jargon! But, really, making money comes down to understanding\na few basic terms and setting up some sales structures that let customers make purchases\nin a way that works for them. I’m Anna Akana, and this is Crash Course'),
 Document(metadata={}, page_content='Business: Entrepreneurship. [Theme Music Plays] Money can be an awkward subject, I get it. But to make a living, have an impact, and\nbe taken seriously at decision-making tables, we entrepreneurs need to know the ins-and-outs\nof our busi

## 3. Ingesting in vectorDB

In [29]:
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [30]:
from langchain_chroma import Chroma

vector_store = Chroma(
    embedding_function=embeddings,
)
vector_store.add_documents(documents=texts)

['e9e078a4-6a75-4095-8c0c-04ec41c1b5b5',
 '7611bdbd-8339-4326-b4e4-a5a503635b17',
 '9ea6f577-dd5a-4e03-b57a-f3fc0166d84c',
 'fef4b539-f742-417c-a297-f188178117bc',
 '2bfc76d7-7b14-4479-a067-0249736c7f9f',
 '4ad83d60-1c49-490e-b88f-1ada7bb1e69f',
 'f61f30ea-70fc-4f49-915a-70403dc15f0f',
 '94b5517a-85b9-4013-8026-e93311236090',
 '8e74ff74-91fd-4ceb-89e7-ce1c990f7382',
 'c7009fa1-db3e-47ed-bfbc-d229b9efe446',
 '4c34297d-020d-44e8-a3af-5e0c844d7ccf',
 '6844a715-bf07-40ea-97a7-19d966c3b291',
 'a41d01c0-01dc-4667-8aa7-6f948ff2ebec',
 'c24b574b-c360-45e2-acca-609859e6ba50',
 'c3361503-71a6-4df6-8c54-36e1ef4b3d4d',
 'e73d57a8-1872-4cbe-95f2-c53c79e7281d',
 'd6d096d1-1381-4481-aacc-4a131068c285',
 '7e6c8ded-c9cd-4d14-ac8e-ea3f604f585f',
 'd888b4cc-4f71-4a57-9e87-097bc4a8e3f1',
 '6dd537f3-43d8-44f7-9d62-98d24e71df39',
 '87775317-7f19-43f6-a013-67a725f39a26',
 'be3e699a-6b40-4f37-a83f-5b46daf2c8f0',
 'd2fdb98d-82a2-438f-abd6-efcef92d1b3f',
 'dbe30592-b3fa-49fe-aa22-d5a7c948f77b',
 '3f3a3cf5-8fd6-

In [23]:
query = "how to start a startup"
retriever = vector_store.as_retriever(
            search_type="similarity", search_kwargs={"k": 1}
        )
relevant_docs = retriever.invoke(query)
relevant_docs

[Document(id='dcbfa94f-03e9-4be6-a0f0-db573b6091fa', metadata={}, page_content='on the entrepreneur in charge and what value they’re focused on providing. That being said, here are four solid starting\npoints from the online entrepreneurship magazines Entrepreneur and Inc: One, use familiar language -- not a bunch\nof business jargon -- and empathy. Two, be clear, concise, and timely. Three, delight the customer with originality')]

## 4. Inference

In [21]:
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
model = ChatOpenAI(model="gpt-4o-mini")

In [24]:
from langchain import hub
prompt = hub.pull("empler-ai/rag-prompt")



In [25]:
writer = prompt | model | StrOutputParser()

In [26]:
content=[doc.page_content for doc in relevant_docs]
context = "\n\n".join(content)

In [27]:
answer=writer.invoke({"question":query,"context":context})


In [28]:
answer

'To start a startup, consider these four solid starting points: \n\n1. Use familiar language that resonates with your target audience, avoiding business jargon, and incorporate empathy into your communication.\n2. Be clear, concise, and timely in your messaging and actions.\n3. Focus on delighting the customer by offering something original.\n\nAdditionally, keep in mind the importance of understanding the entrepreneur in charge and the specific value they aim to provide.'