In [None]:
!pip -q install langchain openai chromadb tiktoken sentence_transformers langchainhub

In [None]:
import os
from getpass import getpass

import langchain
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate

In [None]:
# We'll be using GPT-3.5 Turbo for inference.
# The OpenAI clients read the key from the OPENAI_API_KEY environment variable.
# Keep a key that is already set; otherwise prompt for it so the secret is
# never written into the notebook (and an existing key is never overwritten
# with an empty string).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

### EN
# 1 - Process dataset into Langchain Documents

We start by fetching a dataset that contains transcripts of the first 20 episodes of the Huberman Lab Podcast on health and fitness.

Each episode is represented as a plain-text file, starting with the YouTube URL of the episode and the title, which we'll parse as metadata. The actual transcript starts after the "TRANSCRIPT" separator.

### TR
# 1 - Veri setini Langchain Belgelerine dönüştür

Huberman Lab Podcast'in sağlık ve fitness ile ilgili ilk 20 bölümünün transkriptlerini içeren bir veri setini alarak işe başlıyoruz.

Her bölüm bir düz metin dosyası olarak temsil edilir ve bölümün YouTube URL'si ve başlığı ile başlar. Bu bilgileri meta veri olarak ayrıştıracağız. 
Gerçek transkript, "TRANSCRIPT" ayırıcısından sonra başlar.


In [None]:
!wget https://github.com/kyuz0/llm-chronicles/raw/main/datasets/huberman-lab-transcripts.tgz
!tar xzf huberman-lab-transcripts.tgz

--2023-12-06 12:55:55--  https://github.com/kyuz0/llm-chronicles/raw/main/datasets/huberman-lab-transcripts.tgz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/huberman-lab-transcripts.tgz [following]
--2023-12-06 12:55:55--  https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/huberman-lab-transcripts.tgz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 639359 (624K) [application/octet-stream]
Saving to: ‘huberman-lab-transcripts.tgz’


2023-12-06 12:55:55 (32.8 MB/s) - ‘huberman-lab-transcripts.tgz’ saved [639359/639359]



### EN
We'll process each episode and load it into a Langchain Document object (https://js.langchain.com/docs/modules/data_connection/document_loaders/how_to/creating_documents). This object has two main attributes:

- page_content: the actual content we want to index and search semantically
- metadata: any associated metadata, in our case title and YouTube url.

### TR
Her bölümü işleyecek ve Langchain Belge nesnesine yükleyeceğiz (https://js.langchain.com/docs/modules/data_connection/document_loaders/how_to/creating_documents). 
Bu nesnenin iki ana özelliği vardır:

- page_content: Semantik olarak dizine eklemek ve aramak istediğimiz gerçek içerik.
- metadata: İlgili tüm meta veriler, bizim durumumuzda başlık ve YouTube URL'si.

In [None]:
def process_txt_file(file_path):
    """Parse one transcript file into a LangChain ``Document``.

    The file is read as UTF-8. Its first line is taken as the episode's
    YouTube URL, its third line as the episode title, and everything after
    the first line consisting of ``TRANSCRIPT`` becomes the page content.

    Args:
        file_path: Path to the transcript ``.txt`` file.

    Returns:
        Document: ``page_content`` holds the text after the ``TRANSCRIPT``
        line; ``metadata`` holds ``source`` (the URL) and ``title``.

    Raises:
        IndexError: If the file has fewer than three lines.
        ValueError: If no ``TRANSCRIPT`` separator line is present.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Extract URL and title. The header lines can carry a byte-order mark
    # (U+FEFF), which str.strip() does not treat as whitespace, so drop it
    # explicitly to keep the metadata clean.
    url = lines[0].replace('\ufeff', '').strip()
    title = lines[2].replace('\ufeff', '').strip()

    # Locate the separator while tolerating surrounding whitespace or a
    # missing trailing newline on that line.
    transcript_index = next(
        (i for i, line in enumerate(lines) if line.strip() == 'TRANSCRIPT'),
        None,
    )
    if transcript_index is None:
        raise ValueError(f"'TRANSCRIPT' separator line not found in {file_path}")

    # Everything after the separator is the transcript body.
    page_content = ''.join(lines[transcript_index + 1:])

    return Document(page_content=page_content, metadata={'source': url, 'title': title})


def create_documents_from_directory(directory_path):
    """Build one Document per ``.txt`` file found directly in a directory.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, which would make the position of each document in the
    result (e.g. ``docs[0]``) vary between machines and runs.

    Args:
        directory_path: Directory containing the transcript ``.txt`` files.

    Returns:
        list: The documents produced by ``process_txt_file``, ordered by
        filename. Entries not ending in ``.txt`` are ignored.
    """
    return [
        process_txt_file(os.path.join(directory_path, filename))
        for filename in sorted(os.listdir(directory_path))
        if filename.endswith('.txt')
    ]

# Example usage: build one Document per transcript file in the extracted
# dataset folder, then show how many were loaded.
directory_path = 'huberman-lab-transcripts'
docs = create_documents_from_directory(directory_path)
len(docs)


20

In [None]:
# Metadata parsed for the first document: the episode's source URL and title.
docs[0].metadata

{'source': 'https://www.youtube.com/watch?v=ntfcfJ28eiU',
 'title': '\ufeffTools for Managing Stress & Anxiety | Huberman Lab Podcast #10'}

In [None]:
# Peek at the first 200 characters of the first transcript.
docs[0].page_content[:200]

"\n\n  (00:00:00) Introduction\nWelcome to the Huberman Lab Podcast where we discuss science and science-based tools for everyday life I'm Andrew Huberman. And I'm a Professor of Neurobiology and Ophthalm"

In [None]:
# Splitter settings: chunk_size 700 with chunk_overlap 200; add_start_index
# stores each chunk's offset in its source document as `start_index` metadata.
splitter_options = {
    "chunk_size": 700,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every transcript into chunks and report how many were produced.
all_splits = text_splitter.split_documents(docs)
len(all_splits)

4581

In [None]:
# Show the text of the second chunk produced by the splitter.
all_splits[1].page_content

"Our first sponsor is InsideTracker. InsideTracker analyzes your blood and DNA to give you an accurate assessment of your health and your biological age. There are many things about our health that can only be analyzed from blood and DNA tests. I've been getting my blood assessed for many years now. And about a year ago, I switched to InsideTracker. What I like about InsideTracker is that you get all this information back about metabolic factors, endocrine factors, et cetera, that are really important to your health. But unlike a lot of blood tests where you just get all the numbers back and it tells you whether or not things are high, normal, or low, InsideTracker also has this really"

### EN
# 3 - Embedding chunks and loading into a vector database

This is a key preparation step for us to be able to perform semantic search on the transcripts.

- **BGE Embeddings**: BGE models on Hugging Face are among the best-performing open-source embedding models. BGE is created by the Beijing Academy of Artificial Intelligence (BAAI) - https://huggingface.co/BAAI/bge-large-en (the code below loads the `BAAI/bge-base-en` variant).
- **Chroma**: Chroma is an open-source vector database for building AI applications with embeddings. It comes with everything you need to get started built in, and runs on your machine. Check out a more comprehensive list of vector databases here -> https://www.datacamp.com/blog/the-top-5-vector-databases.

### TR
# 3 - Parçaları gömme ve bir vektör veritabanına yükleme

Bu, transkriptler üzerinde semantik arama yapabilmemiz için önemli bir hazırlık adımıdır.

- **BGE Gömüleri**: HuggingFace üzerindeki BGE modelleri, en iyi performans gösteren açık kaynaklı gömme modelleri arasındadır. 
  BGE, Pekin Yapay Zeka Akademisi (BAAI) tarafından oluşturulmuştur - https://huggingface.co/BAAI/bge-large-en
- **Chroma**: Chroma, gömülerle yapay zeka uygulamaları oluşturmak için kullanılan açık kaynaklı bir vektör veritabanıdır. 
  Başlamak için ihtiyacınız olan her şeyi içerir ve makinenizde çalışır. Daha kapsamlı bir vektör veritabanı listesi için buraya göz atabilirsiniz -> https://www.datacamp.com/blog/the-top-5-vector-databases.


![picture](https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/5.3%20-%20RAG/vector-store.png)

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
import torch

model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sentence-embedding model used to embed both the transcript chunks and the
# hypothetical passages generated later.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.1k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# 4 - HyDE
### EN
*HyDE (Hypothetical Document Embeddings) is a retrieval method that enhances retrieval by generating a hypothetical document for an incoming query and embedding that document in place of the raw query.*

### TR
*Hyde, Hipotetik Belge Gömüleri (HyDE) anlamına gelen bir alma yöntemidir. 
Bu yöntem, gelen bir sorgu için hipotetik bir belge üreterek almayı geliştirmek için kullanılır.*

https://arxiv.org/abs/2212.10496

https://github.com/langchain-ai/langchain/tree/master/cookbook

![picture](https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/5.3%20-%20RAG/hyde.png)

In [None]:
# LLM that drafts the hypothetical passage for each incoming query.
llm = OpenAI()

# Wrap the BGE embedder in a HyDE embedder that uses the built-in
# "web_search" prompt to generate the passage.
hyde = HypotheticalDocumentEmbedder.from_llm(
    llm=llm,
    base_embeddings=bge_embeddings,
    prompt_key="web_search",
)

In [None]:
# Show the prompt template the HyDE chain sends to the LLM to obtain a passage.
hyde.llm_chain.prompt

PromptTemplate(input_variables=['QUESTION'], template='Please write a passage to answer the question \nQuestion: {QUESTION}\nPassage:')

In [None]:
# Turn on LangChain's global debug output so the LLM call made inside
# embed_query (prompt in, generated passage out) is printed.
# The flag is not reset here, so cells run afterwards print debug traces too.
langchain.debug = True
result = hyde.embed_query("What are some good ways to increase motivation?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: What are some good ways to increase motivation?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [2.96s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nIncreasing motivation can be a difficult task, but there are several strategies that can help. First, setting attainable goals can be a great way to motivate yourself. Having a goal to work towards can provide a sense of purpose and direction. Secondly, positive affirmations can be an effective way to increase motivation. Positive self-talk can help to build self-confidence and boost morale. Additionally, rewarding yourself for reaching goals can be a great way to stay motivated. Celebrating accomplishments, no matter how small, can help to keep you focused and motivated. Finally, surrounding yourself with supportive people w

### EN
## 5 - Using HyDE Embeddings

### TR
## 5 - HyDE Gömülerini Kullanma

In [None]:
# Index every transcript chunk in a Chroma vector store, using the HyDE
# embedder as the store's embedding function.
# NOTE(review): no persist directory is passed, so the index presumably lives
# only in memory and re-running this cell may add the chunks again — confirm.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=hyde)

In [None]:
# Similarity-search retriever over the vector store, configured with k=4
# results per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
# Sample question used to exercise the retriever.
sample_question = "What are some good ways to increase motivation?"

# Fetch the matching chunks and show how many came back.
retrieved_docs = retriever.get_relevant_documents(sample_question)
len(retrieved_docs)

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: What are some good ways to increase motivation?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [3.40s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " \nIncreasing motivation is an important part of achieving success in any area. There are a few effective ways to increase motivation. One of the most important is to set achievable goals and make a plan for how to achieve them. Breaking big goals into smaller, more manageable steps can help make them more achievable. Additionally, it is important to reward yourself for each step you take towards achieving your goals, no matter how small. This can help to keep you motivated and on track. Another way to increase motivation is to practice visualization. Visualizing success can help to create a positive mindset and keep you focuse

4

In [None]:
# Display the retrieved chunks together with their metadata.
retrieved_docs

[Document(page_content="the reward of the accomplishment itself. So be aware these positive reinforcements also. I'm not saying people should flagellate themselves to the point of victory in whatever they're pursuing, but motivation is a tricky one. So I suggest that everyone asks themselves what is it that I want to accomplish? And what is it that's driving me to accomplish this and come up with two or three things. Fear-based perhaps, love-based perhaps or perhaps several of those in order to ensure alertness, energy and attention for the task. And that brings us to the attention part. Now it's one thing to have an electrode embedded into your brain and increase the amount of acetylcholine. It's another to exist", metadata={'source': 'https://www.youtube.com/watch?v=LG53Vxum0as&t=4213s', 'start_index': 50666, 'title': '\ufeffHow to Focus to Change Your Brain | Huberman Lab Podcast #6'}),
 Document(page_content="and we've all heard before of growth mindset this incredible discovery of

### EN
# 6 - Full RAG Chain

Let's now put everything together to build a fully functional RAG chain using the LangChain Expression Language -> https://python.langchain.com/docs/expression_language/.

### TR
# 6 - Tam RAG Zinciri

Şimdi her şeyi bir araya getirerek, tamamen işlevsel bir RAG zinciri oluşturmak için Langchain İfade Dili'ni kullanacağız -> https://python.langchain.com/docs/expression_language/.

![picture](https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/5.3%20-%20RAG/retrieval.png)

In [None]:
# Pull the "rlm/rag-prompt" template from the LangChain Hub and create the
# chat model (gpt-3.5-turbo, temperature 0) that writes the final answer.
# Note: `llm` is rebound here; it previously named the OpenAI() instance
# passed to the HyDE embedder.
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        str: The ``page_content`` values joined by a blank line, in input
        order (an empty string when ``docs`` is empty).
    """
    chunks = [doc.page_content for doc in docs]
    return "\n\n".join(chunks)

# Retrieval branch: fetch the relevant chunks and merge them into one string.
context_branch = retriever | format_docs

# Feed the incoming question to both prompt variables: the retrieved context
# and the question itself, passed through unchanged.
prompt_inputs = {"context": context_branch, "question": RunnablePassthrough()}

# inputs -> prompt -> chat model -> plain string answer
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()

prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [None]:
# Run the full chain on a sample question: retrieve context, fill the prompt,
# and generate the answer.
rag_chain.invoke("What are some good ways to increase motivation?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What are some good ways to increase motivation?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "input": "What are some good ways to increase motivation?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What are some good ways to increase motivation?"
}
[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: What are some good ways to increase motivation?\nPassage:"
  ]
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 4:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What are some good ways to increase mot

'One way to increase motivation is to celebrate your wins, but not every win. By selectively rewarding your good behavior or performance, you can maintain a healthy level of motivation without experiencing a dopamine crash. Additionally, it can be helpful to set intermediate goals and not overly celebrate each one, as this can help avoid diminishing dopamine levels and maintain motivation.'