In [None]:
!pip -q install langchain openai chromadb tiktoken sentence_transformers langchainhub lark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.4/502.4 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.6/111.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.2/178.2 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.9 M

In [None]:
import os
import json
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [None]:
# We'll be using GPT-3.5 Turbo for inference.
# Never hardcode the key in the notebook: keep a value that is already set in
# the environment, and only prompt for it (without echoing) when it is missing.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")

### EN
# 1 - Process book dataset into Langchain Documents

We start by fetching a dataset of books, each with some metadata and a plot summary, and turn it into Langchain Document objects.

### TR
# 1 - Kitap veri setini Langchain Belgelerine dönüştür

Kitaplar, bazı meta veriler ve özetler içeren bir veri setini alarak bunu Langchain Belge nesnelerine dönüştürüyoruz.


![picture](https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/5.3%20-%20RAG/books.png)

In [None]:
!wget https://github.com/kyuz0/llm-chronicles/raw/main/datasets/books.json

--2023-12-06 12:39:56--  https://github.com/kyuz0/llm-chronicles/raw/main/datasets/books.json
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/books.json [following]
--2023-12-06 12:39:57--  https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/datasets/books.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11949 (12K) [text/plain]
Saving to: ‘books.json’


2023-12-06 12:39:57 (123 MB/s) - ‘books.json’ saved [11949/11949]



### EN
We'll process each book and load it into a Langchain Document object (https://js.langchain.com/docs/modules/data_connection/document_loaders/how_to/creating_documents). This object has two main attributes:

- page_content: the actual content we want to index and search semantically
- metadata: any associated metadata, in our case title, author, publication dates, genres.

### TR
Her kitabı işleyecek ve Langchain Belge nesnesine yükleyeceğiz (https://js.langchain.com/docs/modules/data_connection/document_loaders/how_to/creating_documents).
Bu nesnenin iki ana özelliği vardır:

- page_content: Semantik olarak dizine eklemek ve aramak istediğimiz gerçek içerik.
- metadata: İlgili tüm meta veriler, bizim durumumuzda başlık, yazar, yayın tarihleri, türler.

In [None]:
def load_documents(file_path):
    """Build one Langchain ``Document`` per book record found in a JSON file.

    Args:
        file_path: Path to a JSON file whose top-level value is iterated as a
            sequence of book records. Every record must provide the keys
            ``plot_summary``, ``title``, ``author``, ``genre`` and
            ``publication_date``.

    Returns:
        A list of ``Document`` objects in file order. ``page_content`` is the
        record's plot summary; ``metadata`` holds the other four fields,
        copied unchanged.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        books = json.load(file)

    metadata_keys = ('title', 'author', 'genre', 'publication_date')
    return [
        Document(
            page_content=book['plot_summary'],
            metadata={key: book[key] for key in metadata_keys},
        )
        for book in books
    ]

# Usage example: build the documents from the downloaded dataset file.
file_path = 'books.json'
docs = load_documents(file_path)
# Last expression of the cell: displays how many documents were created.
len(docs)


20

In [None]:
# Inspect the metadata dictionary attached to the first document.
docs[0].metadata

{'title': 'Pride and Prejudice',
 'author': 'Jane Austen',
 'genre': 'Classic Romance',
 'publication_date': '1813'}

### EN
# 2 - Embed and load into vector store

We embed the documents using BGE and load them directly into Chroma. *Notice that we are not chunking the documents here, as the plot summaries are already quite short.*

### TR
# 2 - Vektör deposuna gömme ve yükleme

Belgeleri BGE kullanarak gömüyor ve doğrudan Chroma'ya yüklüyoruz. 
*Burada belgeleri parçalara ayırmadığımıza dikkat edin, çünkü özetler zaten oldukça kısa.*


In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
import torch

model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.1k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Embed every document with the BGE embeddings and index them in a Chroma vector store.
# NOTE(review): re-running this cell in the same kernel may add the documents a
# second time to the same collection — confirm, and restart the kernel if results repeat.
vectorstore = Chroma.from_documents(docs, bge_embeddings)

### EN
# 3 - Self-Query Retriever

*A self-querying retriever is one that, as the name suggests, has the ability to query itself. Specifically, given any natural language query, the retriever uses a query-constructing LLM chain to write a structured query and then applies that structured query to its underlying VectorStore.*

### TR
# 3 - Kendi Sorgusunu Yapan Retriever

*Kendi sorgusunu yapan bir retriever, adından da anlaşılacağı gibi, kendini sorgulama yeteneğine sahip bir retriever'dır. 
Özellikle, herhangi bir doğal dil sorgusu verildiğinde, retriever bir sorgu oluşturma LLM zinciri kullanarak yapılandırılmış bir sorgu oluşturur ve ardından bu yapılandırılmış sorguyu temelindeki VectorStore'a uygular.*

https://python.langchain.com/docs/modules/data_connection/retrievers/self_query/



![picture](https://raw.githubusercontent.com/kyuz0/llm-chronicles/main/5.3%20-%20RAG/self-query.png)

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Describe each metadata field so the query-constructing LLM knows which
# attributes it may filter on and what they mean.
metadata_field_info = [
    AttributeInfo(
        name="title",
        description="The title of the book",
        type="string",
    ),
    # NOTE(review): the documents store publication_date as a string (e.g. '1859')
    # while it is declared as an integer here — confirm that filters on this
    # field behave as intended.
    AttributeInfo(
        name="publication_date",
        description="The date the book was first published",
        type="integer",
    ),
    AttributeInfo(
        name="genre",
        description="The genre of the book. One of ['Classic Romance', 'Dystopian Fiction', 'Southern Gothic, Bildungsroman', 'Tragedy, Social Commentary', 'Adventure, Epic', 'Gothic Novel, Bildungsroman', 'Political Satire', 'Gothic Novel, Tragedy', 'Picaresque Novel', 'Gothic Novel, Science Fiction', 'Adventure Fiction', 'Modernist Fiction', 'Philosophical Fiction', 'Coming-of-Age Fiction', 'Realist Fiction', 'Historical Fiction', 'Symbolist Literature']",
        type="string",
    ),
    AttributeInfo(
        name="author",
        description="The author of the book.",
        type="string",
    )
]

# Short description of what page_content holds, passed to the query constructor.
document_content_description = "Brief summary of a book plot"

# temperature=0 keeps the generated structured query as repeatable as possible.
llm = ChatOpenAI(temperature=0)

# Build the self-query retriever on top of the vector store created earlier.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [None]:
# Ask for a book by publication year, which is one of the declared metadata fields.
retriever.invoke("I want to read a book published in 1859")

[Document(page_content='Set in London and Paris before and during the French Revolution, the novel tells the story of the French Doctor Manette, his 18-year-long imprisonment in the Bastille in Paris, and his release to live in London with his daughter Lucie, whom he had never met. The story is set against the conditions that led up to the French Revolution and the Reign of Terror.', metadata={'author': 'Charles Dickens', 'genre': 'Historical Fiction', 'publication_date': '1859', 'title': 'A Tale of Two Cities'})]

In [None]:
from langchain.globals import set_verbose, set_debug

# Turn on Langchain's global debug mode to trace the chain steps of the query.
# This is a global switch: it stays on for every later cell until set_debug(False).
set_debug(True)
# Same query as before, now with the debug trace printed.
retriever.invoke("I want to read a book published in 1859")

[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "I want to read a book published in 1859"
}
[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence > 3:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "I want to read a book published in 1859"
}
[36;1m[1;3m[chain/end][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence > 3:prompt:FewShotPromptTemplate] [1ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain_core",
    "prompt_values",
    "StringPromptValue"
  ],
  "kwargs": {
    "text": "Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to document contents

[Document(page_content='Set in London and Paris before and during the French Revolution, the novel tells the story of the French Doctor Manette, his 18-year-long imprisonment in the Bastille in Paris, and his release to live in London with his daughter Lucie, whom he had never met. The story is set against the conditions that led up to the French Revolution and the Reign of Terror.', metadata={'author': 'Charles Dickens', 'genre': 'Historical Fiction', 'publication_date': '1859', 'title': 'A Tale of Two Cities'})]

In [None]:
# Query about plot content (animals) rather than one of the declared metadata fields.
retriever.invoke("I want to read books with animals")

[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "I want to read books with animals"
}
[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence > 3:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "I want to read books with animals"
}
[36;1m[1;3m[chain/end][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence > 3:prompt:FewShotPromptTemplate] [1ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain_core",
    "prompt_values",
    "StringPromptValue"
  ],
  "kwargs": {
    "text": "Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to document contents\n    \"filt

[Document(page_content='This novel tells the story of a dog named Buck who is stolen from his home and sold into the brutal existence of an Alaskan sled dog. He progressively reverts to a wild state in the harsh Arctic environment. The novel explores themes of survival and the clash between civilization and the natural world.', metadata={'author': 'Jack London', 'genre': 'Adventure Fiction', 'publication_date': '1903', 'title': 'The Call of the Wild'}),
 Document(page_content="This allegorical novella reflects events leading up to and during the Stalin era before World War II. 'Animal Farm' tells the story of a group of farm animals who rebel against their human farmer, hoping to create a society where the animals can be equal, free, and happy. Ultimately, however, the rebellion is betrayed, and the farm ends up in a state as bad as it was before, under the dictatorship of a pig named Napoleon.", metadata={'author': 'George Orwell', 'genre': 'Political Satire', 'publication_date': '194

In [None]:
# Another free-text query, this time about the theme of the plot.
retriever.invoke("I want to read books about young people going on adventures")

[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "I want to read books about young people going on adventures"
}
[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence > 3:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "I want to read books about young people going on adventures"
}
[36;1m[1;3m[chain/end][0m [1m[1:retriever:Retriever > 2:chain:RunnableSequence > 3:prompt:FewShotPromptTemplate] [1ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain_core",
    "prompt_values",
    "StringPromptValue"
  ],
  "kwargs": {
    "text": "Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ tex

[Document(page_content='This novel is about the adventures of a young boy, Huck, and a runaway slave, Jim, as they travel down the Mississippi River on a raft. Throughout their journey, they encounter various characters and situations that satirize the Antebellum South, slavery, and Southern antebellum society. The novel is noted for its colorful description of people and places along the Mississippi River and its sober, often critical look at entrenched attitudes, particularly racism.', metadata={'author': 'Mark Twain', 'genre': 'Picaresque Novel', 'publication_date': '1884', 'title': 'The Adventures of Huckleberry Finn'}),
 Document(page_content="Narrated by a young man named Holden Caulfield, this novel details his life in New York City after being expelled from prep school. The narrative follows Holden's experiences and interactions over a few days, exploring themes of adolescent angst, alienation, and the loss of innocence.", metadata={'author': 'J.D. Salinger', 'genre': 'Coming-o