In [1]:
import os
print("ok!")

ok!


In [2]:
# for repo
from git import Repo
# for context-aware-splitting
from langchain.text_splitter import Language
# loader pertaining to load github repos
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser

# std stuff for chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Embedding step requirement for embs
from langchain.embeddings.openai import OpenAIEmbeddings
# for vectorstore
from langchain.vectorstores import Chroma
# GPT-3.5 Turbo
from langchain.chat_models import ChatOpenAI

# To sustain chat memory, chain is required when impelementing memory
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

### Clone the repo and store in a given location

In [3]:
%pwd

'/home/prateek/RepoAnalyser/CodeSnippetAnalyser/research'

In [5]:
!make_dir test_repo/
target_path = "./test_repo"

from git import Repo

Repo.clone_from(url = "https://github.com/prtk1729/ECommerceBot", to_path=target_path)


<git.repo.base.Repo '/home/prateek/RepoAnalyser/CodeSnippetAnalyser/research/test_repo/.git'>

### Generic Document Loader and Language Parser

In [5]:
# parser is set, to understand the text syntaxes of a given language
target_path = "./test_repo"
loader = GenericLoader.from_filesystem(
                                        target_path+"/ecombot",
                                        glob = "**/*",
                                        suffixes = [".py"], # multiple language in codebase here I can put other suffixes
                                        parser = LanguageParser(language = Language.PYTHON, 
                                                                parser_threshold = 500) # min num oif tokens >= 500 to qualify as codebase 
                                        )

In [7]:
documents = loader.load()

In [9]:
print( documents )
# since 4 py files, 4 Document-elements in the list
len(documents)

[Document(page_content='import pandas as pd\nfrom langchain_core.documents import Document # AstraDB and langchain \n#require this format -> COnvert to this Document format\n\n\ndef create_documents(list_of_dicts):\n    \'\'\'\n        Returns a list of Document-objects\n    \'\'\'\n    ret = []\n    for le in list_of_dicts:    \n        metadata = {"product_name": le["product_name"]}\n        page_content = le["review"]\n        ret.append( Document( page_content=page_content, metadata=metadata ) )\n    return ret\n\n\ndef data_converter():\n    df = pd.read_csv("../data/flipkart_product_review.csv")\n    # print( df.head() )\n    data = df[ ["product_title", "review"] ]\n\n    list_of_dict = []\n    for index, row in data.iterrows():\n        obj = {\n                "product_name": row["product_title"],\n                "review": row["review"] \n              }\n        list_of_dict.append(obj)\n\n    docs = create_documents(list_of_dict)\n    return docs\n\n\n\nif __name__ == "__ma

4

### Context-Aware Splitting
- Basically, we tag chunks of same functions
- Align the chunks along with the tag-names as function-names

In [10]:
doc_splitter = RecursiveCharacterTextSplitter.from_language( language = Language.PYTHON,\
                                                            chunk_size = 2000, # depends on window-size of loaded LLM
                                                            chunk_overlap = 200
                                                           )

In [11]:
texts = doc_splitter.split_documents(documents)

In [14]:
print(len( texts ))
print(texts)

3
[Document(page_content='import pandas as pd\nfrom langchain_core.documents import Document # AstraDB and langchain \n#require this format -> COnvert to this Document format\n\n\ndef create_documents(list_of_dicts):\n    \'\'\'\n        Returns a list of Document-objects\n    \'\'\'\n    ret = []\n    for le in list_of_dicts:    \n        metadata = {"product_name": le["product_name"]}\n        page_content = le["review"]\n        ret.append( Document( page_content=page_content, metadata=metadata ) )\n    return ret\n\n\ndef data_converter():\n    df = pd.read_csv("../data/flipkart_product_review.csv")\n    # print( df.head() )\n    data = df[ ["product_title", "review"] ]\n\n    list_of_dict = []\n    for index, row in data.iterrows():\n        obj = {\n                "product_name": row["product_title"],\n                "review": row["review"] \n              }\n        list_of_dict.append(obj)\n\n    docs = create_documents(list_of_dict)\n    return docs\n\n\n\nif __name__ == "__

### Embedding Model

In [15]:
os.environ["OPENAI_API_KEY"] = "xxxxx"

In [17]:
embeddings = OpenAIEmbeddings( disallowed_special=() ) # ignore special chars in codebase

### Knowledge Base (VectorDB)

In [20]:
vectordb = Chroma.from_documents( documents=texts, # cas -> list_of_docs
                      embedding= embeddings, # embedding object
                       persist_directory="./data" ) # creates this data directory to store the vectors

# create the local db, inside persist_dir we can find the embs
vectordb.persist() 

### Setup the LLM

In [21]:
# llm = ChatOpenAI(model_name='gpt-4') # Not using as it will charge a lot
llm = ChatOpenAI() # default i.e 3.5 turbo


### Store the history of the chat
- Creates a buffer memory

In [22]:
memory = ConversationSummaryMemory( llm = llm, \
                                   memory_key="chat_history",
                                   return_messages=True)

### QA Wrapper
- retriver is the vectordb's retriever
- search_kwargs -> 3 : To give 3 line result

In [23]:
qa = ConversationalRetrievalChain.from_llm( llm = llm,\
                                           retriever=vectordb.as_retriever(search_type="mmr",
                                                                           search_kwargs={"k": 3}),
                                            memory=memory )

### QnA

In [26]:
question = "Elaborate more on the functionality of generation function, line by line?"

In [27]:
result = qa(question)
print(result['answer'])

Number of requested results 20 is greater than number of elements in index 9, updating n_results = 9


Sure! Let's break down the functionality of the `generation` function line by line:

1. The function starts by defining a retriever using the vector store (`vstore`) to retrieve relevant information based on the search parameters provided.

2. It then sets up a template for the product bot chat prompt. The template includes a specific context, the user's question, and a placeholder for the bot's answer.

3. It initializes an instance of the OpenAI language model for chat responses.

4. It creates a chain of operations that involve retrieving context and question, passing them through the prompt template, feeding them into the OpenAI chat model, and finally parsing the output as a string.

5. The function returns the chain of operations that can be invoked to generate responses to user queries related to product recommendations and customer queries.

In summary, the `generation` function sets up a pipeline for processing user queries related to product recommendations using a vector sto