In [1]:
import openai 
import tiktoken 
import chromadb 
import langchain

In [2]:
from langchain.custom_text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser

In [3]:
# Repo path: root directory whose files are loaded and indexed below.
repo_path: str = "./zepo"

In [4]:
# Load: walk repo_path recursively (glob "**/*") with no suffix filter
# (suffixes=[]) and hand each file to LanguageParser.
# NOTE(review): parser_threshold=500 is forwarded to LanguageParser as-is;
# confirm its exact meaning against the LanguageParser documentation.
loader = GenericLoader.from_filesystem(
    repo_path,
    parser=LanguageParser(parser_threshold=500),
    glob="**/*",
    suffixes=[],
)
documents = loader.load()

# Number of documents produced by the loader.
len(documents)

8

In [7]:
from langchain.custom_text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Split the loaded documents into chunks (chunk_size=2000, chunk_overlap=200).
from langchain.custom_text_splitter import RecursiveCharacterTextSplitter

# NOTE: despite its name, `python_splitter` is configured for SQL (Language.SQL).
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.SQL,
    chunk_size=2000,
    chunk_overlap=200,
)
texts = python_splitter.split_documents(documents)

# Number of chunks produced by the splitter.
len(texts)

154

In [6]:
# Persist directory: where the Chroma vector store is written and later reloaded from.
# A raw string is used so the Windows backslashes are taken literally; in a normal
# string literal "\P" is an invalid escape sequence (warning today, error in the
# future) and a folder starting with e.g. "n" or "t" would silently corrupt the path.
persist_directory = r'C:\ProjectLFG\PersistedEmbeddings'

In [7]:
# Create the embeddings and index the chunks in a Chroma vector store.
from langchain.vectorstores import Chroma
# Any other embedding backend (e.g. Hugging Face) could be substituted here.
from langchain.embeddings.openai import OpenAIEmbeddings

# NOTE(review): disallowed_special=() is presumably there so that special-token-like
# text inside the source files does not make tokenization fail — confirm against
# the OpenAIEmbeddings documentation.
embedding = OpenAIEmbeddings(disallowed_special=())

# Build the store from the split chunks, targeting persist_directory.
db = Chroma.from_documents(texts, embedding, persist_directory=persist_directory)

In [8]:
# Persist the embedding DB to disk, then rebind `db` to None so the in-memory
# handle is dropped before the store is reloaded from persist_directory.
db.persist()
db = None

In [9]:
# Re-open the vector store from persist_directory, reusing the same embedding
# object for queries; from here on `db` is used as normal.
db = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [10]:
# Set up the retriever on top of the vector store.
#   search_type: "mmr" is used here; "similarity" is still to be tested.
#   k: number of relevant documents to return per query.
retriever = db.as_retriever(search_type="mmr", search_kwargs=dict(k=8))

In [11]:
# Ask the retriever for the chunks relevant to the question, then show them.
question = 'Get me list of all tables from which "#AllProductTransactionsNoCalculation" table is created'
docs = retriever.get_relevant_documents(question)
docs

[Document(page_content="IF @OpportunityLicenseProgram = 'OSSA'\n\t\t\tBEGIN\n\t\t\t\tINSERT INTO #AllProductTransactionsNoCalculation\n\t\t\t\tSELECT TP.[OpportunityKey]\n\t\t\t\t, VT.[SubscriptionID]\n\t\t\t\t, TP.[TenantID]\n\t\t\t\t, TP.[AgreementID]\n\t\t\t\t, TP.[OrgID]\n\t\t\t\t, TP.[ProductID]\n\t\t\t\t, TP.[ProductKey]\n\t\t\t\t, VT.[SalesDate]\n\t\t\t\t, VT.[OriginalPurchaseAmount]\n\t\t\t\t, VT.[LicenseTransactionItemId]\n\t\t\t\t, VT.[RevSumCategoryID]\n\t\t\t\t, OP.[ExtendedAmount]\n\t\t\t\tFROM (\n\t\t\t\t\tSELECT DISTINCT [ProductID]\n\t\t\t\t\t\t, [OpportunityKey]\n\t\t\t\t\t\t, [SubscriptionID]\n\t\t\t\t\t\t, [TenantID]\n\t\t\t\t\t\t, [AgreementID]\n\t\t\t\t\t\t, [OrgID]\n\t\t\t\t\t\t, [ProductKey]\n\t\t\t\t\tFROM #CurrentMOSPOpportunities OPPTY\n\t\t\t\t\tWHERE [Priority] = @CurrentPriority\n\t\t\t\t\t) TP\n\t\t\t\tINNER JOIN #AllValidTransactions VT\n\t\t\t\t\tON VT.[ProductID] = TP.[ProductID]\n\t\t\t\t\t\tAND (\n\t\t\t\t\t\t\tVT.[SubscriptionID] = TP.[SubscriptionID

In [32]:
# Dump every split to "test-splits.sql" for manual inspection.
# The file is opened in a `with` block so it is flushed and closed even if the
# write fails (the handle used to stay open for the life of the kernel), and
# the encoding is pinned to UTF-8 instead of the platform default.
# Each element of `texts` is iterated and its items are stringified and joined
# with spaces; one split per line.
# NOTE(review): iterating a split yields whatever its __iter__ provides, not just
# the chunk text — confirm this is the intended dump format.
with open("test-splits.sql", "w", encoding="utf-8") as f:
    chars_written = f.write('\n'.join(' '.join(str(x) for x in t) for t in texts))

# Number of characters written (value returned by f.write).
chars_written

261843

In [None]:
# Retrieve: build a vector store from the chunks and expose it as a retriever.
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# NOTE(review): no persist_directory is passed here, so this is a separate index
# from the one persisted earlier and presumably re-embeds every chunk — consider
# reusing the persisted store instead.
db = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(disallowed_special=()),
)

# "mmr" search ("similarity" is still to be tested); k=8 chunks are returned per query.
retriever = db.as_retriever(search_kwargs={"k": 8}, search_type="mmr")

In [None]:
# Fetch the documents the retriever considers relevant to the question.
question = 'List all the subdependent tables for the table #AllProductTransactionsNoCalculation'
docs = retriever.get_relevant_documents(question)

In [None]:
# Display the retrieved documents as this cell's output.
docs