In [1]:
# For reading credentials from the .env file
import os
from dotenv import load_dotenv
import pandas as pd

from langchain.document_loaders import PyPDFLoader, DataFrameLoader
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# WML python SDK
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes, DecodingMethods
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM

# Load API credentials from .env file
load_dotenv()
try:
    API_KEY = os.environ.get("API_KEY")
    project_id = os.environ.get("PROJECT_ID")
except KeyError:
    API_KEY = input("Please enter your WML api key (hit enter): ")
    project_id = input("Please enter your project_id (hit enter): ")

credentials = {
    "url": "https://us-south.ml.cloud.ibm.com",
    "apikey": API_KEY
}


def get_model(model_type, max_tokens, min_tokens, decoding, temperature):

    generate_params = {
        GenParams.MAX_NEW_TOKENS: max_tokens,
        GenParams.MIN_NEW_TOKENS: min_tokens,
        GenParams.DECODING_METHOD: decoding,
        GenParams.TEMPERATURE: temperature
    }

    model = Model(
        model_id=model_type,
        params=generate_params,
        credentials=credentials,
        project_id=project_id
    )

    return model
def get_lang_chain_model(model_type, max_tokens, min_tokens, decoding, temperature):
    base_model = get_model(model_type, max_tokens, min_tokens, decoding, temperature)
    langchain_model = WatsonxLLM(model=base_model)
    return langchain_model

In [2]:
import chromadb
import pandas as pd
# Provide the path relative to the dir in which the script is running
file_path = "../data/output.pkl"
# 1. Load the dataframe
df = pd.read_pickle(file_path)
df.insert(0, "ID", df.index.astype(str))

In [3]:
df.head()

Unnamed: 0,ID,Path,Read,Extension,Content
0,0,./project_old\README.md,YES,md,# Factory Feature.\n\n
1,1,./project_old\src\app.py,YES,py,import os\n\ndef search_files(directory):\n ...


TextLoader class loads the content of some text into a list named documents.


In [52]:
from langchain.document_loaders import TextLoader
loader = TextLoader('text.txt')
documents_txt = loader.load()

In [122]:
documents_txt[0]

Document(page_content='Text Information', metadata={'source': 'some_text.txt'})

In [123]:
type(documents_txt[0])

langchain_core.documents.base.Document

## Method 1 - Standard Chroma Vector Store

In [124]:
# Create documents_df with desired structure
documents_df = []
for index, row in df.iterrows():
  page_content = row["Content"]
  path = row["Path"].replace("./project_old", "").replace("\\", "/")
  source =  path
  from langchain_core.documents.base import Document
  # Create a Document with some text content and optional metadata
  my_document = Document(page_content=page_content, 
                         metadata={"source": source})
  documents_df.append(my_document)


Then, the CharacterTextSplitter class splits the document into smaller text chunks.

In [125]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter (chunk_size=1000, chunk_overlap=30, separator=" ")
docs = text_splitter.split_documents(documents_df)
print(len(docs )) #2 

2


In [126]:
texts

[Document(page_content='# Factory Feature.', metadata={'source': '/README.md'}),
 Document(page_content='import os\n\ndef search_files(directory):\n file_list = []\n for root, dirs, files in os.walk(directory):\n for file in files:\n file_list.append(os.path.join(root, file))\n return file_list\n\ndef save_to_txt(file_list):\n with open("files.txt", "w") as file:\n for file_name in file_list:\n file.write(file_name + "\\n")\n print("File names saved to files.txt")\n\nif __name__ == "__main__":\n directory = "./current_project"\n file_list = search_files(directory)\n save_to_txt(file_list)', metadata={'source': '/src/app.py'})]

The `SentenceTransformerEmbeddings` is a module specifically designed to generate text embeddings using the Sentence Transformer library. It utilizes the all-MiniLM-L6-v2 model, which is a pre-trained model available in the library. This model is lightweight and efficient, making it well-suited for generating embeddings for various languages and tasks.

On the other hand, the `Chroma Vector Store` is a feature that allows you to store vector embeddings. It provides a convenient way to store and retrieve these embeddings.

To use the `Chroma Vector Store`, you need to import it from the `langchain.vectorstores` module. 

In [128]:
from sentence_transformers import SentenceTransformer  # Import the correct class
# Extract text content from texts
texts_content = [doc.page_content for doc in docs]  # List comprehension
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Pass the model name directly
# Generate embeddings for each text
embeddings_sample = model.encode(texts_content)


In [129]:
# Print the generated embedding
print("Text Embedding:", embeddings_sample[0][:10])

Text Embedding: [-0.04426908 -0.0081234   0.04465149 -0.0182588   0.03830957  0.04288102
  0.0580455  -0.00669592 -0.12391116 -0.05695332]


In [130]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [131]:
from langchain_community.vectorstores import Chroma
# load it into Chroma
db = Chroma.from_documents(docs, embedding_function )



In this code, `docs` refers to the list of text documents, and `embeddings` represents the corresponding vector embeddings function. The `Chroma` class enables you to create a `db` object, which can be used to store and retrieve the vector embeddings.

In [135]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x1eb9c97b190>

In [136]:
# query it
query = "README.md"
docs_search = db.similarity_search(query)
# print results
print(docs_search[0].page_content)

import os

def search_files(directory):
 file_list = []
 for root, dirs, files in os.walk(directory):
 for file in files:
 file_list.append(os.path.join(root, file))
 return file_list

def save_to_txt(file_list):
 with open("files.txt", "w") as file:
 for file_name in file_list:
 file.write(file_name + "\n")
 print("File names saved to files.txt")

if __name__ == "__main__":
 directory = "./current_project"
 file_list = search_files(directory)
 save_to_txt(file_list)


Extending the previous example, if you want to save to disk, simply initialize the Chroma client and pass the directory where you want the data to be saved to.

In [133]:
# save to disk
db = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")
# load from disk
db2 = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)
docs = db2.similarity_search(query)
print(docs[0].page_content)

import os

def search_files(directory):
 file_list = []
 for root, dirs, files in os.walk(directory):
 for file in files:
 file_list.append(os.path.join(root, file))
 return file_list

def save_to_txt(file_list):
 with open("files.txt", "w") as file:
 for file_name in file_list:
 file.write(file_name + "\n")
 print("File names saved to files.txt")

if __name__ == "__main__":
 directory = "./current_project"
 file_list = search_files(directory)
 save_to_txt(file_list)


Caution: Chroma makes a best-effort to automatically save data to disk, however multiple in-memory clients can stop each other’s work. As a best practice, only have one client per path running at any given time.

In [139]:
#help(db)

 **From Texts:**

   - If you have plain text documents, convert them into a list called `texts`.
   - Use `Chroma.from_texts()` to create `db`, specifying `texts` along with an optional `embedding_function`.


**Querying Chroma**

Chroma offers several ways to search for documents based on textual similarity:

1. **Basic Textual Search (Similarity Search):**

   - Provide a query string to `db.similarity_search()`.
   - This method returns the `k` most similar documents (default `k=4`) based on cosine similarity.


In [220]:
query = "README.md"
docs_search = db.similarity_search(query)
# Access document content:
for doc in docs_search:
    print(doc.metadata)  
    print(doc.page_content)


{'source': '/src/app.py'}
import os

def search_files(directory):
 file_list = []
 for root, dirs, files in os.walk(directory):
 for file in files:
 file_list.append(os.path.join(root, file))
 return file_list

def save_to_txt(file_list):
 with open("files.txt", "w") as file:
 for file_name in file_list:
 file.write(file_name + "\n")
 print("File names saved to files.txt")

if __name__ == "__main__":
 directory = "./current_project"
 file_list = search_files(directory)
 save_to_txt(file_list)
{'source': '/src/app.py'}
import os

def search_files(directory):
 file_list = []
 for root, dirs, files in os.walk(directory):
 for file in files:
 file_list.append(os.path.join(root, file))
 return file_list

def save_to_txt(file_list):
 with open("files.txt", "w") as file:
 for file_name in file_list:
 file.write(file_name + "\n")
 print("File names saved to files.txt")

if __name__ == "__main__":
 directory = "./current_project"
 file_list = search_files(directory)
 save_to_txt(file_list)
{'so

2. **Search with Filtering:**

   - Use the `filter` argument in `similarity_search()` to restrict results based on metadata associated with documents. The `filter` argument is a dictionary with metadata field names as keys and desired values as values.


In [221]:
query = "README.md"
filter_criteria = {"id": "0"}  # Search for documents with ID "doc2"
filtered_docs = db.similarity_search(query, filter=filter_criteria)


In [177]:
for doc in filtered_docs:
    print(doc)

3. **Search with Score Threshold:**

   - The `similarity_search_with_relevance_scores()` method retrieves documents along with their similarity scores (cosine distances) between 0 (most similar) and 1 (least similar).
   - Optionally, you can use the `score_threshold` keyword argument to filter results based on a minimum similarity score.


In [219]:
docs_with_scores = db.similarity_search_with_relevance_scores(query, score_threshold=0.7)
for doc, score in docs_with_scores:
    print(doc, score)



4. **Maximal Marginal Relevance (MMR) Search:**

   - The `max_marginal_relevance_search()` method aims to return documents that are both relevant to the query and diverse from each other. Use this when you want a set of documents that covers a broader range of related topics.


In [201]:
mmr_results = db.max_marginal_relevance_search(query, k=2, lambda_mult=0.15)  # More diversity
for doc in mmr_results:
    print(doc)

Number of requested results 20 is greater than number of elements in index 8, updating n_results = 8


page_content='import os\n\ndef search_files(directory):\n file_list = []\n for root, dirs, files in os.walk(directory):\n for file in files:\n file_list.append(os.path.join(root, file))\n return file_list\n\ndef save_to_txt(file_list):\n with open("files.txt", "w") as file:\n for file_name in file_list:\n file.write(file_name + "\\n")\n print("File names saved to files.txt")\n\nif __name__ == "__main__":\n directory = "./current_project"\n file_list = search_files(directory)\n save_to_txt(file_list)' metadata={'source': '/src/app.py'}
page_content='# Factory Feature.' metadata={'source': '/README.md'}


Method 2 - Collection in Chroma DB

In [214]:
## Generation of the document Text
# Extract the text content directly for Chroma's preferred format
documents = []
paths=[]
for index, row in df.iterrows():
    path = row["Path"].replace("./project_old", "").replace("\\", "/")
    text = "Path: " + path + "\nContent:\n " + row["Content"]
    paths.append(path[1:])
    #print(text)
    documents.append(text)
print(documents)    

['Path: /README.md\nContent:\n # Factory Feature.\n\n', 'Path: /src/app.py\nContent:\n import os\n\ndef search_files(directory):\n    file_list = []\n    for root, dirs, files in os.walk(directory):\n        for file in files:\n            file_list.append(os.path.join(root, file))\n    return file_list\n\ndef save_to_txt(file_list):\n    with open("files.txt", "w") as file:\n        for file_name in file_list:\n            file.write(file_name + "\\n")\n    print("File names saved to files.txt")\n\nif __name__ == "__main__":\n    directory = "./current_project"\n    file_list = search_files(directory)\n    save_to_txt(file_list)\n    \n']


In [215]:
paths

['README.md', 'src/app.py']

In [216]:
ids=[str(i) for i in range(len(documents))]

In [217]:
import chromadb
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")


In [218]:
collection.add(ids=ids, documents=documents)


Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 0
Add of existing embedding ID: 1


In [192]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [203]:
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embedding_function,
)
print("There are", langchain_chroma._collection.count(), "in the collection")

There are 2 in the collection


In [204]:
langchain_chroma

<langchain_community.vectorstores.chroma.Chroma at 0x1eb9d08e6b0>

In [209]:
query = "README.md"
docs_search = langchain_chroma.similarity_search(query)
# Access document content:
for doc in docs_search:
    print(doc.metadata)  
    print(doc.page_content)
     


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{}
Path: /README.md
Content:
 # Factory Feature.


{}
Path: /src/app.py
Content:
 import os

def search_files(directory):
    file_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_list.append(os.path.join(root, file))
    return file_list

def save_to_txt(file_list):
    with open("files.txt", "w") as file:
        for file_name in file_list:
            file.write(file_name + "\n")
    print("File names saved to files.txt")

if __name__ == "__main__":
    directory = "./current_project"
    file_list = search_files(directory)
    save_to_txt(file_list)
    



In [210]:
query = "README.md"
filter_criteria = {"id": "0"}  # Search for documents with ID "0"
docs_search = langchain_chroma.similarity_search(query,filter=filter_criteria)
# Access document content:
for doc in docs_search:
    print(doc.metadata)  
    print(doc.page_content)
     

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [195]:
chroma_client = chromadb.Client()
collectionname="collection_name"
try:
    print("We create a Colletion:",collectionname)
    collection1 = chroma_client.create_collection(
        name=collectionname)  
except:
    print("We load a Colletion:",collectionname)
    collection1 = chroma_client.get_collection(name=collectionname)
    

We create a Colletion: collection_name


In [198]:
collection1.add(ids=ids, documents=documents)

In [199]:

def create_embeddings(text):
    from langchain_community.embeddings import HuggingFaceEmbeddings
    import numpy as np  # Optional
    # Choose an appropriate model:
    model_name = "sentence-transformers/all-mpnet-base-v2"  # Replace with your desired model if needed
    # Set device (CPU or GPU) based on your hardware and performance requirements:
    model_kwargs = {'device': 'cpu'}  # Change to 'cuda' for GPU usage (if available)
    # Encoding options (normalization is often recommended):
    encode_kwargs = {'normalize_embeddings': True}  # Experiment with normalization
    # Initialize the embedding model:
    hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)    
    embedding = hf.embed_query(text)  # Use embed_query for single text
    return embedding

   

In [5]:
# Indicate the feature prompt that you want to include in the project
feature_request = "Generate a new professional README.md for the repository explaning the content of the application"
embedding_feature = create_embeddings(feature_request)

In [6]:
# Specify model parameters 
model_type = "meta-llama/llama-2-70b-chat"
max_tokens = 300
min_tokens = 100
decoding = DecodingMethods.GREEDY
temperature = 0.7
# Get the LangChain model
model = get_lang_chain_model(model_type, max_tokens, min_tokens, decoding, temperature)
context = collection.query(query_texts=feature_request, n_results=2)
prompt_template = '''
Using the provided context of the project pieces, please generate a new code implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.
Context: {context}
Feature Request: {feature_request}
'''
from langchain_core.prompts import PromptTemplate
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "feature_request"])
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff", 
    retriever= , 
    chain_type_kwargs=chain_type_kwargs
)
response_text = chain.run(feature_request)

In [18]:
chain_type_kwargs

{'prompt': PromptTemplate(input_variables=['context', 'feature_request'], template='\nUsing the provided context of the project pieces, please generate a new code implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.\nContext: {context}\nFeature Request: {feature_request}\n')}