In [1]:
# For reading credentials from the .env file
import os
from dotenv import load_dotenv
import pandas as pd

from langchain.document_loaders import PyPDFLoader, DataFrameLoader
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# WML python SDK
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes, DecodingMethods
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM

# Load API credentials from .env file
load_dotenv()
try:
    API_KEY = os.environ.get("API_KEY")
    project_id = os.environ.get("PROJECT_ID")
except KeyError:
    API_KEY = input("Please enter your WML api key (hit enter): ")
    project_id = input("Please enter your project_id (hit enter): ")

credentials = {
    "url": "https://us-south.ml.cloud.ibm.com",
    "apikey": API_KEY
}


def get_model(model_type, max_tokens, min_tokens, decoding, temperature):

    generate_params = {
        GenParams.MAX_NEW_TOKENS: max_tokens,
        GenParams.MIN_NEW_TOKENS: min_tokens,
        GenParams.DECODING_METHOD: decoding,
        GenParams.TEMPERATURE: temperature
    }

    model = Model(
        model_id=model_type,
        params=generate_params,
        credentials=credentials,
        project_id=project_id
    )

    return model
def get_lang_chain_model(model_type, max_tokens, min_tokens, decoding, temperature):
    base_model = get_model(model_type, max_tokens, min_tokens, decoding, temperature)
    langchain_model = WatsonxLLM(model=base_model)
    return langchain_model

In [2]:
import chromadb
import pandas as pd
# Provide the path relative to the dir in which the script is running
file_path = "../data/output.pkl"
# 1. Load the dataframe
df = pd.read_pickle(file_path)
df.insert(0, "ID", df.index.astype(str))

In [3]:
df.head()

Unnamed: 0,ID,Path,Read,Extension,Content
0,0,./project_old\README.md,YES,md,# Factory Feature.\n\n
1,1,./project_old\src\app.py,YES,py,import os\n\ndef search_files(directory):\n ...


TextLoader class loads the content of some text into a list named documents.


In [4]:
from langchain.document_loaders import TextLoader
loader = TextLoader('text.txt')
documents_txt = loader.load()

In [5]:
documents_txt[0]

Document(page_content='Text Information', metadata={'source': 'text.txt'})

In [6]:
type(documents_txt[0])

langchain_core.documents.base.Document

## Method 1 - Standard Chroma Vector Store

In [7]:
# Create documents_df with desired structure
documents_df = []
for index, row in df.iterrows():
  page_content = row["Content"]
  path = row["Path"].replace("./project_old", "").replace("\\", "/")
  source =  path
  text = "Path: " + path + "\nContent:\n " + row["Content"]
  from langchain_core.documents.base import Document
  # Create a Document with some text content and optional metadata
  my_document = Document(page_content=text, 
                         metadata={"source": source})
  documents_df.append(my_document)


Then, the CharacterTextSplitter class splits the document into smaller text chunks.

In [8]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter (chunk_size=1000, chunk_overlap=30, separator=" ")
docs = text_splitter.split_documents(documents_df)
print(len(docs )) #2 

2


In [9]:
docs

[Document(page_content='Path: /README.md\nContent:\n # Factory Feature.', metadata={'source': '/README.md'}),
 Document(page_content='Path: /src/app.py\nContent:\n import os\n\ndef search_files(directory):\n file_list = []\n for root, dirs, files in os.walk(directory):\n for file in files:\n file_list.append(os.path.join(root, file))\n return file_list\n\ndef save_to_txt(file_list):\n with open("files.txt", "w") as file:\n for file_name in file_list:\n file.write(file_name + "\\n")\n print("File names saved to files.txt")\n\nif __name__ == "__main__":\n directory = "./current_project"\n file_list = search_files(directory)\n save_to_txt(file_list)', metadata={'source': '/src/app.py'})]

The `SentenceTransformerEmbeddings` is a module specifically designed to generate text embeddings using the Sentence Transformer library. It utilizes the all-MiniLM-L6-v2 model, which is a pre-trained model available in the library. This model is lightweight and efficient, making it well-suited for generating embeddings for various languages and tasks.

On the other hand, the `Chroma Vector Store` is a feature that allows you to store vector embeddings. It provides a convenient way to store and retrieve these embeddings.

To use the `Chroma Vector Store`, you need to import it from the `langchain.vectorstores` module. 

In [10]:
from sentence_transformers import SentenceTransformer  # Import the correct class
# Extract text content from texts
texts_content = [doc.page_content for doc in docs]  # List comprehension
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Pass the model name directly
# Generate embeddings for each text
embeddings_sample = model.encode(texts_content)


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Print the generated embedding
print("Text Embedding:", embeddings_sample[0][:10])

Text Embedding: [-0.01904181 -0.04634443 -0.11461061 -0.01719481  0.08684908  0.02725298
 -0.04432368  0.02173775 -0.09444111 -0.00227118]


In [12]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [13]:
from langchain_community.vectorstores import Chroma
# load it into Chroma
db = Chroma.from_documents(docs, embedding_function )



In this code, `docs` refers to the list of text documents, and `embeddings` represents the corresponding vector embeddings function. The `Chroma` class enables you to create a `db` object, which can be used to store and retrieve the vector embeddings.

In [14]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x241ddd8a3e0>

In [15]:
# query it
query = "README.md"
docs_search = db.similarity_search(query)
# print results
print(docs_search[0].page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


Path: /README.md
Content:
 # Factory Feature.


Extending the previous example, if you want to save to disk, simply initialize the Chroma client and pass the directory where you want the data to be saved to.

In [16]:
# save to disk
db = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")
# load from disk
db2 = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)
docs = db2.similarity_search(query)
print(docs[0].page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


Path: /README.md
Content:
 # Factory Feature.


Caution: Chroma makes a best-effort to automatically save data to disk, however multiple in-memory clients can stop each other’s work. As a best practice, only have one client per path running at any given time.

In [239]:
#help(db)

 **From Texts:**

   - If you have plain text documents, convert them into a list called `texts`.
   - Use `Chroma.from_texts()` to create `db`, specifying `texts` along with an optional `embedding_function`.


**Querying Chroma**

Chroma offers several ways to search for documents based on textual similarity:

1. **Basic Textual Search (Similarity Search):**

   - Provide a query string to `db.similarity_search()`.
   - This method returns the `k` most similar documents (default `k=4`) based on cosine similarity.


In [17]:
query = "README.md"
docs_search = db.similarity_search(query)
# Access document content:
for doc in docs_search:
    print(doc.metadata)  
    print(doc.page_content)


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'source': '/README.md'}
Path: /README.md
Content:
 # Factory Feature.
{'source': '/src/app.py'}
Path: /src/app.py
Content:
 import os

def search_files(directory):
 file_list = []
 for root, dirs, files in os.walk(directory):
 for file in files:
 file_list.append(os.path.join(root, file))
 return file_list

def save_to_txt(file_list):
 with open("files.txt", "w") as file:
 for file_name in file_list:
 file.write(file_name + "\n")
 print("File names saved to files.txt")

if __name__ == "__main__":
 directory = "./current_project"
 file_list = search_files(directory)
 save_to_txt(file_list)


2. **Search with Filtering:**

   - Use the `filter` argument in `similarity_search()` to restrict results based on metadata associated with documents. The `filter` argument is a dictionary with metadata field names as keys and desired values as values.


In [18]:
query = "README.md"
filter_criteria = {"id": "0"}  # Search for documents with ID "doc2"
filtered_docs = db.similarity_search(query, filter=filter_criteria)
for doc in filtered_docs:
    print(doc.page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


3. **Search with Score Threshold:**

   - The `similarity_search_with_relevance_scores()` method retrieves documents along with their similarity scores (cosine distances) between 0 (most similar) and 1 (least similar).
   - Optionally, you can use the `score_threshold` keyword argument to filter results based on a minimum similarity score.


In [19]:
docs_with_scores = db.similarity_search_with_relevance_scores(query, score_threshold=0.4)
for doc, score in docs_with_scores:
    print(doc.page_content, score)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


4. **Maximal Marginal Relevance (MMR) Search:**

   - The `max_marginal_relevance_search()` method aims to return documents that are both relevant to the query and diverse from each other. Use this when you want a set of documents that covers a broader range of related topics.


In [23]:
mmr_results = db.max_marginal_relevance_search(query, k=2, lambda_mult=0.75)  # More diversity
for doc in mmr_results:
    print(doc.page_content)

Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2


Path: /README.md
Content:
 # Factory Feature.
Path: /src/app.py
Content:
 import os

def search_files(directory):
 file_list = []
 for root, dirs, files in os.walk(directory):
 for file in files:
 file_list.append(os.path.join(root, file))
 return file_list

def save_to_txt(file_list):
 with open("files.txt", "w") as file:
 for file_name in file_list:
 file.write(file_name + "\n")
 print("File names saved to files.txt")

if __name__ == "__main__":
 directory = "./current_project"
 file_list = search_files(directory)
 save_to_txt(file_list)


In [39]:
#Method 1
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# load it into Chroma
docsearch  = Chroma.from_documents(docs, embedding_function )
# Indicate the feature prompt that you want to include in the project
feature_request = "Generate a new professional README.md for the repository explaning the content of the application"
qa = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)


In [40]:
qa.run(feature_request)

" Sure, here's a possible new README.md for the repository:\n\nThis repository contains a web application that demonstrates the use of a factory feature. The application allows users to create, read, update, and delete (CRUD) factory objects.\n\nThe factory feature is implemented using a combination of JavaScript and a backend API. The frontend of the application is built using React, while the backend is built using Node.js and Express.js.\n\nThe application includes the following features:\n\n* A list view that displays all factories in the system\n* A detail view that displays information about a specific factory\n* A form view that allows users to create a new factory\n* A form view that allows users to update an existing factory\n* A delete view that allows users to delete a factory\n\nThe application also includes a search bar that allows users to search for factories by name or description.\n\nI don't know the answer to this question.\n\nPlease select one of the following option

In [95]:
qa = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 6}),
)

In [96]:
qa.run(feature_request)

" Sure, here's a possible README.md for the repository:\n\nThis repository contains a Python application that demonstrates the use of the Factory Feature pattern. The application allows users to search for files in a given directory and save the file names to a text file.\n\nThe application consists of two main parts: the search function and the save function. The search function uses the os module to iterate through all files in a given directory and its subdirectories, and appends the file names to a list. The save function uses the same list of file names to write the names to a text file.\n\nTo use the application, navigate to the directory where the application is located and run the script with the directory as an argument. For example: python app.py ./current_project. This will search for files in the current_project directory and save the file names to a file called files.txt in the same directory.\n\nThe application is designed to be modular and easy to use, and can be adapted

In [None]:
# Helper function for printing docs


def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [87]:
#Method2
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
#!pip install faiss-cpu
retriever = FAISS.from_documents(docs, embedding_function).as_retriever(
    search_kwargs={"k": 20}
)
feature_request = "Generate a new professional README.md for the repository explaning the content of the application"
docs_search = retriever.get_relevant_documents(feature_request)
#pretty_print_docs(docs_search)

In [88]:
chain = RetrievalQA.from_chain_type(
    llm=model, retriever=retriever
)


In [89]:
chain.invoke({"query": feature_request})

{'query': 'Generate a new professional README.md for the repository explaning the content of the application',
 'result': " Sure, here's a possible new README.md for the repository:\n\nThis repository contains a web application that demonstrates the use of a factory feature. The application allows users to create, read, update, and delete (CRUD) factory objects.\n\nThe factory feature is implemented using a combination of JavaScript and a backend API. The frontend of the application is built using React, while the backend is built using Node.js and Express.js.\n\nThe application includes the following features:\n\n* A list view that displays all factories in the system\n* A detail view that displays information about a specific factory\n* A form view that allows users to create a new factory\n* A form view that allows users to update an existing factory\n* A delete view that allows users to delete a factory\n\nThe application also includes a search bar that allows users to search for f

In [30]:
#Method3
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# load it into Chroma
vectorstore   = Chroma.from_documents(docs, embedding_function )


# Prompt
prompt = PromptTemplate.from_template('''
Using the provided context of the project pieces, please generate a new code or text based on the requested feature. If you do not know the answer, make a rational decision based on your knowledge.
Context: {docs}
''')

In [31]:
# Chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

We can create a chain with either model by passing in the retrieved docs and a simple prompt.

It formats the prompt template using the input key values provided and passes the formatted string to specified LLM.

In [32]:
chain = {"docs": format_docs} | prompt | model | StrOutputParser()

In [33]:
# Run
feature_request = "Generate a new professional README.md for the repository explaning the content of the application"
docsearch = vectorstore.similarity_search(feature_request)

In [34]:
len(docsearch)

4

In [35]:

docsearch[0]

Document(page_content='Path: /README.md\nContent:\n # Factory Feature.', metadata={'source': '/README.md'})

In [36]:
results=chain.invoke(docsearch)

In [37]:
def display_results(source_string):
  """
  This function takes a string and make  readable way.

  Args:
      source_string: The string containing source code.

  Returns:
      None
  """
  lines = source_string.splitlines(keepends=True)
  for line in lines:
    print(line, end='')

In [38]:
display_results(results)


Path: /src/main/java/com/example/factory/Feature.java
Content:
package com.example.factory;

public class Feature {
    private String name;
    private String description;

    public Feature(String name, String description) {
        this.name = name;
        this.description = description;
    }

    public String getName() {
        return name;
    }

    public String getDescription() {
        return description;
    }
}

Path: /src/main/java/com/example/factory/Factory.java
Content:
package com.example.factory;

import java.util.ArrayList;
import java.util.List;

public class Factory {
    private List<Feature> features = new ArrayList<>();

    public void addFeature(Feature feature) {
        features.add(feature);
    }

    public List<Feature> getFeatures() {
        return features;
    }
}

Path: /src/main/resources/application.properties
Content:
factory.feature.1=My Feature 1
factory.feature.2=My Feature 2
factory.feature.3=My Feature 3

Path: /src/main/resources/feat

In [123]:
def write_source(source_string):
  """
  This function takes a string containing source code and writes each section 
  to a separate file based on the provided path.

  Args:
      source_string: The string containing t source code.

  Returns:
      None
  """
  current_path = None
  for line in source_string.splitlines(keepends=True):
    if line.startswith("Path: "):
      # Extract path and remove leading/trailing whitespace
      current_path = line.strip().split()[1]
      print("current_path", current_path)  
    elif current_path:
      # Write content to the file
      
      print(line)
      #with open(current_path, "w") as f:
      #  f.write(line)



In [124]:

write_source(results)

current_path /src/main/java/com/example/factory/Feature.java
Content:

package com.example.factory;



public class Feature {

    private final String name;

    private final String description;



    public Feature(String name, String description) {

        this.name = name;

        this.description = description;

    }



    public String getName() {

        return name;

    }



    public String getDescription() {

        return description;

    }

}



current_path /src/main/java/com/example/factory/Factory.java
Content:

package com.example.factory;



import java.util.ArrayList;

import java.util.List;



public class Factory {

    private List<Feature> features = new ArrayList<>();



    public void addFeature(Feature feature) {

        features.add(feature);

    }



    public List<Feature> getFeatures() {

        return features;

    }

}



current_path /src/main/java/com/example/factory/App.java
Content:

package com.example.factory;



import java.util.Sc

In [28]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load it into FAISS
vectorstore = FAISS.from_documents(docs, embedding_function).as_retriever(
    search_kwargs={"k": 20}
)

# Prompt
prompt_template = '''
Using the provided context of the project pieces, please generate a new code implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.
Context: {docs}
'''

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

prompt = PromptTemplate.from_template(prompt_template)

# Chain
chain = {
    "docs": format_docs
} | prompt | model | StrOutputParser()

# Indicate the feature prompt that you want to include in the project
feature_request = "Generate a new professional README.md for the repository explaining the content of the application"
docsearch = vectorstore.get_relevant_documents(feature_request)
results = chain.invoke(docsearch)


In [29]:
results

'\nPath: /src/factory.py\nContent:\n from app import search_files\n\ndef create_file(name, content):\n file_path = os.path.join("./", name)\n with open(file_path, "w") as file:\n file.write(content)\n return file_path\n\ndef read_file(name):\n file_path = os.path.join("./", name)\n with open(file_path, "r") as file:\n return file.read()\n\ndef update_file(name, content):\n file_path = os.path.join("./", name)\n with open(file_path, "w") as file:\n file.write(content)\n return file_path\n\ndef delete_file(name):\n os.remove(os.path.join("./", name))\n\nPath: /src/__init__.py\nContent:\n from .app import App\n from .factory import Factory\n\nPath: /src/main.py\nContent:\n from factory import Factory\n\nfactory = Factory()\n\n# Calling the create_file function.\nfile_path = factory.create_file("example.txt", "This is an example file.")\nprint(f"File created at {file_path}")\n\n# Calling the read_file function.\nfile'

## WatsonX Queries

In [24]:
# Indicate the feature prompt that you want to include in the project
feature_request = "Generate a new professional README.md for the repository explaning the content of the application"
db_search = db2.similarity_search(feature_request)
context=db_search[0].page_content
retriever = db2.as_retriever()
# Specify model parameters 
model_type = "meta-llama/llama-2-70b-chat"
max_tokens = 300
min_tokens = 100
decoding = DecodingMethods.GREEDY
temperature = 0.7
# Get the LangChain model
model = get_lang_chain_model(model_type, max_tokens, min_tokens, decoding, temperature)
prompt_template = '''
Using the provided context of the project pieces, please generate a new code implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.
Context: {context}
'''
from langchain_core.prompts import PromptTemplate
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "feature_request"])
chain_type_kwargs = {"prompt": PROMPT}

chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff", 
    retriever=retriever  , 
    chain_type_kwargs=chain_type_kwargs
)
response_text = chain.run(feature_request)


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
  warn_deprecated(
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [25]:
response_text

'\nPath: /src/factory.py\nContent:\n from app import search_files\n\ndef create_file(name, content):\n file_path = os.path.join("./", name)\n with open(file_path, "w") as file:\n file.write(content)\n return file_path\n\ndef read_file(name):\n file_path = os.path.join("./", name)\n with open(file_path, "r") as file:\n return file.read()\n\ndef update_file(name, content):\n file_path = os.path.join("./", name)\n with open(file_path, "w") as file:\n file.write(content)\n return file_path\n\ndef delete_file(name):\n os.remove(os.path.join("./", name))\n\nPath: /src/__init__.py\nContent:\n from .app import App\n from .factory import Factory\n\nPath: /src/main.py\nContent:\n from factory import Factory\n\nfactory = Factory()\n\n# Calling the create_file function.\nfile_path = factory.create_file("example.txt", "This is an example file.")\nprint(f"File created at {file_path}")\n\n# Calling the read_file function.\nfile'

In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load it into Chroma
vectorstore = Chroma.from_documents(docs, embedding_function)

# Prompt
prompt_template = '''
Using the provided context of the project pieces, please generate a new document implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.
Context: {docs}
'''

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

prompt = PromptTemplate.from_template(prompt_template)

# Chain
chain = {
    "docs": format_docs
} | prompt | model | StrOutputParser()

# Indicate the feature prompt that you want to include in the project
feature_request = "Generate a new professional README.md for the repository explaining the content of the application"
docsearch = vectorstore.similarity_search(feature_request)
results = chain.invoke(docsearch)

In [27]:
results

'\nPath: /src/__init__.py\nContent:\n\nPath: /src/factory.py\nContent:\n from . import app\n\ndef run_app():\n app.run()\n\nif __name__ == "__main__":\n run_app()\n\nPath: /src/main.py\nContent:\n from . import app\n\ndef main():\n app.run()\n\nif __name__ == "__main__":\n main()\n\nPath: /src/app.py\nContent:\n import os\n\ndef search_files(directory):\n file_list = []\n for root, dirs, files in os.walk(directory):\n for file in files:\n file_list.append(os.path.join(root, file))\n return file_list\n\ndef save_to_txt(file_list):\n with open("files.txt", "w") as file:\n for file_name in file_list:\n file.write(file_name + "\\n")\n print("File names saved to files.txt")\n\nif __name__ == "__main__":\n directory = "./current_project"\n file_list = search_files(directory)\n save_to_txt(file_list)\n\nPath: /src/factory.py\nContent:\n from . import app\n\n'

Method 2 - Collection in Chroma DB

In [214]:
## Generation of the document Text
# Extract the text content directly for Chroma's preferred format
documents = []
paths=[]
for index, row in df.iterrows():
    path = row["Path"].replace("./project_old", "").replace("\\", "/")
    text = "Path: " + path + "\nContent:\n " + row["Content"]
    paths.append(path[1:])
    #print(text)
    documents.append(text)
print(documents)    

['Path: /README.md\nContent:\n # Factory Feature.\n\n', 'Path: /src/app.py\nContent:\n import os\n\ndef search_files(directory):\n    file_list = []\n    for root, dirs, files in os.walk(directory):\n        for file in files:\n            file_list.append(os.path.join(root, file))\n    return file_list\n\ndef save_to_txt(file_list):\n    with open("files.txt", "w") as file:\n        for file_name in file_list:\n            file.write(file_name + "\\n")\n    print("File names saved to files.txt")\n\nif __name__ == "__main__":\n    directory = "./current_project"\n    file_list = search_files(directory)\n    save_to_txt(file_list)\n    \n']


In [215]:
paths

['README.md', 'src/app.py']

In [216]:
ids=[str(i) for i in range(len(documents))]

In [217]:
import chromadb
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")


In [218]:
collection.add(ids=ids, documents=documents)


Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 0
Add of existing embedding ID: 1


In [192]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [203]:
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embedding_function,
)
print("There are", langchain_chroma._collection.count(), "in the collection")

There are 2 in the collection


In [204]:
langchain_chroma

<langchain_community.vectorstores.chroma.Chroma at 0x1eb9d08e6b0>

In [224]:
query = "Factory Feature"
docs_search = langchain_chroma.similarity_search(query)
# Access document content:
for doc in docs_search:
    print(doc.metadata)  
    print(doc.page_content)
     


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{}
Path: /README.md
Content:
 # Factory Feature.


{}
Path: /src/app.py
Content:
 import os

def search_files(directory):
    file_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_list.append(os.path.join(root, file))
    return file_list

def save_to_txt(file_list):
    with open("files.txt", "w") as file:
        for file_name in file_list:
            file.write(file_name + "\n")
    print("File names saved to files.txt")

if __name__ == "__main__":
    directory = "./current_project"
    file_list = search_files(directory)
    save_to_txt(file_list)
    



In [225]:
query = "README.md"
filter_criteria = {"id": "0"}  # Search for documents with ID "0"
docs_search = langchain_chroma.similarity_search(query,filter=filter_criteria)
# Access document content:
for doc in docs_search:
    print(doc.metadata)  
    print(doc.page_content)
     

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [195]:
chroma_client = chromadb.Client()
collectionname="collection_name"
try:
    print("We create a Colletion:",collectionname)
    collection1 = chroma_client.create_collection(
        name=collectionname)  
except:
    print("We load a Colletion:",collectionname)
    collection1 = chroma_client.get_collection(name=collectionname)
    

We create a Colletion: collection_name


In [198]:
collection1.add(ids=ids, documents=documents)

In [None]:
def create_embeddings(text):
    from langchain_community.embeddings import HuggingFaceEmbeddings
    import numpy as np  # Optional
    # Choose an appropriate model:
    model_name = "sentence-transformers/all-mpnet-base-v2"  # Replace with your desired model if needed
    # Set device (CPU or GPU) based on your hardware and performance requirements:
    model_kwargs = {'device': 'cpu'}  # Change to 'cuda' for GPU usage (if available)
    # Encoding options (normalization is often recommended):
    encode_kwargs = {'normalize_embeddings': True}  # Experiment with normalization
    # Initialize the embedding model:
    hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)    
    embedding = hf.embed_query(text)  # Use embed_query for single text
    return embedding

In [5]:
# Indicate the feature prompt that you want to include in the project
feature_request = "Generate a new professional README.md for the repository explaning the content of the application"
embedding_feature = create_embeddings(feature_request)

In [6]:
# Specify model parameters 
model_type = "meta-llama/llama-2-70b-chat"
max_tokens = 300
min_tokens = 100
decoding = DecodingMethods.GREEDY
temperature = 0.7
# Get the LangChain model
model = get_lang_chain_model(model_type, max_tokens, min_tokens, decoding, temperature)
context = collection.query(query_texts=feature_request, n_results=2)
prompt_template = '''
Using the provided context of the project pieces, please generate a new code implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.
Context: {context}
Feature Request: {feature_request}
'''
from langchain_core.prompts import PromptTemplate
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "feature_request"])
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff", 
    retriever= , 
    chain_type_kwargs=chain_type_kwargs
)
response_text = chain.run(feature_request)

In [18]:
chain_type_kwargs

{'prompt': PromptTemplate(input_variables=['context', 'feature_request'], template='\nUsing the provided context of the project pieces, please generate a new code implementing the requested feature. If you do not know the answer, make a rational decision based on your knowledge.\nContext: {context}\nFeature Request: {feature_request}\n')}