## Step 1: Creating the project DataFrame
The example project is located in the `../src` folder.

In [10]:
from IPython.display import clear_output

In [11]:
import os
import sys

# Make the parent directory importable so the local `utils` package resolves.
# Guarded so that re-running this cell does not append the same entry again.
project_root = os.path.abspath('../')
if project_root not in sys.path:
    sys.path.append(project_root)

# Now the helper from the local package can be imported
from utils.extractor import display_and_store_directory_content

# Run the extraction helper on the example project.
# NOTE(review): presumably this writes ./extraction/src.pkl, which the next
# cell loads — confirm against utils/extractor.py.
display_and_store_directory_content('../src')
clear_output()  # Discard whatever the helper printed so the cell stays compact

In [12]:
import pandas as pd
# Location of the pickled DataFrame to load.
# NOTE(review): presumably produced by the extraction step in the previous cell — confirm.
file_path = './extraction/src.pkl'
# Load the DataFrame from the pickle file
df = pd.read_pickle(file_path)
# Preview the first rows
df.head()

Unnamed: 0,path,content,readable,extension
0,../src/analysis,,,
1,../src/generation,,,
2,../src/models,,,
3,../src/utils,,,
4,../src/vector_database,,,


In [13]:
# Keep only the rows whose 'readable' value differs from the string 'N/A'
df_filtered = df[df['readable'] != 'N/A']

In [14]:
# List the column names of the filtered DataFrame
df_filtered.columns


Index(['path', 'content', 'readable', 'extension'], dtype='object')

In [15]:
# Preview the first rows that passed the 'readable' filter
df_filtered.head()

Unnamed: 0,path,content,readable,extension
6,../src/__init__.py,,YES,py
8,../src/analysis/dependency_resolver.py,import os\n\ndef resolve_dependencies(project_...,YES,py
9,../src/analysis/feature_mapper.py,"def map_features_to_components(project_data, f...",YES,py
10,../src/analysis/project_parser.py,import os\n\ndef parse_project(project_path):\...,YES,py
11,../src/analysis/__init__.py,,YES,py


## Testing the Watsonx LLM

In [16]:
# For reading credentials from the .env file
import getpass
import os
from dotenv import load_dotenv
from langchain_ibm import WatsonxLLM

# Load API credentials from .env file
load_dotenv()

# Fetch credentials or prompt the user if missing
WATSONX_APIKEY = os.getenv("WATSONX_APIKEY")
PROJECT_ID = os.getenv("PROJECT_ID")

if not WATSONX_APIKEY:
    # getpass keeps the typed key out of the cell output, which would otherwise
    # be stored in the notebook file.
    WATSONX_APIKEY = getpass.getpass("WML API key not found in .env. Please enter your WML API key: ").strip()
    print("Reminder: Save your WML API key to the .env file for future use.")
if not PROJECT_ID:
    PROJECT_ID = input("Project ID not found in .env. Please enter your project ID: ").strip()
    print("Reminder: Save your Project ID to the .env file for future use.")

# Watsonx credentials
credentials = {
    "url": "https://eu-gb.ml.cloud.ibm.com",  # Update the URL as required
    "apikey": WATSONX_APIKEY,
    "project_id": PROJECT_ID,
}

# Example parameters for WatsonxLLM
# NOTE(review): "temperature" is normally a sampling setting; with
# decoding_method "greedy" it presumably has no effect — confirm.
parameters = {
    "max_new_tokens": 100,
    "min_new_tokens": 10,
    "decoding_method": "greedy",
    "temperature": 0.7,
}

# Initialize WatsonxLLM.
# The API key is passed explicitly: a key typed at the prompt above exists only
# in `credentials`, not in the environment, so it would otherwise be ignored.
try:
    watsonx_llm = WatsonxLLM(
        model_id="ibm/granite-13b-instruct-v2",
        url=credentials["url"],
        apikey=credentials["apikey"],
        project_id=credentials["project_id"],
        params=parameters,
    )
except Exception as e:
    # Chain the original exception so its traceback stays available
    raise RuntimeError(f"Error initializing WatsonxLLM: {e}") from e

# Function to invoke the WatsonxLLM model
def invoke_model(prompt):
    """
    Invokes the WatsonxLLM model with the given prompt.

    Uses the module-level ``watsonx_llm`` instance.

    :param prompt: The input prompt for the model.
    :return: The response from the model.
    :raises RuntimeError: If the invocation fails for any reason; the original
        exception is attached as ``__cause__``.
    """
    try:
        return watsonx_llm.invoke(prompt)
    except Exception as e:
        # Keep the underlying error chained so its traceback is not lost
        raise RuntimeError(f"Error during model invocation: {e}") from e

# Smoke test: send one fixed question to the model and show the outcome
if __name__ == "__main__":
    prompt = "Who is man's best friend?"
    try:
        result = invoke_model(prompt)
    except RuntimeError as err:
        # invoke_model wraps every failure in RuntimeError; show its message only
        print(err)
    else:
        print(f"Response: {result}")


Response: Man's best friend is his dog. Dogs are man's best friend because they are always there for you, they never judge you, and they always want to play. Dogs are also very smart, and they can learn tricks and commands. Dogs are also very good at detecting danger, and they can help keep you safe. Dogs are man's best friend because they are always there for you. 


## Step 2: Enhancing the metadata of each project element
We create a column with a short description of each element's content, generated with the LLM.

In [17]:
import getpass
import os

import pandas as pd
from dotenv import load_dotenv
from langchain_ibm import WatsonxLLM

# Load API credentials from .env file
load_dotenv()

# Fetch credentials or prompt the user if missing
WATSONX_APIKEY = os.getenv("WATSONX_APIKEY")
PROJECT_ID = os.getenv("PROJECT_ID")

if not WATSONX_APIKEY:
    # getpass keeps the typed key out of the cell output, which would otherwise
    # be stored in the notebook file.
    WATSONX_APIKEY = getpass.getpass("WML API key not found in .env. Please enter your WML API key: ").strip()
    print("Reminder: Save your WML API key to the .env file for future use.")
if not PROJECT_ID:
    PROJECT_ID = input("Project ID not found in .env. Please enter your project ID: ").strip()
    print("Reminder: Save your Project ID to the .env file for future use.")

# Watsonx credentials
credentials = {
    "url": "https://eu-gb.ml.cloud.ibm.com",  # Update the URL as required
    "apikey": WATSONX_APIKEY,
    "project_id": PROJECT_ID,
}

# Generation parameters for WatsonxLLM
# NOTE(review): "temperature" is normally a sampling setting; with
# decoding_method "greedy" it presumably has no effect — confirm.
parameters = {
    "max_new_tokens": 100,
    "min_new_tokens": 10,
    "decoding_method": "greedy",
    "temperature": 0.7,
}

# Initialize WatsonxLLM.
# The API key is passed explicitly: a key typed at the prompt above exists only
# in `credentials`, not in the environment, so it would otherwise be ignored.
try:
    watsonx_llm = WatsonxLLM(
        model_id="ibm/granite-13b-instruct-v2",
        url=credentials["url"],
        apikey=credentials["apikey"],
        project_id=credentials["project_id"],
        params=parameters,
    )
except Exception as e:
    # Chain the original exception so its traceback stays available
    raise RuntimeError(f"Error initializing WatsonxLLM: {e}") from e

# Function to invoke the WatsonxLLM model
def invoke_model(prompt):
    """
    Invokes the WatsonxLLM model with the given prompt.

    Uses the module-level ``watsonx_llm`` instance.

    :param prompt: The input prompt for the model.
    :return: The response from the model.
    :raises RuntimeError: If the invocation fails for any reason; the original
        exception is attached as ``__cause__``.
    """
    try:
        return watsonx_llm.invoke(prompt)
    except Exception as e:
        # Keep the underlying error chained so its traceback is not lost
        raise RuntimeError(f"Error during model invocation: {e}") from e

# Function to generate a description for each text element
def generate_description(df):
    """
    Builds an LLM-generated description for every value of the 'text' column.

    Each 'text' value is embedded in a fixed prompt and sent to
    ``invoke_model``. When that call raises ``RuntimeError`` the error is
    printed and the placeholder "Error generating description" is stored for
    that row, so a single failing row does not abort the whole run.

    :param df: The DataFrame containing the 'text' column. It is not modified.
    :return: A copy of ``df`` with an added 'description' column.
    :raises KeyError: If ``df`` has no 'text' column.
    """
    descriptions = []

    # Only the 'text' column is needed, so iterate over it directly
    for text in df['text']:
        prompt = f"Provide a detailed summary description for the following content: {text}"
        try:
            descriptions.append(invoke_model(prompt))
        except RuntimeError as e:
            print(f"Error generating description for row: {e}")
            descriptions.append("Error generating description")

    # Attach the results to a copy so the caller's DataFrame is left untouched,
    # matching the documented "new DataFrame" contract.
    enhanced = df.copy()
    enhanced['description'] = descriptions
    return enhanced

# Main logic: load the extracted frame, derive 'text', describe it, persist it
if __name__ == "__main__":
    def create_text_column(row):
        """Flatten one row's path, content, readable and extension into a single string."""
        return (
            f"path: {row['path']} content: {row['content']} "
            f"readable: {row['readable']} extension: {row['extension']}"
        )

    # Read the pickled DataFrame and add the combined 'text' column
    file_path = './extraction/src.pkl'
    df = pd.read_pickle(file_path)
    df['text'] = df.apply(create_text_column, axis=1)

    # Describe every row and write the result next to the notebook
    try:
        enhanced_df = generate_description(df)
        enhanced_df.to_pickle("enhanced_dataframe.pkl")
    except Exception as e:
        print(f"Error: {e}")
    else:
        print("Descriptions generated and saved successfully.")


Descriptions generated and saved successfully.


In [18]:
enhanced_df.head()

Unnamed: 0,path,content,readable,extension,text,description
0,../src/analysis,,,,path: ../src/analysis content: readable: N/A ...,The filepath ../src/analysis does not exist or...
1,../src/generation,,,,path: ../src/generation content: readable: N/...,The path ../src/generation does not contain an...
2,../src/models,,,,path: ../src/models content: readable: N/A ex...,models: N/A extension: N/A
3,../src/utils,,,,path: ../src/utils content: readable: N/A ext...,license: N/Apath: ../../src/utils
4,../src/vector_database,,,,path: ../src/vector_database content: readabl...,The path ../src/vector_database is readable N/...


## Step 3: Creating the vector database from the DataFrame

In [19]:
import chromadb
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
import torch
import os

# Build the value stored in the 'text' column for one row
def create_text_column(row):
    """Return 'path: … content: … readable: … extension: …' built from the row's fields."""
    field_names = ("path", "content", "readable", "extension")
    return " ".join(f"{name}: {row[name]}" for name in field_names)

# Define file path and collection name
file_path = "./extraction/src.pkl"
collection_name = "my_vector_collection"

# Load the dataframe and derive the columns used for indexing
df = pd.read_pickle(file_path)
df['text'] = df.apply(create_text_column, axis=1)
df.insert(0, "ID", df.index.astype(str))  # Add unique ID for each row

# PersistentClient keeps the database on disk under ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Start from a clean collection: drop it first if it already exists.
# NOTE(review): list_collections() has returned Collection objects in some
# chromadb releases and plain name strings in others; getattr copes with both.
existing_collection_names = [
    getattr(col, "name", col) for col in chroma_client.list_collections()
]
if collection_name in existing_collection_names:
    chroma_client.delete_collection(name=collection_name)

# Create a new collection
collection = chroma_client.create_collection(name=collection_name)

def create_embeddings_and_store(df, page_content_column, collection):
    """
    Generates embeddings from a dataframe and stores them in a Chroma collection.

    The texts come from ``page_content_column`` and the record ids from the
    "ID" column. Each record also gets a metadata dict holding its positional
    index. When the dataframe has no rows, nothing is embedded or stored.

    Args:
        df (pandas.DataFrame): The dataframe containing the text data and an "ID" column.
        page_content_column (str): The name of the column containing the text content.
        collection (chromadb.Collection): The Chroma collection to store the embeddings.

    Raises:
        KeyError: If ``page_content_column`` or "ID" is not a column of ``df``.
    """
    # Read the inputs first so a missing column fails before the embedding
    # model is loaded.
    documents = df[page_content_column].tolist()
    ids = df["ID"].tolist()

    # Nothing to embed: skip loading the model and do not call collection.add
    # with empty lists.
    if not documents:
        print("No documents to add; the collection was left unchanged.")
        return

    # Configure embedding model
    model_name = "sentence-transformers/all-mpnet-base-v2"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': True}

    # Initialize the embedding model
    hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

    # Generate embeddings for all documents in one batch
    embeddings = hf.embed_documents(documents)

    # Add documents, embeddings, and IDs to the collection
    collection.add(
        documents=documents,
        ids=ids,
        embeddings=embeddings,
        metadatas=[{"index": i} for i in range(len(documents))]  # Example metadata
    )

    print(f"Successfully added {len(documents)} documents to the collection.")

# Embed the 'text' column of df and store the records in the Chroma collection
create_embeddings_and_store(df, "text", collection)

# Verify stored data: report how many records the collection holds now
print(f"Collection '{collection_name}' now contains {collection.count()} documents.")


  from .autonotebook import tqdm as notebook_tqdm


Successfully added 48 documents to the collection.
Collection 'my_vector_collection' now contains 48 documents.


In [8]:
def display_first_five_documents(collection, limit=5):
    """
    Reads the first documents from the given ChromaDB collection and displays them.

    Args:
        collection (chromadb.Collection): The ChromaDB collection to read from.
        limit (int): Maximum number of documents to fetch and print. Defaults to 5.

    Returns:
        None
    """
    # Request only as many records as will be shown instead of the whole collection
    results = collection.get(limit=limit)

    # A field may be missing or None in the result; treat both as empty
    documents = results.get("documents") or []
    ids = results.get("ids") or []
    metadatas = results.get("metadatas") or []

    shown = min(len(documents), limit)
    print(f"Displaying the first {shown} documents:")
    for i in range(shown):
        print(f"Document ID: {ids[i]}")
        print(f"Content: {documents[i]}")
        if metadatas:
            print(f"Metadata: {metadatas[i]}")
        print("-" * 80)

# Usage example: print the first records of the collection bound to the
# notebook-level name `collection`
display_first_five_documents(collection)


Displaying the first 5 documents:
Document ID: 0
Content: path: ../src/analysis content:  readable: N/A extension: N/A
Metadata: {'index': 0}
--------------------------------------------------------------------------------
Document ID: 1
Content: path: ../src/generation content:  readable: N/A extension: N/A
Metadata: {'index': 1}
--------------------------------------------------------------------------------
Document ID: 2
Content: path: ../src/models content:  readable: N/A extension: N/A
Metadata: {'index': 2}
--------------------------------------------------------------------------------
Document ID: 3
Content: path: ../src/utils content:  readable: N/A extension: N/A
Metadata: {'index': 3}
--------------------------------------------------------------------------------
Document ID: 4
Content: path: ../src/vector_database content:  readable: N/A extension: N/A
Metadata: {'index': 4}
--------------------------------------------------------------------------------


## Step 4: LLM inference using the vector database

In [20]:
import os
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ibm import WatsonxLLM
from chromadb import PersistentClient
import torch

# Read the Watsonx settings from the environment (the .env file is loaded first)
load_dotenv()
WATSONX_APIKEY, PROJECT_ID = os.getenv("WATSONX_APIKEY"), os.getenv("PROJECT_ID")

# Both values are mandatory: stop here when either one is absent or empty
if not (WATSONX_APIKEY and PROJECT_ID):
    raise ValueError("API key or Project ID is missing. Please check your .env file.")

def get_lang_chain_model(model_type, max_tokens, min_tokens, decoding_method, temperature,
                         url="https://eu-gb.ml.cloud.ibm.com"):
    """
    Initializes and returns a WatsonxLLM instance with the specified parameters.

    The project id is taken from the module-level ``PROJECT_ID``.
    NOTE(review): no API key is passed here; presumably WatsonxLLM picks it up
    from the WATSONX_APIKEY environment variable — confirm.

    Args:
        model_type (str): Model identifier, passed as ``model_id``.
        max_tokens (int): Value for the "max_new_tokens" generation parameter.
        min_tokens (int): Value for the "min_new_tokens" generation parameter.
        decoding_method (str): Value for the "decoding_method" generation parameter.
        temperature (float): Value for the "temperature" generation parameter.
        url (str): Watsonx service endpoint. Defaults to the eu-gb endpoint
            that was previously hard-coded.

    Returns:
        WatsonxLLM: The configured model wrapper.
    """
    return WatsonxLLM(
        model_id=model_type,
        url=url,
        project_id=PROJECT_ID,
        params={
            "max_new_tokens": max_tokens,
            "min_new_tokens": min_tokens,
            "decoding_method": decoding_method,
            "temperature": temperature,
        },
    )

def answer_questions_from_dataframe(question, collection_name, persist_directory="./chroma_db"):
    """
    Answers a question using a LangChain model and retrieves relevant documents from a Chroma collection.

    Builds a RetrievalQA chain ("stuff" chain type) over the named collection,
    retrieving the 5 closest documents, then runs the question through it and
    prints the result.

    Args:
        question (str): The question to answer.
        collection_name (str): Name of an existing Chroma collection.
        persist_directory (str): Directory of the on-disk Chroma database.

    Returns:
        The value returned by ``chain.invoke``. In the recorded run this is a
        dict with 'question' and 'result' keys.

    Raises:
        ValueError: If the collection does not exist in ``persist_directory``.
    """
    # Specify model parameters
    # NOTE(review): "temperature" is normally a sampling setting; with
    # decoding_method "greedy" it presumably has no effect — confirm.
    model_type = "meta-llama/llama-3-1-70b-instruct"
    max_tokens = 300
    min_tokens = 100
    decoding_method = "greedy"
    temperature = 0.7

    # Initialize the WatsonxLLM model
    model = get_lang_chain_model(model_type, max_tokens, min_tokens, decoding_method, temperature)

    # Use the same embedding model as in create_embeddings_and_store for consistency
    model_name = "sentence-transformers/all-mpnet-base-v2"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': True}

    # Initialize the embedding model
    hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

    # Initialize Chroma Persistent Client
    chroma_client = PersistentClient(path=persist_directory)

    # Ensure the collection exists.
    # NOTE(review): list_collections() has returned Collection objects in some
    # chromadb releases and plain name strings in others; getattr copes with both.
    existing_names = [getattr(col, "name", col) for col in chroma_client.list_collections()]
    if collection_name not in existing_names:
        raise ValueError(f"Collection '{collection_name}' does not exist. Make sure it is created.")

    # Use Chroma vectorstore with the given collection
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=hf,
        client=chroma_client,
    )

    # Create a retriever from the vectorstore
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    # Build the RetrievalQA chain
    chain = RetrievalQA.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=retriever,
        input_key="question"
    )

    # Run the chain with the question
    response_text = chain.invoke({"question": question})

    print("--------------------------------- Generated response -----------------------------------")
    print(response_text)
    print("*********************************************************************************************")

    return response_text

# Example Usage
question = "How to integrate the specified features into the project components?"

# Ensure consistent settings: same on-disk path and collection name that the
# vector-database cell used when it created the collection
persist_directory = "./chroma_db"
collection_name = "my_vector_collection"

response = answer_questions_from_dataframe(question, collection_name, persist_directory)
# The function above already prints the response, so this shows it a second time
print(response)


--------------------------------- Generated response -----------------------------------
{'question': 'How to integrate the specified features into the project components?', 'result': ' You can use the `integrate_features` function from the `feature_integration.py` module, passing in the list of project components and the feature instructions as arguments. For example: `integrate_features(project_components=["component1.py", "component2.py"], feature_instructions="import feature_module\\nfeature_module.do_something()")`. This will append the feature instructions to the end of each project component file. \n\nNote: The `integrate_features` function does not return any value, it modifies the project components in-place. \n\nAlso, you can use the `map_features_to_components` function from the `feature_mapper.py` module to map feature requests to specific project components before integrating the features. \n\nFor example: `component_mapping = map_features_to_components(project_data=[{"pat