File directory

In [None]:
"""
drwxrwxr-x 8 staging_server staging_server 4096 Nov  7 11:23 .
drwxrwxr-x 8 staging_server staging_server 4096 Nov  5 16:38 ..
drwxrwxr-x 8 staging_server staging_server 4096 Nov  6 14:38 .git
-rw-rw-r-- 1 staging_server staging_server  490 Nov  6 09:28 .gitignore
drwxrwxr-x 2 staging_server staging_server 4096 Nov  7 11:23 __pycache__
drwxrwxr-x 3 staging_server staging_server 4096 Nov  5 16:26 bart-large-mnli
drwxrwxr-x 2 staging_server staging_server 4096 Nov  7 12:05 data
drwxrwxr-x 3 staging_server staging_server 4096 Nov  5 16:27 dpr-ctx_encoder-single-nq-base
-rw-rw-r-- 1 staging_server staging_server 2329 Nov  6 10:37 feature_extraction.py
-rw-rw-r-- 1 staging_server staging_server 1659 Nov  6 14:38 get_schema.py
-rw-rw-r-- 1 staging_server staging_server 5823 Nov  6 11:35 main.py
-rw-rw-r-- 1 staging_server staging_server  310 Nov  5 16:20 metadata.json
-rw-rw-r-- 1 staging_server staging_server 1133 Nov  6 12:16 model.py
drwxrwxr-x 3 staging_server staging_server 4096 Nov  5 16:28 nli-deberta-v3-large
-rw-rw-r-- 1 staging_server staging_server 2461 Nov  5 16:28 requirements.txt
"""

Requirements.txt

In [None]:
"""
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.77
nvidia-nvtx-cu12==12.1.105
onnxruntime==1.19.2
openai==1.51.1
orjson==3.10.7
packaging==24.1
pandas==1.5.3
peft==0.5.0
pillow==10.4.0
propcache==0.2.0
protobuf==5.28.2
psutil==6.0.0
pyarrow==18.0.0
pyarrow-hotfix==0.6
pydantic==2.9.2
pydantic-settings==2.5.2
pydantic_core==2.23.4
pymilvus==2.4.7
pymongo==4.9.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2024.2
PyYAML==6.0.2
regex==2023.12.25
requests==2.32.3
requests-toolbelt==1.0.0
rouge_score==0.1.2
safetensors==0.4.5
scikit-learn==1.5.2
scipy==1.14.1
sentence-transformers==3.2.0
sentencepiece==0.2.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.35
sympy==1.13.3
tenacity==8.5.0
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.4.1
tqdm==4.66.5
transformers==4.44.2
trec-car-tools==2.6
triton==3.0.0
typing-inspect==0.9.0
typing_extensions==4.12.2
ujson==5.10.0
unlzw3==0.2.2
urllib3==2.2.3
warc3-wet==0.2.5
warc3-wet-clueweb09==0.2.5
xxhash==3.5.0
yarl==1.14.0
zlib-state==0.1.9
"""

model.py

In [None]:
from openai import OpenAI

# OpenAI-compatible client created once at import time and shared by model_response().
# It targets a server at localhost:8000 and sends a literal placeholder API key.
# NOTE(review): presumably a locally hosted inference server that does not validate the key — confirm.
client = OpenAI(base_url = "http://localhost:8000/v1", api_key = "token")

def model_response(query, mongo_data):
    """Answer ``query`` with the chat model, using only ``mongo_data`` as context.

    Args:
        query: The user's question; interpolated into the prompt.
        mongo_data: Context retrieved from MongoDB; interpolated into the prompt.

    Returns:
        The content of the first completion choice, or a fixed
        "<start> ... <end>" message when there is no usable context.
    """
    # Joining empty chunks yields strings such as "\n": truthy, but they carry
    # no information, so treat whitespace-only context like missing context.
    if not mongo_data or (isinstance(mongo_data, str) and not mongo_data.strip()):
        return "<start> The data is not present in the database. <end>"

    prompt = f"""    
            Follow the below instructions -
            1) Answer the question in a natural human-like manner. 
            2) Use only the context to answer the question.
            3) The question : {query} 
            4) The context : {mongo_data} 
            5) Respond clearly summarizing the answer in less than 100 words.
            6) Add <start> before the actual response and <end> after the response.
            """

    completion = client.chat.completions.create(
        model="Meta-Llama-3.1-8B-Instruct-quantized.w4a16/",
        messages=[
            {"role": "system", "content": "You are a chatbot who answers the question, do not suggest responses."},
            {"role": "user", "content": prompt},
        ],
    )
    # Only the first choice is used.
    return completion.choices[0].message.content

metadata.json

In [None]:
{
        "acceptance_criteria":"acceptance criteria, requirements, specific conditions, clarity", 
        "assumptions":"assumptions, dependencies", 
        "ambiguities":"ambiguities, vague, missing details", 
        "mind_maps":"visualizations, mind map, hierarchy", 
        "test_scenarios":"test scenarios, edge cases, test coverage"
}

main.py

In [None]:
from pymilvus import MilvusClient, model
from pymongo import MongoClient
from tqdm import tqdm
import re, sys
from bson.objectid import ObjectId
from FlagEmbedding import FlagReranker
from rouge_score import rouge_scorer

from feature_extraction import main_feature
from model import model_response


# Initialize MongoDB, Milvus Vector DB, Reranker
# These objects are created once at import time and shared by the functions below.
mongodb_uri = "mongodb://localhost:27018"
# Embedding function used for both documents (encode_documents) and queries (encode_queries).
# NOTE(review): user_stories_db creates the collection with dimension=768 — confirm this matches the embedding size.
embedding_fn = model.DefaultEmbeddingFunction()
# NOTE(review): a local file path is passed, presumably selecting file-backed (Milvus Lite) storage — confirm.
milvus_client = MilvusClient("data/milvus_demo.db")
# NOTE(review): use_fp16=True is requested together with devices=['cpu'] — confirm half precision is intended on CPU.
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', query_max_length=256, passage_max_length=512, use_fp16=True, devices=['cpu'])


# Create the vector DB
def user_stories_db(database_name, collection_name, milvus_collection_name):
    """Rebuild a Milvus collection from the user stories stored in MongoDB.

    Every MongoDB document contributes one text (``story`` + ``refined``),
    which is embedded and inserted with the document's ``_id`` as ``data_id``.

    Args:
        database_name: MongoDB database to read from.
        collection_name: MongoDB collection holding the user stories.
        milvus_collection_name: Milvus collection to drop and recreate.

    Returns:
        None. Existing vectors in the Milvus collection are discarded.
    """
    complete_data = []

    # Read MongoDB first, so an unreachable database does not leave the Milvus
    # collection dropped. The context manager closes the connection.
    with MongoClient(mongodb_uri) as mongo_client:
        collection = mongo_client[database_name][collection_name]

        # Iterate over the MongoDB data
        for doc in tqdm(collection.find()):
            try:
                # A missing or empty "refined" field falls back to a single space.
                text = doc["story"] + (doc.get("refined") or " ")
            except (KeyError, TypeError) as exc:
                # Skip malformed documents, but say which one and why.
                print("Error with schema", doc.get("_id"), repr(exc))
                continue
            # One record per document keeps text and id aligned.
            complete_data.append({"text": text, "id": doc["_id"]})

    milvus_client.drop_collection(collection_name=milvus_collection_name)
    milvus_client.create_collection(collection_name=milvus_collection_name, dimension=768)

    if not complete_data:
        # Nothing to embed; leave the freshly created collection empty.
        print("No user stories found to index")
        return

    # Convert the data to vectors
    vectors = embedding_fn.encode_documents([item["text"] for item in complete_data])
    # NOTE(review): data_id is stored as returned by MongoDB (presumably an ObjectId) — confirm Milvus accepts it.
    data = [
        {"id": idx, "data_id": item["id"], "vector": vector, "text": item["text"]}
        for idx, (item, vector) in enumerate(zip(complete_data, vectors))
    ]

    # Store the vector in Milvus
    milvus_client.insert(collection_name=milvus_collection_name, data=data)

def retrieve_user_stories(query, milvus_collection_name, threshold):
    """Return the MongoDB ids of the user stories that match ``query``.

    Candidates come from a Milvus vector search and are kept when their
    distance exceeds ``threshold``. A candidate survives when its reranker
    score or its ROUGE-L recall against the query exceeds the same threshold.

    Args:
        query: Free-text user query.
        milvus_collection_name: Milvus collection to search.
        threshold: Cut-off applied to the vector score, the reranker score and
            the ROUGE-L recall.

    Returns:
        List of ``data_id`` values in search-result order (possibly empty).
    """
    query_vectors = embedding_fn.encode_queries([query])
    results = milvus_client.search(
        collection_name=milvus_collection_name,
        params={"metric_type": "COSINE"},
        data=query_vectors,
        output_fields=["data_id", "text", "id"],
    )
    # One query vector was sent, so only the first result list is relevant.
    hits = results[0] if len(results) > 0 else []

    # Filter data based on Vector similarity score threshold
    candidates = [
        (hit["entity"]["text"], hit["entity"]["data_id"])
        for hit in hits
        if hit["distance"] > threshold
    ]
    if not candidates:
        # Do not call the reranker with an empty batch.
        print("Number of matching user story", 0)
        return []

    # Obtain reranker score
    rank_scores = reranker.compute_score([[query, text] for text, _ in candidates], normalize=True)
    if not hasattr(rank_scores, "__len__"):
        # Defensive: a single pair may come back as a bare number.
        rank_scores = [rank_scores]

    # Obtain rouge score (only ROUGE-L recall is used for filtering)
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_scores = [
        scorer.score(query, text.lower().replace("user", "").replace("story", ""))
        for text, _ in candidates
    ]

    # Filter based on reranker and rouge score
    filtered_mongo_ids = [
        mongo_id
        for (_, mongo_id), rank, rouge in zip(candidates, rank_scores, rouge_scores)
        if rank > threshold or rouge["rougeL"].recall > threshold
    ]
    print("Number of matching user story", len(filtered_mongo_ids))
    return filtered_mongo_ids


def metadata_retrieval(database_name, collection_name, query, mongo_user_story_ids):
    """Fetch the fields relevant to ``query`` for the given stories as text chunks.

    Args:
        database_name: MongoDB database to read from.
        collection_name: MongoDB collection holding the user stories.
        query: User query; passed to ``main_feature`` to pick the fields.
        mongo_user_story_ids: ``_id`` values of the stories to fetch.

    Returns:
        List with one tab-separated text chunk per matching document.
    """
    main_app_keys = main_feature(query) #closest_features(query)

    # Build the basic MongoQuery Dynamically
    projection2 = _build_projection(main_app_keys)
    print("The mongoquery : ", projection2)

    if not mongo_user_story_ids:
        # An empty $in list matches nothing; skip the round trip.
        return []

    # Use the mongo queries on the identified user story IDs
    with MongoClient(mongodb_uri) as mongo_client:
        collection = mongo_client[database_name][collection_name]
        stories = collection.find({'_id': {'$in': mongo_user_story_ids}}, projection2)
        # Consume the cursor before the connection is closed.
        return [_format_story(story) for story in stories]


def _build_projection(main_app_keys):
    """Build the MongoDB projection for the selected fields and sub-fields."""
    projection = {}
    for field, subfields in main_app_keys.items():
        if subfields:
            for subfield in subfields:
                projection[f"{field}.{subfield}"] = 1
        else:
            projection[f"{field}"] = 1
    # Always request the story texts. An empty projection would make MongoDB
    # return whole documents.
    projection["story"] = 1
    projection["refined"] = 1
    return projection


def _format_story(doc):
    """Flatten one projected story document into a single text chunk."""
    temp_data = ""
    for key, raw in doc.items():
        if key == "_id" or key == "status":
            continue
        if key == "story":
            temp_data += f"User story - {raw} \t"
            continue  # already emitted; do not repeat it as a generic field
        if key == "refined":
            temp_data += f"Refined User story - {raw} \t"
            continue
        # Other fields may be nested structures: stringify and strip punctuation.
        value = re.sub(r'[^a-zA-Z0-9\s]', "", str(raw))
        temp_data += f"{key.upper()} - {value}. \t"
    return temp_data


def main():
    """Run the interactive question-answering loop over the user stories.

    Each round reads a query, retrieves matching story ids, fetches the
    relevant fields from MongoDB and prints the model's answer. The loop
    continues while the user answers "T" to the continue prompt.
    """
    database_name = "OrgqrS1HZ"
    collection_name = "userStories"
    milvus_collection_name = "temp"
    similarity_threshold = 0.2
    max_stories = 5

    # Get the user story IDs from Mongo DB
    # (uncomment to rebuild the Milvus collection)
    #user_stories_db(database_name, collection_name, milvus_collection_name)

    check = "T"
    while check == "T":
        query = input("Enter query : ")
        matched_ids = retrieve_user_stories(query, milvus_collection_name, similarity_threshold)
        # Take first 5 unique IDs. dict.fromkeys de-duplicates in retrieval
        # order; set() would lose the order and could yield fewer than 5.
        mongo_user_story_ids = list(dict.fromkeys(matched_ids))[:max_stories]

        # Get keys from metadata based on query
        chunk_data = metadata_retrieval(database_name, collection_name, query, mongo_user_story_ids)
        #print("The chunked data is :",chunk_data)

        # Get Model Response
        response = model_response(query, "\n".join(chunk_data))
        print("Model response:\n",response)

        # Accept "t" and surrounding whitespace as well as "T".
        check = input("Continue [T/F] : ").strip().upper()


if __name__=="__main__":
    main()

get_schema.py

In [None]:
import json
from collections import deque
from pymongo import MongoClient

# Connect to the MongoDB server
# NOTE: these statements run at import time, so importing this module
# (feature_extraction.py imports single_sub_keys from it) queries MongoDB.
client = MongoClient("mongodb://localhost:27018/")

# Select the database and collection
db = client['OrgqrS1HZ']
collection = db['userStories']
# Retrieve a single sample document; the functions below read their schema from it.
# NOTE(review): find_one() yields None for an empty collection, which the functions below do not handle — confirm.
sample_docs = collection.find_one()


def data_recursion(document=None):
    """List the dotted key paths of a nested document in breadth-first order.

    Nested dicts are descended into, as are lists whose items are all dicts.
    Each path is reported once, even when several list items share it.

    Args:
        document: Mapping to inspect. Defaults to the module-level
            ``sample_docs`` when omitted.

    Returns:
        List of unique dotted key paths in first-seen order. The list is also
        printed. An empty or missing document yields ``[]``.
    """
    if document is None:
        document = sample_docs
    if not document:
        # Empty collection (sample is None) or empty document.
        print([])
        return []

    keys = []
    seen = set()
    queue = deque([(document, '')])

    while queue:
        current_dict, parent_key = queue.popleft()
        for key, value in current_dict.items():
            new_key = f"{parent_key}.{key}" if parent_key else key
            if new_key not in seen:
                seen.add(new_key)
                keys.append(new_key)
            if isinstance(value, dict):
                queue.append((value, new_key))
            elif isinstance(value, list) and all(isinstance(i, dict) for i in value):
                # Every sub-document in the list is explored under the same prefix.
                queue.extend((item, new_key) for item in value)

    print(keys)
    return keys


def single_sub_keys():
    """Map each metadata feature present in the sample document to its sub-keys.

    Reads the feature names from ``metadata.json`` and looks each one up in
    the module-level ``sample_docs``.

    Returns:
        Dict of feature name to list of second-level keys when the feature's
        value is a dict, or to ``[]`` otherwise. Empty when there is no
        sample document.
    """
    with open("metadata.json") as f:
        metadata_keys = json.load(f).keys()

    if not sample_docs:
        # find_one() found nothing, so there is no schema to report.
        return {}

    return {
        main_key: list(value.keys()) if isinstance(value, dict) else []
        for main_key, value in sample_docs.items()
        if main_key in metadata_keys
    }


if __name__=="__main__":
    #data_recursion()
    # Show the result; the bare call computed the mapping and discarded it.
    print(single_sub_keys())

feature_extraction.py

In [None]:
import json
from transformers import pipeline
from transformers import pipeline, DebertaV2Tokenizer
from get_schema import single_sub_keys
from rouge_score import rouge_scorer


# Tokenizer and zero-shot classification pipeline, both loaded at import time
# from "nli-deberta-v3-large"; the pipeline is placed on the CPU.
# NOTE(review): presumably the local model directory next to this file — confirm.
tokenizer = DebertaV2Tokenizer.from_pretrained("nli-deberta-v3-large")
classifier = pipeline("zero-shot-classification", model="nli-deberta-v3-large", tokenizer=tokenizer, device = "cpu")


def obtain_nested_data(query, best_keywords):
    """Pick the sub-keys that best match ``query`` for each selected feature.

    Args:
        query: User query handed to the zero-shot classifier.
        best_keywords: Feature names to keep; all other features are ignored.

    Returns:
        Dict of feature name to the first two labels returned by the
        classifier, or to ``[]`` when the feature has no sub-keys.
    """
    selected = {}
    # Obtain 2nd level keys
    for feature, sub_keys in single_sub_keys().items():
        if feature in best_keywords:
            if sub_keys == []:
                # No sub-keys to rank for this feature.
                selected[feature] = []
            else:
                selected[feature] = classifier(query, sub_keys)["labels"][:2]
    return selected


def using_zero_shot(query):
    """Select features by zero-shot classification of ``query``.

    The descriptions in ``metadata.json`` are used as candidate labels, and
    the first two labels returned by the classifier are mapped back to their
    feature names.

    Args:
        query: User query to classify.

    Returns:
        List of up to two feature names from ``metadata.json``.
    """
    # Read values from metadata
    with open("metadata.json") as f:
        metadata = json.load(f)

    # description -> feature name, so a returned label maps back to its key
    reverse_metadata = {values: key for key, values in metadata.items()}

    best_descriptions = classifier(query, list(reverse_metadata.keys()))["labels"][:2]
    return [reverse_metadata[desc] for desc in best_descriptions]


# Use rouge to obtain the required features
def using_rouge_similarity(query):
    """Select features whose name overlaps the query according to ROUGE recall.

    Each feature name from ``metadata.json`` is scored against ``query`` with
    underscores turned into spaces. A feature is kept when its ROUGE-1 recall
    plus ROUGE-2 recall is at least 1.

    Args:
        query: User query to compare against the feature names.

    Returns:
        List of matching feature names, as spelled in ``metadata.json``.
    """
    print(f"The input query : {query}")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Read values from metadata
    with open("metadata.json") as f:
        metadata = json.load(f)

    # spaced phrase -> original feature name
    phrase_to_key = {}
    for feature in metadata.keys():
        phrase_to_key[feature.replace("_", " ")] = feature

    def _recall_total(phrase):
        scores = scorer.score(phrase, query)
        return scores["rouge1"].recall + scores["rouge2"].recall

    return [feature for phrase, feature in phrase_to_key.items() if _recall_total(phrase) >= 1]


def main_feature(query):
    """Determine which features, and which of their sub-keys, answer ``query``.

    Combines the zero-shot and the ROUGE-based feature selection, then
    resolves the sub-keys of the selected features.

    Args:
        query: User query.

    Returns:
        Dict of feature name to list of selected sub-keys.
    """
    # Example query: "give the test scenarios for financial decisions user story."
    candidate_keywords = using_zero_shot(query) + using_rouge_similarity(query)
    selected_features = obtain_nested_data(query, candidate_keywords)
    print("The most improtant features are :",selected_features)
    return selected_features


if __name__=="__main__":
    # This module defines no main(); run the feature selection on a typed query.
    main_feature(input("Enter query : "))