In [None]:
"main.py"

from pymilvus import MilvusClient, model
from pymongo import MongoClient
from tqdm import tqdm
from feature_extraction import main_feature # closest_features  
import re, sys
from bson.objectid import ObjectId
from model import model_response

# Connection string passed to every MongoClient created in this file.
mongodb_uri = "mongodb://localhost:27018"
# Embedding function used to encode both the stored documents and the queries.
embedding_fn = model.DefaultEmbeddingFunction()
# Shared Milvus client, pointed at the path ../data/milvus_demo.db.
milvus_client = MilvusClient("../data/milvus_demo.db")

def user_stories_db(database_name, collection_name, milvus_collection_name):
    """Rebuild a Milvus collection from the user stories stored in MongoDB.

    The Milvus collection is dropped and re-created, every document of the
    MongoDB collection is read, its ``story`` and ``refined`` texts are
    concatenated and embedded, and one row per document is inserted.

    Args:
        database_name: Name of the MongoDB database to read from.
        collection_name: Name of the MongoDB collection holding the stories.
        milvus_collection_name: Name of the Milvus collection to (re)create.

    Returns:
        None. Documents that lack a usable ``story``/``refined`` text are
        reported on stdout and skipped.
    """
    milvus_client.drop_collection(collection_name=milvus_collection_name)
    milvus_client.create_collection(collection_name=milvus_collection_name, dimension=768)

    records = []  # (mongo _id, combined text) for every usable document

    # The context manager closes the MongoDB connection once the cursor is drained.
    with MongoClient(mongodb_uri) as mongo_client:
        documents = mongo_client[database_name][collection_name].find()

        for doc in tqdm(documents):
            try:
                # A missing or empty "refined" field is replaced by a single space.
                text = doc["story"] + (doc.get("refined") or " ")
            except (KeyError, TypeError) as err:
                # Only schema problems are skipped (no "story" key, or non-string
                # fields); any other error is a real failure and propagates.
                print("Error with schema", repr(err))
                continue
            records.append((doc["_id"], text))

    if not records:
        # Nothing to embed or insert; the freshly created collection stays empty.
        print("No user stories to index")
        return

    # Convert the data to vectors
    vectors = embedding_fn.encode_documents([text for _, text in records])

    # NOTE(review): "data_id" carries the MongoDB _id unchanged so it can be used
    # directly in an "$in" query later -- confirm Milvus stores that type as-is.
    data = [
        {"id": i, "data_id": mongo_id, "vector": vector, "text": text}
        for i, ((mongo_id, text), vector) in enumerate(zip(records, vectors))
    ]

    # Store the vector in Milvus
    milvus_client.insert(collection_name=milvus_collection_name, data=data)


def retrieve_user_stories(query, milvus_collection_name, threshold):
    """Search Milvus for stories similar to ``query`` and return their MongoDB ids.

    Args:
        query: Free-text query; it is encoded with ``embedding_fn``.
        milvus_collection_name: Milvus collection to search.
        threshold: Value forwarded as the ``radius`` entry of ``params``.

    Returns:
        The ``data_id`` of every hit returned for the query, in result order.
    """
    encoded_query = embedding_fn.encode_queries([query])

    # A single query is sent, so only the first result set is relevant.
    hits = milvus_client.search(
        collection_name=milvus_collection_name,
        params={"radius": threshold},
        data=encoded_query,
        output_fields=["data_id", "text", "id"],
    )[0]

    # Each hit exposes the requested output fields under its "entity" entry.
    matched_ids = [hit["entity"]["data_id"] for hit in hits]

    print("Number of matching user story", len(matched_ids))
    return matched_ids

def _format_story_chunk(doc):
    """Render one projected MongoDB document as a single tab-separated text chunk."""
    parts = []
    for key, value in doc.items():
        if key in ("_id", "status"):
            continue
        if key == "story":
            parts.append(f"User story - {value} \t")
        elif key == "refined":
            parts.append(f"Refined User story - {value} \t")
        else:
            # Metadata values are stringified and stripped of punctuation so that
            # nested structures read as plain words.
            cleaned = re.sub(r'[^a-zA-Z0-9\s]', "", str(value))
            parts.append(f"{key.upper()} - {cleaned}. \t")
    return "".join(parts)


def metadata_retrieval(database_name, collection_name, query, mongo_user_story_ids):
    """Fetch the query-relevant fields of the given user stories as text chunks.

    ``main_feature(query)`` selects which fields (and sub-fields) matter for the
    query; those fields, plus ``story`` and ``refined``, are projected from the
    documents whose ``_id`` is in ``mongo_user_story_ids``.

    Args:
        database_name: Name of the MongoDB database.
        collection_name: Name of the MongoDB collection holding the stories.
        query: Free-text query used to pick the relevant fields.
        mongo_user_story_ids: ``_id`` values of the stories to fetch.

    Returns:
        A list with one formatted string per matching document.
    """
    main_app_keys = main_feature(query)

    # Build the projection dynamically: "field.subfield" for every selected
    # sub-field, or the whole field when it has no selected sub-fields.
    projection2 = {}
    for field, subfields in main_app_keys.items():
        if subfields:
            for subfield in subfields:
                projection2[f"{field}.{subfield}"] = 1
        else:
            projection2[field] = 1
    # Always request the story texts. Doing this outside the loop keeps the
    # projection non-empty even when no metadata field was selected; an empty
    # projection would make MongoDB return every field of each document.
    projection2["story"] = 1
    projection2["refined"] = 1
    print("The mongoquery : ", projection2)

    # Run the projection on the identified user story IDs; the context manager
    # closes the connection after the cursor has been consumed.
    with MongoClient(mongodb_uri) as mongo_client:
        collection = mongo_client[database_name][collection_name]
        stories = collection.find({'_id': {'$in': mongo_user_story_ids}}, projection2)
        return [_format_story_chunk(story) for story in stories]
def main():
    """Run an interactive question-answering loop over the indexed user stories.

    Each round reads a query from stdin, retrieves matching story ids from
    Milvus, pulls the relevant fields from MongoDB, and prints the model's
    answer. The loop repeats only while the user answers exactly ``T``.
    """
    database_name = "OrgqrS1HZ"
    collection_name = "userStories"
    milvus_collection_name = "temp"

    # The Milvus collection is searched as-is; to (re)build it from MongoDB run:
    # user_stories_db(database_name, collection_name, milvus_collection_name)

    while True:
        query = input("Enter query : ")

        # Keep the first five hits, then drop duplicates among them.
        top_hits = retrieve_user_stories(query, milvus_collection_name, 0.2)[:5]
        mongo_user_story_ids = list(set(top_hits))

        # Collect the query-relevant metadata of those stories as context.
        chunk_data = metadata_retrieval(database_name, collection_name, query, mongo_user_story_ids)
        context = "\n".join(chunk_data)

        # Ask the model and show its answer.
        response = model_response(query, context)
        print("Model response:\n", response)

        if input("Continue [T/F] : ") != "T":
            break


if __name__=="__main__":
    main()



In [None]:
"Auto-Tester"

import pandas as pd

# Function to handle file-based querying and store results in a CSV
def query_and_store_results(input_file, output_csv, database_name, collection_name, milvus_collection_name):
    """Run every query listed in a text file through the RAG pipeline and save the answers.

    The input file holds one query per line. Queries are processed in order
    until the end of the file or the first blank line. For each query two rows
    are recorded: the query with its response, and a follow-up ``T`` (another
    query follows) or ``F`` (last query) with the model's response to it.

    Args:
        input_file: Path of the text file with one query per line.
        output_csv: Path of the CSV file to write (columns ``Query``, ``Response``).
        database_name: Name of the MongoDB database.
        collection_name: Name of the MongoDB collection holding the stories.
        milvus_collection_name: Milvus collection to search.
    """
    with open(input_file, 'r', encoding="utf-8") as file:
        lines = file.readlines()

    results = []
    for i, line in enumerate(lines):
        # Drop the trailing newline so it does not end up in the embedding/prompt.
        query = line.strip()
        if not query:
            # A blank line marks the end of the queries.
            break

        # Query the RAG model
        mongo_user_story_ids = retrieve_user_stories(query, milvus_collection_name, 0.2)
        mongo_user_story_ids = list(set(mongo_user_story_ids[:5]))  # Take first 5 unique IDs
        chunk_data = metadata_retrieval(database_name, collection_name, query, mongo_user_story_ids)
        context = "\n".join(chunk_data)
        response = model_response(query, context)
        results.append({'Query': query, 'Response': response})

        # Check if the next line is non-empty to determine the "T" or "F" input.
        has_next = i + 1 < len(lines) and lines[i + 1].strip() != ""
        follow_up = "T" if has_next else "F"
        # NOTE(review): the follow-up is sent to the model as if it were a query,
        # mirroring the interactive "Continue [T/F]" prompt -- confirm these rows
        # are wanted in the output.
        follow_up_response = model_response(follow_up, context)
        results.append({'Query': follow_up, 'Response': follow_up_response})

        if not has_next:
            break

    # Convert results to DataFrame and save to CSV; explicit columns keep the
    # header present even when no query was processed.
    df = pd.DataFrame(results, columns=['Query', 'Response'])
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

In [None]:
"Post test ROUGE Score"

import pandas as pd
from rouge_score import rouge_scorer

# Function to update CSV with ROUGE scores between expected and RAG response and print them
def update_csv_with_rouge_scores(input_csv, output_csv):
    """Add ROUGE-1/2/L F-measures between expected and actual responses to a CSV.

    Args:
        input_csv: Path of a CSV with at least the columns ``Response`` and
            ``Expected Responses``.
        output_csv: Path of the CSV to write; it contains the input columns
            plus ``ROUGE-1``, ``ROUGE-2`` and ``ROUGE-L``.

    The scores of every row are also printed.
    """
    # Load the CSV containing Queries, Responses, and Expected Responses
    df = pd.read_csv(input_csv)

    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Empty CSV cells are loaded by pandas as NaN (a float) and numeric-looking
    # cells as numbers, while the scorer is given text; normalise both columns
    # to strings, with missing values as empty text.
    expected_texts = df['Expected Responses'].fillna("").astype(str)
    response_texts = df['Response'].fillna("").astype(str)

    # Lists to hold the ROUGE scores
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    # Calculate ROUGE scores for each pair of expected response and response
    for row_number, (expected, response) in enumerate(zip(expected_texts, response_texts), start=1):
        scores = scorer.score(expected, response)

        rouge1 = scores['rouge1'].fmeasure
        rouge2 = scores['rouge2'].fmeasure
        rougeL = scores['rougeL'].fmeasure
        rouge1_scores.append(rouge1)
        rouge2_scores.append(rouge2)
        rougeL_scores.append(rougeL)

        # Print the ROUGE scores for this row
        print(f"Row {row_number}: ROUGE-1: {rouge1:.4f}, ROUGE-2: {rouge2:.4f}, ROUGE-L: {rougeL:.4f}")

    # Add the scores as new columns in the DataFrame
    df['ROUGE-1'] = rouge1_scores
    df['ROUGE-2'] = rouge2_scores
    df['ROUGE-L'] = rougeL_scores

    # Save the updated DataFrame with ROUGE scores to a new CSV
    df.to_csv(output_csv, index=False)
    print(f"Updated CSV with ROUGE scores saved as {output_csv}")

# Usage
# NOTE(review): both paths are placeholders, and the call below executes as soon
# as this code runs -- point them at real files first.
input_csv = 'path/to/updated_results.csv'  # CSV with Query, Response, Expected Responses
output_csv = 'path/to/final_results_with_rouge.csv'
update_csv_with_rouge_scores(input_csv, output_csv)

In [6]:
"model.py"

from openai import OpenAI

# Shared OpenAI client used by model_response; it targets http://localhost:8000/v1
# and sends the literal string "token" as its API key.
client = OpenAI(base_url = "http://localhost:8000/v1", api_key = "token")

def model_response(query, mongo_data):
    """Ask the chat model to answer ``query`` using ``mongo_data`` as context.

    Args:
        query: The question to answer.
        mongo_data: Context text that is embedded into the prompt.

    Returns:
        The message content of the first completion choice.
    """
    prompt = f"""    
            Answer the question in a natural human-like manner. 
            Answer the question : {query} using the given context : {mongo_data} in less than 100 words.
            Summarize the whole answer in a paragraph and do not return unnecessary explainations.
            """

    # One system message fixing the assistant's role, then the user prompt.
    chat_messages = [
        {"role": "system", "content": "You are a chatbot who answers the exact question"},
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        model="Meta-Llama-3.1-8B-Instruct-quantized.w4a16/",
        messages=chat_messages,
    )
    return completion.choices[0].message.content


ModuleNotFoundError: No module named 'openai'

In [8]:
"feature_extraction.py"
import json
# The tokenizer class is spelled DebertaV2Tokenizer; the name must match the
# one used to build `tokenizer` below.
from transformers import DebertaV2Tokenizer, pipeline
from get_schema import single_sub_keys
from rouge_score import rouge_scorer

# Zero-shot classifier shared by the helpers below; the tokenizer is loaded
# from the same "nli-deberta-v3-large" identifier as the model.
tokenizer = DebertaV2Tokenizer.from_pretrained("nli-deberta-v3-large")
classifier = pipeline("zero-shot-classification", model="nli-deberta-v3-large", tokenizer=tokenizer, device = "cpu")


def obtain_nested_data(query, best_keywords):
    """Pick, for each selected top-level key, the sub-keys most relevant to ``query``.

    Args:
        query: Free-text query given to the zero-shot classifier.
        best_keywords: Top-level keys to keep; all other keys are ignored.

    Returns:
        A dict mapping each kept key to its two top-ranked sub-keys, or to an
        empty list when the key has no sub-keys.
    """
    selected = {}
    for key, sub_keys in single_sub_keys().items():
        if key in best_keywords:
            if sub_keys == []:
                # Nothing to rank for keys without second-level entries.
                selected[key] = []
            else:
                # The classifier returns ranked labels; keep the first two.
                selected[key] = classifier(query, sub_keys)["labels"][:2]
    return selected


def using_zero_shot(query):
    """Return the two metadata keys whose values the classifier ranks highest for ``query``.

    The values of ``../metadata.json`` (treated here as descriptions) are used
    as candidate labels for zero-shot classification; the keys belonging to the
    two top-ranked values are returned.
    """
    # Invert the metadata so a winning description can be mapped back to its key.
    with open("../metadata.json") as f:
        description_to_key = {description: key for key, description in json.load(f).items()}

    top_descriptions = classifier(query, list(description_to_key))["labels"][:2]
    return [description_to_key[description] for description in top_descriptions]


def using_rouge_similarity(query):
    """Return the metadata keys whose words overlap strongly with ``query``.

    Each key of ``../metadata.json`` is turned into a phrase by replacing
    underscores with spaces and scored against the query; a key is kept when
    its ROUGE-1 recall plus ROUGE-2 recall is at least 1.
    """
    print(f"The input query : {query}")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Read the keys from metadata and map their spaced form back to the key.
    with open("../metadata.json") as f:
        metadata = json.load(f)
    phrase_to_key = {key.replace("_", " "): key for key in metadata}

    def recall_total(phrase):
        # The key phrase is the target and the query is the prediction.
        result = scorer.score(phrase, query)
        return result["rouge1"].recall + result["rouge2"].recall

    return [key for phrase, key in phrase_to_key.items() if recall_total(phrase) >= 1]

def main_feature(query):
    """Select the metadata keys, with their sub-keys, that are relevant to ``query``.

    Candidate keys from the zero-shot and ROUGE-based selectors are combined
    and then narrowed to their best sub-keys.

    Returns:
        The dict produced by ``obtain_nested_data`` for the combined candidates.
    """
    candidate_keys = using_zero_shot(query) + using_rouge_similarity(query)
    selected = obtain_nested_data(query, candidate_keys)
    print("The most improtant features are :",selected)
    return selected


# NOTE(review): the feature_extraction.py section defines `main_feature` but no
# function named `main` -- confirm which function this guard is meant to call.
if __name__=="__main__":
    main()

Collecting openai
  Downloading openai-1.53.0-py3-none-any.whl.metadata (24 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.7.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.2 kB)
Downloading openai-1.53.0-py3-none-any.whl (387 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 387.1/387.1 kB 9.4 MB/s eta 0:00:00
Downloading jiter-0.7.0-cp312-cp312-macosx_11_0_arm64.whl (302 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.8/302.8 kB 25.9 MB/s eta 0:00:00
Installing collected packages: jiter, openai
Successfully installed jiter-0.7.0 openai-1.53.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
"get_schema.py"

import json
from collections import deque
from pymongo import MongoClient

# Connect to the MongoDB server
client = MongoClient("mongodb://localhost:27018/")

# Select the database and collection
db = client['OrgqrS1HZ']
collection = db['userStories']
# Retrieve one sample document; the functions below read its keys to describe
# the schema. This lookup runs when the module is loaded.
sample_docs = collection.find_one()


def data_recursion():
    """List every key path found in ``sample_docs``, breadth-first.

    Nested dicts, and lists made up only of dicts, are descended into; their
    keys are reported as dotted paths such as ``parent.child``.

    Returns:
        The key paths in the order they were visited.
    """
    # Text of every non-nested value; it is collected here but not returned.
    leaf_descriptions = []

    discovered = []
    pending = deque([(sample_docs, '')])

    while pending:
        node, prefix = pending.popleft()
        for name, value in node.items():
            path = f"{prefix}.{name}" if prefix else name
            discovered.append(path)

            if isinstance(value, dict):
                pending.append((value, path))
            elif isinstance(value, list) and all(isinstance(element, dict) for element in value):
                # Every dict of the list is explored under the same path.
                pending.extend((element, path) for element in value)
            else:
                leaf_descriptions.append(f"{path} : " + str(value))

    return discovered


def single_sub_keys():
    """Map each metadata key present in ``sample_docs`` to its second-level keys.

    Only top-level keys of ``sample_docs`` that also appear in
    ``../metadata.json`` are considered.

    Returns:
        A dict mapping each such key to the list of keys of its value when the
        value is a dict, or to an empty list otherwise.
    """
    with open("../metadata.json") as f:
        metadata_keys = json.load(f).keys()

    result = {}
    for key, value in sample_docs.items():
        if key not in metadata_keys:
            continue
        # isinstance also accepts dict subclasses, unlike an exact type comparison.
        result[key] = list(value.keys()) if isinstance(value, dict) else []
    return result


if __name__=="__main__":
    #data_recursion()
    single_sub_keys()