### Create an assistant

In [9]:
from openai import OpenAI
import os


# Notebook configuration: assistant identity, input folder and vector-store name.
assistant_name = 'sem-tab'
# NOTE(review): '....' looks like a placeholder for the real system instructions — confirm before running.
assistant_instruction = '....'
# NOTE(review): 'path' looks like a placeholder for the folder holding the files to upload — confirm.
path = 'path'
vector_store_name = 'sem-tab-input'
# The API key is read from the environment; .get() yields None when the variable is unset.
key = os.environ.get("OPENAI_API_KEY")

client = OpenAI(api_key=key)
# Create the assistant with the file_search tool enabled; no vector store is attached at this point.
assistant = client.beta.assistants.create(
    name = assistant_name,
    instructions=assistant_instruction,
    tools=[{"type": "file_search"}],
    model="gpt-4-turbo",
    temperature=0.2,
)

### Upload files and add them to a Vector Store

In [10]:
import os

def get_file_paths(folder_path):
    """Collect the path of every file below ``folder_path``, recursively.

    Args:
        folder_path: Directory to walk.

    Returns:
        A list with one entry per file, each built by joining the directory
        being walked with the file name. The entries are therefore absolute
        only when ``folder_path`` itself is absolute. A folder that does not
        exist yields an empty list.
    """
    return [
        os.path.join(current_dir, entry)
        for current_dir, _subdirs, entries in os.walk(folder_path)
        for entry in entries
    ]

# create a Vector Store
from contextlib import ExitStack

vector_store = client.beta.vector_stores.create(name=vector_store_name)

folder = path
file_paths = get_file_paths(folder)

# Open every file inside an ExitStack so that all handles are closed as soon as
# the upload returns (or raises, including when a later open() fails), rather
# than staying open for the lifetime of the kernel.
with ExitStack() as open_files:
    file_streams = [
        open_files.enter_context(open(file_path, 'rb'))
        for file_path in file_paths
    ]
    # Upload the files into the vector store and wait for the batch to finish.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id,
        files=file_streams
    )

# Report the final batch status and the per-state file counts.
print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=15, failed=0, in_progress=0, total=15)


### Update the assistant to use the new Vector Store

In [11]:
# Re-bind `assistant` to the updated object: file_search stays enabled and the
# vector store created above is registered as its file_search resource.
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tools=[{"type": "file_search"}],
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

### Create a thread

In [12]:
# A single thread is created here and passed to every get_response() call below.
# NOTE(review): presumably earlier questions therefore stay in the conversation context — confirm.
thread = client.beta.threads.create()

### Add a message to the thread


#### Define the `get_response` function

In [13]:
def get_response(query,client,assistant,thread):
    """Send ``query`` to the assistant on ``thread`` and return its reply.

    Args:
        query: Text of the user message.
        client: OpenAI client used for all API calls.
        assistant: Assistant object; only ``assistant.id`` is read.
        thread: Thread object; only ``thread.id`` is read.

    Returns:
        A ``(answer, citations)`` tuple. ``answer`` is the reply text with each
        annotation marker replaced by ``[index]``. ``citations`` holds one
        ``"[index] filename"`` string per annotation that carries a
        ``file_citation``.

    Raises:
        RuntimeError: If the run does not end with status ``"completed"`` or
            produces no message content. Without this check the failure would
            surface as an ``IndexError`` on an empty message list.
    """
    # add message to the thread
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=query
    )

    # create a run and wait for polling to finish
    run = client.beta.threads.runs.create_and_poll(
        thread_id=thread.id,
        assistant_id=assistant.id
    )
    if run.status != "completed":
        detail = getattr(run, "last_error", None)
        reason = f" ({detail})" if detail else ""
        raise RuntimeError(f"Run {run.id} ended with status '{run.status}'{reason}")

    # get the messages produced by this run only
    messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
    if not messages or not messages[0].content:
        raise RuntimeError(f"Run {run.id} completed without producing a message")

    message_content = messages[0].content[0].text
    # Work on a local copy of the text so the SDK message object is left untouched.
    answer = message_content.value
    citations = []
    for index, annotation in enumerate(message_content.annotations):
        # Swap the raw annotation marker for a short numeric reference.
        answer = answer.replace(annotation.text, f"[{index}]")
        if file_citation := getattr(annotation, "file_citation", None):
            cited_file = client.files.retrieve(file_citation.file_id)
            citations.append(f"[{index}] {cited_file.filename}")

    return answer,citations

#### query 1

In [14]:
query = "what is requirments engineering"
response = get_response(query,client,assistant,thread)
# response is (answer text, citation list); one print call with a newline
# separator gives the same four output lines as four separate prints.
print(
    "************",
    f"query: {query}",
    f'response: {response[0]}',
    f"reference: {response[1]}",
    sep="\n",
)

************
query: what is requirments engineering
response: Requirements Engineering (RE) is a discipline within software engineering and systems engineering that focuses on determining the needs or conditions to meet for a new or altered product, taking account of the possibly conflicting requirements of the various stakeholders, such as beneficiaries or users. RE involves various activities such as requirements elicitation, requirements analysis, requirements specification, requirements validation, and requirements management. The goal is to produce a comprehensive and detailed set of requirements for the system that can serve as a basis for subsequent stages of product development, ensuring that the final product meets the needs of its users and stakeholders.
reference: []


In [15]:
# Same question asked a second time on the same thread.
query = "what is requirments engineering"
response = get_response(query,client,assistant,thread)
# Separator, question, answer and citations, one per line.
print(
    "************",
    f"query: {query}",
    f'response: {response[0]}',
    f"reference: {response[1]}",
    sep="\n",
)

************
query: what is requirments engineering
response: Requirements Engineering (RE) is a crucial phase in the development of systems and software projects, focusing on identifying, documenting, and maintaining a set of requirements. It involves several key activities:

1. **Requirements Elicitation**: Gathering requirements from stakeholders, users, and other sources.
2. **Requirements Analysis**: Analyzing the requirements for feasibility, clarity, and conflicts.
3. **Requirements Specification**: Documenting the requirements in a detailed, clear, and precise manner.
4. **Requirements Validation**: Ensuring the requirements accurately reflect the needs of stakeholders and are consistent.
5. **Requirements Management**: Managing changes to requirements as the system evolves.

The main goal of Requirements Engineering is to ensure that the system or software developed meets the needs of users and stakeholders, aligns with business objectives, and is feasible within technological

#### query 2

In [12]:
query = "Now you are developing an knowledge graph about the state and evolution of the empirical research in Requirements Engineering. Derive 77 competency questions."
response = get_response(query,client,assistant,thread)
# Question, answer text and citation list, one per line.
print(
    f"query: {query}",
    f'response: {response[0]}',
    f"reference: {response[1]}",
    sep="\n",
)

message_content: Text(annotations=[], value="To develop a comprehensive set of competency questions for a knowledge graph on empirical research in Requirements Engineering, these questions should encompass various aspects of the field, including methodologies, findings, trends, and contributions from scholars. Here's a list of 70 competency questions that cover these aspects:\n\n1. What are the key methodologies used in empirical research in Requirements Engineering?\n2. How have the research methodologies in Requirements Engineering evolved over the last decade?\n3. What are the most cited papers in Requirements Engineering?\n4. Which researchers have contributed significantly to Requirements Engineering?\n5. What universities are known for their research in Requirements Engineering?\n6. What funding sources commonly support Requirements Engineering research?\n7. How do empirical results influence the practices in Requirements Engineering?\n8. What software tools are frequently used i

In [13]:
# Same competency-question request sent again on the same thread.
query = "Now you are developing an knowledge graph about the state and evolution of the empirical research in Requirements Engineering. Derive 77 competency questions."
response = get_response(query,client,assistant,thread)
# Question, answer text and citation list, one per line.
print(
    f"query: {query}",
    f'response: {response[0]}',
    f"reference: {response[1]}",
    sep="\n",
)

message_content: Text(annotations=[], value="Creating a well-rounded set of 77 competency questions for a knowledge graph on empirical research in Requirements Engineering involves covering aspects like methodologies, historical evolution, contributions from academic and industrial circles, tools, challenges, and future directions. Here's an expanded list from the previous 70 to 77 questions:\n\n1. What methodologies dominate empirical research in Requirements Engineering?\n2. How has the evolution of methodologies in Requirements Engineering been documented over the past decades?\n3. What are the landmark papers in Requirements Engineering?\n4. Which researchers have made the most significant contributions to the field of Requirements Engineering?\n5. Which academic institutions are leaders in Requirements Engineering research?\n6. What organizations or bodies fund Requirements Engineering research?\n7. How do empirical results impact the methodologies and practices in Requirements En

### Use Sentence Transformers to compare CQs similarity

In [60]:
# !pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Sentence-embedding model used to encode both sets of competency questions (CQs).
model = SentenceTransformer("all-MiniLM-L6-v2")

# header=0 together with `names` replaces the file's own header row, so the two
# columns are always addressed as 'expertCQs' and 'genCQs'.
# NOTE(review): absolute, user-specific path — the cell only runs on the original machine.
data = pd.read_csv('/Users/sherry/python-coding/Prompting/requirement-engeneering/genCQs-expertCQs.csv', names=['expertCQs', 'genCQs'], header=0)

# Plain Python lists of the generated and the expert-written questions.
genCQs = list(data['genCQs'])
expertCQs = list(data['expertCQs'])

# One embedding per question in each list.
genCQs_embeddings = model.encode(genCQs)
expertCQs_embeddings = model.encode(expertCQs)

# Compute cosine similarity between all pairs
# (downstream cells index the result as cos_sim[genCQs_idx][expertCQs_idx]).
cos_sim = util.cos_sim(genCQs_embeddings, expertCQs_embeddings)



In [62]:
# Add all pairs to a list with their cosine similarity score.
# The loop bounds come from the question lists themselves rather than a fixed
# 77, so a CSV with a different number of rows neither raises IndexError nor
# silently drops pairs.
all_sentence_combinations = [
    [cos_sim[genCQs_idx][expertCQs_idx], genCQs_idx, expertCQs_idx]
    for genCQs_idx in range(len(genCQs))
    for expertCQs_idx in range(len(expertCQs))
]

# Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, genCQs_idx, expertCQs_idx in all_sentence_combinations[0:5]:
    # `score` is the same cos_sim entry stored with the pair, so no second lookup is needed.
    print("{} \t {} \t {:.4f}".format(genCQs[genCQs_idx], expertCQs[expertCQs_idx], score))

Top-5 most similar pairs:
What methodologies dominate empirical research in Requirements Engineering? 	 How do the authors justify the selection and combination of empirical methods and design elements? 	 0.5918
How do empirical studies address the issue of scope creep in software projects? 	 How many empirical studies are by authors working for large software development companies? 	 0.5772
Which case studies have significantly influenced Requirements Engineering practices? 	 How have the proportions of case studies and action research in the empirical methods used evolved over time? 	 0.5704
How do empirical results impact the methodologies and practices in Requirements Engineering? 	 How do the authors justify the selection and combination of empirical methods and design elements? 	 0.5698
What methodologies dominate empirical research in Requirements Engineering? 	 How have the proportions of case studies and action research in the empirical methods used evolved over time? 	 0.5641

### Save cosine similarity score and the corresponding CQ pairs to a csv file

In [64]:
# save cosine similarity score and the corresponding CQ pairs to a csv file
import torch

# Turn the ranked [score, genCQ index, expertCQ index] triples into three
# parallel columns; scores are stored as strings rounded to four decimals.
genCQ_ls = [genCQs[gen_idx] for _, gen_idx, _ in all_sentence_combinations]
expertCQ_ls = [expertCQs[expert_idx] for _, _, expert_idx in all_sentence_combinations]
score_ls = [f"{similarity.item():.4f}" for similarity, _, _ in all_sentence_combinations]

# Column order matches the order of the keys below.
cos_sim_df = pd.DataFrame({
    'genCQ': genCQ_ls,
    'expertCQ': expertCQ_ls,
    'cos_score': score_ls,
})

# Write every ranked pair (with the default integer index column) to disk.
cos_sim_df.to_csv("/Users/sherry/python-coding/Prompting/requirement-engeneering/gen-expert-CQs-cos-more-reference.csv")