In [2]:
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
from llms_initialisation import credentials, project_id, greedy_params

llama_70b_model = Model(
    model_id = ModelTypes.LLAMA_2_70B_CHAT,
    params = greedy_params,
    credentials = credentials,
    project_id = project_id
)
llama_70b = WatsonxLLM(llama_70b_model)

flan_ul2_model = Model(
    model_id = ModelTypes.FLAN_UL2,
    params = greedy_params,
    credentials = credentials,
    project_id = project_id
)
flan_ul2 = WatsonxLLM(flan_ul2_model)

granite_chat_model = Model(
    model_id = ModelTypes.GRANITE_13B_CHAT_V2,
    params = greedy_params,
    credentials = credentials,
    project_id = project_id
)
granite_chat = WatsonxLLM(granite_chat_model)

mixtral_model = Model(
    model_id = 'ibm-mistralai/mixtral-8x7b-instruct-v01-q',
    params = greedy_params,
    credentials = credentials,
    project_id = project_id
)
mixtral = WatsonxLLM(mixtral_model)

ValidationError: 1 validation error for OpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [111]:
models = [flan_ul2, granite_chat, mixtral]
model_names = ["flan_ul2", "granite_chat", "mixtral"]

In [3]:
# user_request = """
# A bank is receiving 100s of complaints a day.
# The data is in CSV format, stored in the AW3 cloud environment.
# They want to inject data into either an algorithm or an LLM, which can categorise the complaints into a list of 50 complaint types.
# Model to categories the root cause issues.
# """
user_request = """
I need a model that will help me summarise customer calls and then categorise them based on the type of complaint. I need multilingual support, especially German and English fluency.
"""

In [34]:
requirements_list = []

In [35]:
language_prompt_template = """ 
From the user input provided, state the language requirements of the user:
Here is the user input: {user_request}.
Only use the information given.
Don't make anything up.
"""

language = "Language: " + flan_ul2.invoke(language_prompt_template.format(user_request=user_request))
requirements_list.append(language)
language

'Language: English'

In [155]:
#### CAN'T GET THIS ONE WORKING GREAT YET, IGNORE

# subtasks_template = """
# As a solution architect specialising in Large Language Model (LLM) workflows, your expertise is invaluable in dissecting user requirements into LLM tasks. 
# Please break down the user's request into a list of key requirements.

# User Needs: {needs}
# Please ensure your response is strictly based on the provided information without any assumptions or additions.

# Tasks to be accomplished:
# """
# summary = mixtral.invoke(subtasks_template.format(needs=user_request, 
#                                                             #  requirements=requirements_list
#                                                              ))
# print("Summary: " + summary)

# for model,name in zip(models, model_names):
#     print(name + ": " + model.invoke(match_template.format(needs=user_request, requirements=requirements_list)))

Summary: 
1. Data Retrieval:
   - Retrieve the CSV data from the AW3 cloud environment.

2. Data Pre-processing:
   - Clean the data as required (remove any irrelevant information, handle missing values, etc.)
   - Format the data into a suitable structure for input into the LLM.

3. Model Training:
   - Train an LLM or select an appropriate algorithm to categorise the complaints into 50 complaint types.

4. Model Testing:
   - Test the model with a subset of the data to ensure it is accurately categorising the complaints.

5. Model Deployment:
   - Deploy the model in a production environment where it can categorise incoming complaints.

6. Monitoring and Maintenance:
   - Monitor the model's performance over time and retrain as necessary to maintain high accuracy.

7. Reporting:
   - Provide regular reports on the model's performance, including accuracy metrics and any necessary improvements.


In [156]:
tasks_involved_prompt_template = """ 
Using the user input provided, state which of the following tasks are needed to achieve the user's goal:
- text generation
- text summarisation
- text classification
- question answering
- retrieval augmented generation
- translation
- code

Use step-by-step reasoning to determine your answer.

Here is the user input: {user_request}.
Only use the information given.
Don't make anything up.
"""

tasks = "Task(s): " + flan_ul2.invoke(tasks_involved_prompt_template.format(user_request=user_request)) 
# requirements_list.append(tasks)
tasks 

'Task(s): text classification'

In [103]:
#These recommendations were generated by GPT; they look right
model_architecture_prompt_template = """ 
You are an expert in matching LLM model architectures to specific tasks.
Use the following matches to output which model to use for this task: {task}.

Text Generation: Decoder-only
Text Summarization: Encoder-Decoder
Text Classification: Encoder-only
Question Answering: Encoder-only
Retrieval Augmented Generation: Encoder-Decoder with External Retrieval
Translation: Encoder-Decoder
Code Generation: Encoder-Decoder

Do not make anything up, only use the information given.
If you don't know, say you don't know
Only state the answer, don't explain your reasoning.

Answer:
"""

#Better than a look up table because string alterations (e.g. Q&A/RAG/Summarization) won't be an issue
architecture = "Model Architecture: " + mixtral.invoke(model_architecture_prompt_template.format(task=tasks))
print(architecture)

requirements_list.append(architecture)

Model Architecture: 
Encoder-only


In [107]:
industry_prompt_template = """ 
From the user input provided, state the industry of the use case:
Here is the user input: {user_request}.
Only use the information given.
Don't make anything up.
"""

industry = "Industry: " + flan_ul2.invoke(industry_prompt_template.format(user_request=user_request))
requirements_list.append(industry)
industry

'Industry: Banking'

In [82]:
latency_prompt_template = """ 
Deduce from the user input whether latency is a critical factor, such as in applications requiring real-time interaction versus batch processing tasks.

Here is the user input: {user_request}.

If you are unsure and need more information, write a query to the user requesting it.
Don't make anything up.
"""

latency = "Latency: " + mixtral.invoke(latency_prompt_template.format(user_request=user_request))
print(latency)

if "?" not in latency:
    requirements_list.append(latency)

Latency: 
Query to user:
"Is there a requirement for real-time interaction with the categorization model, or can it process data in batches?"


In [87]:
function_calling_prompt_template = """ 
Deduce whether or not user requirements provided imply a need for the production of structured outputs, such as JSON or YAML.
Here is the user input: {user_request}.
Only use the information given.
Don't make anything up.
"""

funcCalling = "Function Calling:" + mixtral.invoke(function_calling_prompt_template.format(user_request=user_request))
print(funcCalling)

requirements_list.append(funcCalling)


Function Calling:
The user requirements do not imply a need for the production of structured outputs. The user wants to inject data into an algorithm or a large language model (LLM) to categorize complaints into a list of 50 complaint types. The output of this process would likely be a list or a set of categories, not a structured output like JSON or YAML.


In [101]:
context_length_prompt_template = """ 
You are an expert on data processing.
You must decide from the user input whether the task requires ingesting/ producing lots of text at once or whether it can be done in batches.

Here is the user input: {user_request}.

Your answer will decide whether the LLM used for data processing will require a large context window or not.
Your answer should either state 'not a priority' if it can be done in batches, or 'large context window required' if this is the case.

If it is not clear, you can ask the user a question to clarify.

Only use the information given.
Don't make anything up.

Answer:
"""

contextLength = "Context Length: " + flan_ul2.invoke(context_length_prompt_template.format(user_request=user_request))
print(contextLength)

requirements_list.append(contextLength)

Context Length: not a priority


In [102]:
print(requirements_list)

['Language: English', 'Task(s): text classification', 'Function Calling:\nThe user requirements do not imply a need for the production of structured outputs. The user wants to inject data into an algorithm or a large language model (LLM) to categorize complaints into a list of 50 complaint types. The output of this process would likely be a list or a set of categories, not a structured output like JSON or YAML.', 'Context Length: not a priority']


In [161]:
keyWords_template = """
Identify key-words or phrases that characterise this set of user needs and requirements:
User Needs: {needs}

Please ensure your response is strictly based on the provided information without any assumptions or additions.

Key Words:
"""

# Requirements: {requirements}

key_words = llama_70b.invoke(keyWords_template.format(needs=user_request, 
                                                    # requirements=requirements_list
                                                             ))

print("Key Words: " + key_words)

Key Words: Complaints
CSV
AW3
Algorithm
LLM
Categorize
Root Cause Issues








In [168]:
user_info_dict = {
    "original_request": user_request,
    "requirements": requirements_list,
    "key_words": key_words
}

user_info_dict

{'original_request': '\nA bank is receiving 100s of complaints a day.\nThe data is in CSV format, stored in the AW3 cloud environment.\nThey want to inject data into either an algorithm or an LLM, which can categorise the complaints into a list of 50 complaint types.\nModel to categories the root cause issues.\n',
 'requirements': ['Language: English',
  'Task(s): text classification',
  'Function Calling:\nThe user requirements do not imply a need for the production of structured outputs. The user wants to inject data into an algorithm or a large language model (LLM) to categorize complaints into a list of 50 complaint types. The output of this process would likely be a list or a set of categories, not a structured output like JSON or YAML.',
  'Context Length: not a priority',
  'Model Architecture: \nEncoder-only',
  'Industry: Banking'],
 'key_words': 'Complaints\nCSV\nAW3\nAlgorithm\nLLM\nCategorize\nRoot Cause Issues\n\n\n\n\n\n'}

## Order Bias Checks

In [45]:
context_length_prompt_template_A = """ 
Two LLMs were each tasked with evaluation whether a user query requires handling large amounts of text.
Determine which response is better and explain your reasoning.
Here is the response produced by Llama: {llama_response}.
Here is the response produced by mixtral: {mixtral_response}
Here are the original user requirements: {user_requirements}
"""
context_length_prompt_template_B = """ 
Two LLMs were each tasked with evaluation whether a user query requires handling large amounts of text.
Determine which response is better and explain your reasoning.
Here is the response produced by mixtral: {mixtral_response}
Here is the response produced by Llama: {llama_response}.
Here are the original user requirements: {user_requirements}
"""

response_from_orderA = mixtral.invoke(context_length_prompt_template_A.format(llama_response=llama_response, mixtral_response=mixtral_response, user_requirements=user_request))
response_from_orderB = mixtral.invoke(context_length_prompt_template_B.format(llama_response=llama_response, mixtral_response=mixtral_response, user_requirements=user_request))

The response produced by Llama is better because it is more definitive in its conclusion that handling extensive amounts of text is not necessary. It provides specific reasons for its conclusion, such as the fact that the data is already in a structured format and the task is to categorize the complaints into 50 types, which does not require processing large amounts of unstructured text. The response by mixtral is more speculative and does not provide as much certainty in its conclusion. It suggests that there may be a need for handling extensive amounts of text, but does not provide specific reasons for this conclusion. Therefore, the response by Llama is more helpful in determining whether a user query requires handling large amounts of text.
The best response is the one produced by Llama. The user input does not suggest a need for handling extensive amounts of text to be ingested or produced. The data is already in a structured format (CSV) and the task is to categorize the complain

In [46]:
print(response_from_orderA)

The response produced by Llama is better because it is more definitive in its conclusion that handling extensive amounts of text is not necessary. It provides specific reasons for its conclusion, such as the fact that the data is already in a structured format and the task is to categorize the complaints into 50 types, which does not require processing large amounts of unstructured text. The response by mixtral is more speculative and does not provide as much certainty in its conclusion. It suggests that there may be a need for handling extensive amounts of text, but does not provide specific reasons for this conclusion. Therefore, the response by Llama is more helpful in determining whether a user query requires handling large amounts of text.


In [47]:
print(response_from_orderB)

The best response is the one produced by Llama. The user input does not suggest a need for handling extensive amounts of text to be ingested or produced. The data is already in a structured format (CSV) and the task is to categorize the complaints into 50 types, which does not require processing large amounts of unstructured text. Additionally, the input does not mention any requirements for text generation or output, which further suggests that handling extensive amounts of text is not necessary.

The response produced by mixtral is less clear and more speculative. While it is true that the complaints will need to be categorized into a list of 50 complaint types, it is not necessarily true that this will require handling extensive amounts of text. The fact that the data is in CSV format suggests that it is already structured and does not require text analysis. Therefore, the response produced by mixtral is less accurate and more speculative than the response produced by Llama.


In [49]:
llama_response_from_orderA = llama_70b.invoke(context_length_prompt_template_A.format(llama_response=llama_response, mixtral_response=mixtral_response, user_requirements=user_request))
llama_response_from_orderB = llama_70b.invoke(context_length_prompt_template_B.format(llama_response=llama_response, mixtral_response=mixtral_response, user_requirements=user_request))

In [52]:
print(llama_response_from_orderA)

Answer: The response produced by Llama is better.

Reasoning:

Llama's response accurately assesses the user's requirements and correctly determines that handling large amounts of text is not necessary. The user input clearly states that the data is already in a structured format (CSV) and the task is to categorize the complaints into 50 types, which does not require processing large amounts of unstructured text. Additionally, Llama notes that the input does not mention any requirements for text generation or output, further supporting the conclusion that handling extensive amounts of text is not necessary.

On the other hand, mixtral's response is less accurate. While it acknowledges that the user input does not explicitly mention the need for handling extensive amounts of text, it suggests that there may be a need for it based on the fact that the bank is receiving a large number of complaints and the data is in CSV format. However, this is not sufficient reason to assume that handli

In [53]:
print(llama_response_from_orderB)

Answer:
The response produced by Llama is better.

Reasoning:
Llama's response accurately assesses the user's requirements and correctly determines that handling large amounts of text is not necessary. The data is already in a structured format (CSV), and the task is to categorize the complaints into 50 types, which can be done without processing large amounts of unstructured text. Additionally, the user requirements do not mention any requirements for text generation or output, further supporting Llama's conclusion.

In contrast, Mixtral's response is less accurate. While it acknowledges that the user input does not explicitly mention the need for handling extensive amounts of text, it suggests that there may be a need for it based on the fact that the bank is receiving a large number of complaints and the data is in CSV format. However, this is not sufficient reason to assume that handling large amounts of text is necessary. Furthermore, Mixtral's response does not provide a clear co

In [60]:
user_preferences_prompt_template = """ 
From the user input, determine their position on how they prioritise cost vs performance.
Here is the user input: {user_request}. The solution needs to be cost effective.
Explain your reasoning concisely.
Only use the information given.
"""

mixtral_response = mixtral.invoke(user_preferences_prompt_template.format(user_request=user_request))
llama_response = llama_70b.invoke(user_preferences_prompt_template.format(user_request=user_request))

In [61]:
mixtral_response

"\nThe user prioritises cost over performance.\n\nThe user wants to categorise complaints into a list of 50 complaint types.\nThey want to use a model to categories the root cause issues.\nThe data is in CSV format, stored in the AW3 cloud environment.\nThe solution needs to be cost effective.\n\nThe user can use a pre-trained model, such as BERT, to categorise the complaints.\nThis would be a cost-effective solution, as they would not need to train their own model.\nAdditionally, using a pre-trained model would likely be faster than training their own model, as the pre-trained model has already been trained on a large dataset.\n\nHowever, the user may not get the same level of performance as they would with a custom model.\nThe pre-trained model may not be as accurate in categorising the complaints as a custom model.\nAdditionally, the pre-trained model may not be able to handle the specific format of the user's data, as it is in CSV format and stored in the AW3 cloud environment.\n\n

In [62]:
llama_response

'\nMy reasoning is as follows:\nThe user has specified that the solution needs to be cost-effective, which suggests that they prioritize cost over performance. They also mention that the data is in CSV format and stored in the AW3 cloud environment, which suggests that they are looking for a solution that is easy to implement and does not require a lot of additional resources.\n\nTherefore, I would recommend a solution that uses a simple machine learning algorithm, such as a decision tree or random forest, to categorize the complaints. These algorithms are relatively easy to implement and can be trained on the CSV data without requiring a lot of additional resources. Additionally, they are relatively inexpensive to train and deploy compared to more complex algorithms or LLMs.\n\nIn summary, based on the user input, I would prioritize cost over performance and recommend a simple machine learning algorithm to categorize the complaints in a cost-effective manner.'

In [None]:

- Application to a Specific Request: Apply the above instructions to analyse and convert the given user request into structured requirements. Ensure to adapt the analysis based on the specific details and needs presented in the user's request.
