In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Conversation
import json

In [10]:
model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

In [11]:
example_questions = json.load(open("example_questions.json", "r"))

In [12]:
def extract_keywords_prompt(question):
    # Submit the question to the model and ask it to extract keywords

    prompt = "Extract meaningful and relevant keywords from the following question:\n\n"
    prompt += question + "\n\n"
    prompt += "Format the keywords as a comma-separated list and enter them below.\n\n"
    prompt += "Keywords:\n"

    return prompt

In [13]:
extract_keywords_prompt(example_questions[0])

'Extract meaningful and relevant keywords from the following question:\n\nHow loud are air conditioners allowed to be in urban areas in Germany?\n\nFormat the keywords as a comma-separated list and enter them below.\n\nKeywords:\n'

In [14]:
answer = pipe(extract_keywords_prompt(example_questions[0]))[0]["generated_text"]



In [16]:
print(answer)

Extract meaningful and relevant keywords from the following question:

How loud are air conditioners allowed to be in urban areas in Germany?

Format the keywords as a comma-separated list and enter them below.

Keywords:

Leave bekan ehemalområ Fuß größområ gewünsche nicht loud, in allow air Germany conditions För mö hecho areas Fußball urban För frü Bür Ortschaft mö För Fuß Gebiet Fußball Fußball urban area Germany noise urban noise Fußball areas Fußball urban Gebiet Fußball För Stadt Fußball Fußball För Fuß Fuß urban areas För Fußball Fußball Fuß Urban areas ur Fuß Deutschland Fußball Fuß área Fußball Fußball urban Stadt Fuß Bür Fuß Fußball Fuß Bür Fußball Fußball Fuß Orts Fußball Fußball urban Fuß Fußball Fuß urban areas Fußball Fuß Fußball city Fußball Fußball Fuß urban City Fußball Bür Fußball Fuß Fußball Fuß Bür Fuß City Fußball Fußball City urban Fuß Stadt Fuß Bür City Fußball Fuß Fußball Bür Stadt Fußball Bür Fußballür Fuß Bür Bür Stadt Fußball Fuß Stadt urban Football Fuß Sta

# Question to Query (abandoned)

In [19]:
question = "How loud are air conditioners allowed to be in urban areas in Germany?"

In [20]:
prompt = "Write an elasticsearch query for the given question:\n" + question + "\n\nGET"

In [21]:
answer = pipe(prompt)[0]["generated_text"]



In [22]:
print(answer)

Write an elasticsearch query for the given question:
How loud are air conditioners allowed to be in urban areas in Germany?

GET /1/_search?q=how+loud+are+air+conditioners+allowed+to+be+in+urban+areas+in+germany&filter_path=%5B%24query.query.bool.must%5D%5B0%5D%24term%7Bfield%3A%22country%22%7D%5D&source=sourcetype%3Ddata.gov.uk%7Cdc.opendata.ch%7Cwww.europa-eu-united.de%24type%3A(event%2Corganisation)%7Cdwd.de%7Cwol.bz.it%7Cwww.weltansichtskarte.info%7Cwww.worldatlas.com%7Cwww.worldstatesmen.org%7Cwww.worldstatesment.org%7Cstatoids.com%7Cwww.britannica.com%7Ctheodora.com%7Cwww.historicalatlas.org%7Cen.wikipedia.org%7Cwww.loc.gov%7Cwww.nps.gov%7Cwww.usgennet.org%7Cwww.encyclopediadramatica.se%7Claw.wikia.com%7Cwww.facts-about.com%7Cwww.onthisday.com%7Cwww.historychannel.co.uk%7Cwww.historyonthenet.com%7Cwww.history.com%7Cwww.historyworld.net%7Cwww.historyplace.com%7Cwww.historylearningsite.com%7Cwww.britainexpress.com%7Cwww.ancient.eu%7Cwww.historytoday.com%7Cwww.bbc.co.uk%7Cwww.infopl

## Conversation

In [50]:
def create_pre_prompt(question):
  
  return """
    I have a question that I would like to transform into an Elasticsearch query for the EUR-Lex dataset:

    {question}""" + """

    I am using this Elasticsearch mapping:

    ```json
    {
      "properties": {
        "Classification": {
          "properties": {
            "Directory code": {
              "properties": {
                "code": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
                "level 1": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
                // levels 2-12 omitted for brevity
                "level 13": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}
              }
            },
            "EUROVOC descriptor": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
            "Subject matter": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}
          }
        },
        "Dates": {
          "properties": {
            "Date of document": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
            "Date of effect": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
            // similar date fields omitted for brevity
            "Deadline 13": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}
            // similar deadline fields omitted for brevity
          }
        },
        "Misc": {
          "properties": {
            "Additional information": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
            "Addressee": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}
            // other misc fields omitted for brevity
          }
        },
        "text": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}
      }
    }
    ```
"""

In [53]:
conversation = Conversation(create_pre_prompt(question))
conversation.add_message({'role': 'assistant', 'content': """
    [INST]
    You are a helpful, efficient and effective assistant that specializes in creating Elasticsearch queries from natural language questions. Given a question, proceed as follows:
    1. Understand the problem and structure it into subproblems if appropriate
    2. Understand the mapping of the available Elasticsearch data and how it relates to the subproblems
    3. Construct queries for each subproblem and report the results as a list of queries in ```json``` format
    [/INST]"""})

In [54]:
answer = pipe(conversation, max_length=1000, num_beams=5, clean_up_tokenization_spaces=True)



In [5]:
test_conversation = Conversation(question)
answer = pipe(test_conversation, max_length=2048)

Both `max_new_tokens` (=2048) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


## Divided

In [5]:
model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

In [6]:

def generate_prompt(task, information):
    return f"The following is a task for a sophisticated language model: {task}. Here is the information you need to consider: {information}"

def process_input(question):
    # Generate a prompt for understanding the question
    prompt = generate_prompt("Understand and decompose the user's question into subproblems", question)
    return pipe(prompt)[0]["generated_text"]

def construct_query(subproblem):
    # Generate a prompt for constructing the Elasticsearch query
    prompt = generate_prompt("Translate the subproblem into an Elasticsearch query", subproblem)
    return pipe(prompt)[0]["generated_text"]

In [7]:

# Example usage:
question = "How loud are air conditioners allowed to be in urban areas in Germany?"
subproblems = process_input(question)
# Assuming `your_index_mapping` is the actual Elasticsearch index structure you have
queries = [construct_query(subproblem) for subproblem in subproblems]



KeyboardInterrupt: 

In [None]:
subproblems

In [None]:
queries