In [19]:
import spacy
import json

In [20]:
# Load the English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_trf")

In [21]:
example_questions = json.load(open("example_questions.json", "r"))

In [22]:
# create an empty list to store the keywords for each question
all_keywords = []

# loop through each question in the example_questions list
for question in example_questions:
    # process the sentence
    doc = nlp(question)
    
    # extract keywords by filtering parts of speech
    keywords = [token.text for token in doc if (token.pos_ in ('NOUN', 'PROPN', 'ADJ', 'VERB') and not token.is_stop)]
    
    # append the keywords to the all_keywords list
    all_keywords.append(keywords)

# print the results side by side
for i in range(len(example_questions)):
    print(f"Question: {example_questions[i]}")
    print(f"Keywords: {all_keywords[i]}")
    print()

Question: How loud are air conditioners allowed to be in urban areas in Germany?
Keywords: ['loud', 'air', 'conditioners', 'allowed', 'urban', 'areas', 'Germany']

Question: Who can verify the financial operations of the Nuclear Safety Account?
Keywords: ['verify', 'financial', 'operations', 'Nuclear', 'Safety', 'Account']

Question: Let's say a member state makes a change to an emergency plan regarding the gas supply. What does it need to do? Does it need to notify anyone?
Keywords: ['Let', 'member', 'state', 'makes', 'change', 'emergency', 'plan', 'gas', 'supply', 'need', 'need', 'notify']



In [30]:

def get_query_from_question(question: str) -> list:
    doc = nlp(question)

    # Improved POS tags
    pos_tags = ('NOUN', 'PROPN', 'VERB', 'ADJ')
    relevant_tokens = [token.text for token in doc if token.pos_ in pos_tags and token.text.lower() not in nlp.Defaults.stop_words]

    # Enhanced Dependency Parsing for phrase extraction
    phrases = []
    for token in doc:
        if token.pos_ in ('NOUN', 'PROPN') and token.dep_ in ('nsubj', 'dobj', 'pobj'):
            subtree_span = doc[token.left_edge.i : token.right_edge.i + 1]
            phrase = subtree_span.text
            if len(phrase.split()) > 1:  # Ensuring phrase length is more than one word
                phrases.append(phrase)

    # Named Entities
    entities = [ent.text for ent in doc.ents]

    # Combine and filter
    all_keywords = set(relevant_tokens + phrases + entities)
    query_keywords = [kw for kw in all_keywords if kw.lower() not in nlp.Defaults.stop_words]

    return list(set(query_keywords))

In [31]:
get_query_from_question(example_questions[2])

['an emergency plan',
 'state',
 'supply',
 'need',
 'emergency',
 'gas',
 'notify',
 'member',
 'Let',
 'a member state',
 'change',
 'a change to an emergency plan regarding the gas supply',
 'the gas supply',
 'plan',
 'makes']