# Extract common question forms in Dolly data

By parsing the Dolly corpus questions we can identify if it's a question, and what form of question it is. Then this can be used to check its assigned category, and the embedding-based classification of the question type. 

jma 30 Aug 2023

In [1]:
# Load json version 
import re, os, sys
import spacy as sp          # A production quality linguistic parser
import numpy as np
import pandas as pd

DATA = '/mnt/512G_hd/repos/dolly_data/databricks-dolly-15k.jsonl'

In [2]:
d_data = pd.read_json(DATA, lines=True)
d_data.columns

Index(['instruction', 'context', 'response', 'category'], dtype='object')

In [3]:
# load the parser for English
english_language = sp.load('en_core_web_trf')


In [10]:
# Parse a sentence.  This give us the raw features to tell if its a question. 

def run_parse(the_sentence):
    phrase = english_language(the_sentence)
    # print([(i, i.label_) for i in phrase.ents]) We dont need the entities. 
    the_parse = []
    found_root = False
    for k, token in enumerate(phrase):
        # 'Stop' words are the closed classes e.g. pronouns, of words.  Only a small finite number of words make up the class.
        # Stop words  plus the auxilaries and verb give us all the gramatical structure we need.
        if token.dep_ =='ROOT':
            the_parse.append((token.lemma_, token.tag_, sp.explain(token.tag_), token.pos_, token.dep_, k))
            # One word after after the main verb is needed for inversions.  
            found_root = True
        elif token.is_stop or found_root:
            # print(token.text, token.lemma_, end = '\t')
            # print(token.is_stop, token.tag_, token.pos_, token.dep_)
            the_parse.append((token.lemma_, token.tag_, sp.explain(token.tag_), token.pos_, token.dep_,k))
            # The root is the main verb in the sentence
            # if found_root:
            #     break
    return the_parse

def get_lemma(token_parse):
    return token_parse[0]

def get_tag(token_parse):
    return token_parse[1]

def get_pos(token_parse):
    return token_parse[3] 

def get_dep(token_parse):
    return token_parse[4] 

p = run_parse("Is the bread objective or subjective?.")
print([get_lemma(z) for z in p])
print([get_tag(z) for z in p])
print([get_pos(z) for z in p])
print([get_dep(z) for z in p])

p

['be', 'the']
['VBZ', 'DT']
['AUX', 'DET']
['ROOT', 'det']


[('be', 'VBZ', 'verb, 3rd person singular present', 'AUX', 'ROOT', 0),
 ('the', 'DT', 'determiner', 'DET', 'det', 1)]

In [15]:
def full_parse(the_sentence):
    phrase = english_language(the_sentence)
    print('k,\tlemma,\ttag,\tpos,\tdep')
    for k, token in enumerate(phrase):
        print(f'{k},\t{token.lemma_},\t{token.tag_},\t{token.pos_},\t{token.dep_}')

full_parse("Is the bread objective or subjective?.")

k,	lemma,	tag,	pos,	dep
0,	be,	VBZ,	AUX,	ROOT
1,	the,	DT,	DET,	det
2,	bread,	NN,	NOUN,	nsubj
3,	objective,	JJ,	ADJ,	acomp
4,	or,	CC,	CCONJ,	cc
5,	subjective,	JJ,	ADJ,	conj
6,	?,	.,	PUNCT,	punct
7,	.,	.,	PUNCT,	punct


In [45]:
# 1st rule -  inversion:  auxiliary followed by noun phase 
def wh_question(the_parse):
    features = None
    # Starts with a question word?
    if (get_tag(the_parse[0]) in ('WP', 'WRB', 'WDT')) and (get_pos(the_parse[1]) in ('AUX', 'VERB')):
        features = ('WHQ', get_tag(the_parse[0]), get_pos(the_parse[1]))
    return features

print(wh_question(run_parse('What is the currency in use in the Netherlands?')))
print(wh_question(run_parse('Who became king of Holland in 1806?')))
print(wh_question(run_parse('How was the king of Holland in 1806?')))
print(wh_question(run_parse('How tall was the king of Holland in 1806?')))
print(wh_question(run_parse('How does king of Holland make money?')))
print(wh_question(run_parse('How large are your teeth grandma?')))
print(wh_question(run_parse('Which are the best places?')))

('WHQ', 'WP', 'AUX')
('WHQ', 'WP', 'VERB')
('WHQ', 'WRB', 'AUX')
('WHQ', 'WRB', 'AUX')
('WHQ', 'WRB', 'AUX')
('WHQ', 'WRB', 'AUX')
('WHQ', 'WDT', 'AUX')


In [57]:
def did_question(the_parse):
    # When inversions are not used (Go you home?) the phrase is prefaced by did to make a question.
    features = None
    if get_lemma(the_parse[0]) == 'do':
        features = ('DO', get_tag(the_parse[0]))
    return features

print(did_question(run_parse('Did you go to the best places?')))
print(did_question(run_parse("Don't the trees look nice?")))

('DO', 'VBD')
('DO', 'VBP')


In [20]:
def is_question(the_parse):
    # The verb form 'be' placed before its subject indicates a question
    print(the_parse)
    features = None
    # Starts with 'be' as the ROOT, followed by it's subject noun. 
    if (get_lemma(the_parse[0]) == 'be') and\
       (get_dep(the_parse[0]) == 'ROOT'):
       # Look for a subject
       for p in the_parse[1:]:
          if (get_dep(p) == 'nsubj') and\
             (get_pos(p) in ('PRON', 'PROPN', 'NOUN')):
             features = ('QIS',get_lemma(the_parse[0]), get_lemma(the_parse[1]) )
    return features

# print(is_question(run_parse("Are trees nice?")))
print(is_question(run_parse("Are the he looking nice?")))
# print(is_question(run_parse("is bread something Korean?")))
# print(is_question(run_parse("Would it be better?")))


[('be', 'VBP', 'verb, non-3rd person singular present', 'AUX', 'aux', 0), ('the', 'DT', 'determiner', 'PRON', 'dep', 1), ('he', 'PRP', 'pronoun, personal', 'PRON', 'nsubj', 2), ('look', 'VBG', 'verb, gerund or present participle', 'VERB', 'ROOT', 3), ('nice', 'JJ', 'adjective (English), other noun-modifier (Chinese)', 'ADJ', 'acomp', 4)]
None


In [98]:
def tst_for_question(the_sentence):
    'test for a question form, and if so, return its features'
    # TODO Extend this to ignore a phrase that prefaces the question. (e.g. the question word is not first.)
    feature = None
    p = run_parse(the_sentence)
    if feature := did_question(p):
        pass
    elif feature := inversion_question(p):
        pass
    elif feature := wh_question(p):
        pass
    return feature

print(is_question('Is beauty objective or subjective?'))
print(is_question('Did dinosaurs have lips?'))
print(is_question('Who played Billy the Kid in The Left Handed Gun'))
print(is_question('Please summarize what Linkedin does.'))
print(is_question('Give me what you find.'))

('INV', 'AUX', 'ROOT')
('DO', 'VBD')
('WHQ', 'WP', 'VERB')
None
('INV', 'VERB', 'ROOT')


In [91]:
parse = [is_question(instruction) for instruction in d_data.loc[:, 'instruction']]
p_data = pd.concat([pd.Series(parse), d_data], axis=1)
p_data.columns = ['parse', 'instruction', 'context', 'response', 'category']
p_data.head()

Unnamed: 0,0,instruction,context,response,category
0,"(WHQ, WRB, AUX)",When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,"(WHQ, WDT, AUX)",Which is a species of fish? Tope or Rope,,Tope,classification
2,"(WHQ, WRB, AUX)",Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,"(WHQ, WRB, AUX)",When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa
...,...,...,...,...,...
15006,"(WHQ, WRB, AUX)",How do i accept the change,,Embrace the change and see the difference,brainstorming
15007,"(WHQ, WP, AUX)",What is a laser and who created it?,A laser is a device that emits light through a...,A laser is a device that emits light from an e...,summarization
15008,"(WHQ, WP, AUX)",What is the difference between a road bike and...,,Road bikes are built to be ridden on asphalt a...,open_qa
15009,"(WHQ, WRB, AUX)",How does GIS help in the real estate investmen...,,"Real estate investors depend on precise, accur...",general_qa


In [97]:
p_data[['parse', 'instruction','category']].to_csv('dolly-instruction_parse_15k.csv', na_rep='NULL')