# Extract common question forms in Dolly data

By parsing the Dolly corpus questions we can identify if it's a question, and what form of question it is. Then this can be used to check its assigned category, and the embedding-based classification of the question type. 

This notebook defines 3 heuristic rules to identify questions - wh_question(), did_question(), is_question() that use the stop word and root from a sentence to classify sentences.
It saves the result in 'misc/dolly-instruction_parse_15k.csv'

jma 30 Aug 2023

In [5]:
# Load json version 
import re, os, sys, pprint
import spacy as sp          # A production quality linguistic parser
import numpy as np
import pandas as pd
import seaborn as sns

DATA = '/mnt/512G_hd/repos/dolly_data/databricks-dolly-15k.jsonl'

In [2]:
d_data = pd.read_json(DATA, lines=True)
d_data.columns

Index(['instruction', 'context', 'response', 'category'], dtype='object')

In [3]:
# load the parser for English
english_language = sp.load('en_core_web_trf')


In [60]:
# Parse a sentence.  This give us the raw features to tell if its a question. 

def run_parse(the_sentence):
    phrase = english_language(the_sentence)
    # print([(i, i.label_) for i in phrase.ents]) We dont need the entities. 
    the_parse = []
    found_root = False
    for k, token in enumerate(phrase):
        # 'Stop' words are the closed classes e.g. pronouns, of words.  Only a small finite number of words make up the class.
        # Stop words  plus the auxilaries and verb give us all the gramatical structure we need.
        if token.dep_ in ('ROOT', 'aux'):
            the_parse.append((token.lemma_, token.tag_, sp.explain(token.tag_), token.pos_, token.dep_, k))
            # One word after after the main verb is needed for inversions.  
            found_root = True
        elif token.is_stop or found_root:
            # print(token.text, token.lemma_, end = '\t')
            # print(token.is_stop, token.tag_, token.pos_, token.dep_)
            the_parse.append((token.lemma_, token.tag_, sp.explain(token.tag_), token.pos_, token.dep_,k))
            # The root is the main verb in the sentence
            # if found_root:
            #     break
    return the_parse

def get_lemma(token_parse):
    return token_parse[0]

def get_tag(token_parse):
    return token_parse[1]

def get_pos(token_parse):
    return token_parse[3] 

def get_dep(token_parse):
    return token_parse[4] 

p = run_parse("Is the bread objective or subjective?.")
print([get_lemma(z) for z in p])
print([get_tag(z) for z in p])
print([get_pos(z) for z in p])
print([get_dep(z) for z in p])

run_parse("Do the places look nice?")

['be', 'the', 'bread', 'objective', 'or', 'subjective', '?', '.']
['VBZ', 'DT', 'NN', 'JJ', 'CC', 'JJ', '.', '.']
['AUX', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'ADJ', 'PUNCT', 'PUNCT']
['ROOT', 'det', 'nsubj', 'acomp', 'cc', 'conj', 'punct', 'punct']


[('do', 'VBP', 'verb, non-3rd person singular present', 'AUX', 'aux', 0),
 ('the', 'DT', 'determiner', 'DET', 'det', 1),
 ('place', 'NNS', 'noun, plural', 'NOUN', 'nsubj', 2),
 ('look', 'VB', 'verb, base form', 'VERB', 'ROOT', 3),
 ('nice',
  'JJ',
  'adjective (English), other noun-modifier (Chinese)',
  'ADJ',
  'acomp',
  4),
 ('?', '.', 'punctuation mark, sentence closer', 'PUNCT', 'punct', 5)]

In [61]:
def full_parse(the_sentence):
    phrase = english_language(the_sentence)
    print('k,\tlemma,\ttag,\tpos,\tdep')
    for k, token in enumerate(phrase):
        print(f'{k},\t{token.lemma_},\t{token.tag_},\t{token.pos_},\t{token.dep_}')

#full_parse("Is the bread objective or subjective?.")
full_parse('Did you go to the best places?')
full_parse("Don't the trees look nice?")

k,	lemma,	tag,	pos,	dep
0,	do,	VBD,	AUX,	aux
1,	you,	PRP,	PRON,	nsubj
2,	go,	VB,	VERB,	ROOT
3,	to,	IN,	ADP,	prep
4,	the,	DT,	DET,	det
5,	good,	JJS,	ADJ,	amod
6,	place,	NNS,	NOUN,	pobj
7,	?,	.,	PUNCT,	punct
k,	lemma,	tag,	pos,	dep
0,	do,	VBP,	AUX,	aux
1,	n't,	RB,	PART,	neg
2,	the,	DT,	DET,	det
3,	tree,	NNS,	NOUN,	nsubj
4,	look,	VB,	VERB,	ROOT
5,	nice,	JJ,	ADJ,	acomp
6,	?,	.,	PUNCT,	punct


In [84]:
# 1st rule -  inversion:  auxiliary followed by noun phase 
def wh_question(the_parse):
    features = None # {'qlabel': None, 'lemmas': ()}
    # Starts with a question word?
    if (get_tag(the_parse[0]) in ('WP', 'WRB', 'WDT')) and (get_pos(the_parse[1]) in ('AUX', 'VERB')):
        features = {'qlabel':'WHQ', 'lemmas': (get_lemma(the_parse[0]), get_lemma(the_parse[1]))}
    return features

print(wh_question(run_parse('What is the currency in use in the Netherlands?')))
print(wh_question(run_parse('Who became king of Holland in 1806?')))
print(wh_question(run_parse('How was the king of Holland in 1806?')))
print(wh_question(run_parse('How tall was the king of Holland in 1806?')))
print(wh_question(run_parse('How does king of Holland make money?')))
print(wh_question(run_parse('How large are your teeth grandma?')))
print(wh_question(run_parse('Which are the best places?')))

{'qlabel': 'WHQ', 'lemmas': ('what', 'be')}
{'qlabel': 'WHQ', 'lemmas': ('who', 'become')}
{'qlabel': 'WHQ', 'lemmas': ('how', 'be')}
{'qlabel': 'WHQ', 'lemmas': ('how', 'be')}
{'qlabel': 'WHQ', 'lemmas': ('how', 'do')}
{'qlabel': 'WHQ', 'lemmas': ('how', 'be')}
{'qlabel': 'WHQ', 'lemmas': ('which', 'be')}


In [83]:
def did_question(the_parse):
    # When inversions are not used (Go you home?) the phrase is prefaced by did to make a question.
    features =  None # {'qlabel': None, 'lemmas': ()}
    if get_lemma(the_parse[0]) == 'do':
        # pprint.pprint(the_parse)
        # Look for a subject
        for p in the_parse[1:]:
            # print(f'p- {p}')
            if (get_dep(p) == 'nsubj') and\
                (get_pos(p) in ('PRON', 'PROPN', 'NOUN')):
                features = {'qlabel':'DOQ', 'lemmas': (get_lemma(the_parse[0]), get_lemma(p)) }
    return features

print(did_question(run_parse('Did you go to the best places?')))
print(did_question(run_parse("Do the places look nice?")))

{'qlabel': 'DOQ', 'lemmas': ('do', 'you')}
{'qlabel': 'DOQ', 'lemmas': ('do', 'place')}


In [82]:
def is_question(the_parse):
    # The verb form 'be' placed before its subject indicates a question
    # pprint.pprint(the_parse)
    features = None  # {'qlabel': None, 'lemmas': ()}
    # Starts with 'be' as the ROOT, followed by it's subject noun. 
    if (get_lemma(the_parse[0]) == 'be') and\
       (get_dep(the_parse[0]) in ('ROOT', 'AUX')):
       # Look for a subject
       for p in the_parse[1:]:
          # print(f'p- {p}')
          if (get_dep(p) == 'nsubj') and\
             (get_pos(p) in ('PRON', 'PROPN', 'NOUN')):
             features = {'qlabel':'QIS', 'lemmas': (get_lemma(the_parse[0]), get_lemma(p)) }
    return features

print(is_question(run_parse("Aren't trees nice?")))
print(is_question(run_parse("Are the trees nice?")))
print(is_question(run_parse("is bread something Korean?")))
print(is_question(run_parse("Would it be better?")))


{'qlabel': 'QIS', 'lemmas': ('be', 'tree')}
{'qlabel': 'QIS', 'lemmas': ('be', 'tree')}
{'qlabel': 'QIS', 'lemmas': ('be', 'bread')}
None


In [88]:
def tst_for_question(the_sentence):
    'test for a question form, and if so, return its features'
    # TODO Extend this to ignore a phrase that prefaces the question. (e.g. the question word is not first.)
    
    p = run_parse(the_sentence)
    if feature := did_question(p):
        pass
    elif feature := is_question(p):
        pass
    elif feature := wh_question(p):
        pass
    else:
        feature = {'qlabel': None, 'lemmas': ()}
    return feature

print(tst_for_question("Aren't trees nice?"))
print(tst_for_question('Is beauty objective or subjective?'))
print(tst_for_question('Did dinosaurs have lips?'))
print(tst_for_question('Who played Billy the Kid in The Left Handed Gun'))
print(tst_for_question('Please summarize what Linkedin does.'))
print(tst_for_question('Give me what you find.'))

{'qlabel': 'QIS', 'lemmas': ('be', 'tree')}
{'qlabel': 'QIS', 'lemmas': ('be', 'beauty')}
{'qlabel': 'DOQ', 'lemmas': ('do', 'dinosaur')}
{'qlabel': 'WHQ', 'lemmas': ('who', 'play')}
{'qlabel': None, 'lemmas': ()}
{'qlabel': None, 'lemmas': ()}


In [91]:
parse = [tst_for_question(instruction) for instruction in d_data.loc[ :, 'instruction']]
p_data = pd.concat([pd.DataFrame(parse), d_data], axis=1)
# p_data.columns = ['parse', 'instruction', 'context', 'response', 'category']
#p_data #.head()

In [94]:
p_data[['qlabel', 'lemmas', 'instruction','category']].to_csv('dolly-instruction_parse_15k.csv', na_rep='NULL')

In [10]:
# To continue with the saved data
p_data = pd.read_csv('dolly-instruction_parse_15k.csv')
p_data.fillna('NQ', inplace=True)                        # Not a question. 
pd.crosstab(p_data['qlabel'], p_data['category'])
# Note that the correlation between category labels and if it is a question is not strong. 

category,brainstorming,classification,closed_qa,creative_writing,general_qa,information_extraction,open_qa,summarization
qlabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DOQ,4,2,21,11,31,8,34,5
NQ,970,1835,1006,480,459,942,851,570
QIS,22,42,35,11,78,13,72,24
WHQ,770,257,711,207,1623,543,2785,589


In [11]:
set(p_data['qlabel'])

{'DOQ', 'NQ', 'QIS', 'WHQ'}