# Extract common question forms in Dolly data

By parsing each instruction in the Dolly corpus we can identify whether it is a question and, if so, what form of question it is. This can then be used to check the instruction's assigned category and the embedding-based classification of the question type.

jma 30 Aug 2023

In [2]:
# Load json version 
import re, os, sys
import spacy as sp          # A production quality linguistic parser
import numpy as np
import pandas as pd

DATA = '/mnt/512G_hd/repos/dolly_data/databricks-dolly-15k.jsonl'

In [3]:
# Read the JSON-lines file (one JSON record per line) into a DataFrame,
# then display the column names to see which fields are available.
d_data = pd.read_json(DATA, lines=True)
d_data.columns

Index(['instruction', 'context', 'response', 'category'], dtype='object')

In [4]:
# Write only the instruction and category columns to 'dolly-15k.csv'
# (relative path, so it lands in the current working directory).
# NOTE(review): to_csv also writes the row index as the first column by
# default - confirm that is wanted, otherwise pass index=False.
d_data[['instruction', 'category']].to_csv('dolly-15k.csv')

In [5]:
# Load spaCy's 'en_core_web_trf' English pipeline. The resulting object is
# called on each sentence in run_parse to obtain tagged, dependency-parsed tokens.
english_language = sp.load('en_core_web_trf')


In [16]:
# Parse a sentence.  This gives us the raw features to tell if it's a question.

def run_parse(the_sentence):
    """Return the grammatical skeleton of a sentence, up to its main verb.

    The sentence is parsed with the module-level spaCy pipeline
    ``english_language``.  Only stop words and the ROOT token are kept:
    stop words are the closed classes of words (e.g. pronouns), where a
    small finite number of words make up the class, and together with the
    auxiliaries and the main verb they give all the grammatical structure
    needed.  Named entities are deliberately ignored.

    Scanning stops at the first token whose dependency label is ``ROOT``
    (the main verb), because words after it are not needed.

    Args:
        the_sentence: The text to parse.

    Returns:
        A list of ``(lemma, tag, tag explanation, pos, dependency)`` tuples
        in sentence order.  When a ROOT token is found it is the last entry.
    """
    the_parse = []
    for token in english_language(the_sentence):
        # Evaluate the ROOT test independently of the stop-word test: a main
        # verb that is itself a stop word (e.g. "is" in "What is ...") must
        # still end the scan.  Previously such a verb was recorded by the
        # stop-word branch and the loop carried on past it.
        is_root = token.dep_ == 'ROOT'
        if token.is_stop or is_root:
            the_parse.append((token.lemma_, token.tag_, sp.explain(token.tag_), token.pos_, token.dep_))
        if is_root:
            # Any words after the main verb are not needed.
            break
    return the_parse

# Example: parse a sample question and display the tuples that were kept.
# NOTE(review): the output recorded below lists tokens after the ROOT verb
# ('play'), which the `break` in run_parse should prevent - the recorded
# output looks stale; re-run the cell to confirm.
run_parse('Will Kyle Van Zyl be playing against when he scored 36 of his teams 61 points?')

[('will', 'MD', 'verb, modal auxiliary', 'AUX', 'aux'),
 ('be', 'VB', 'verb, base form', 'AUX', 'aux'),
 ('play', 'VBG', 'verb, gerund or present participle', 'VERB', 'ROOT'),
 ('against', 'IN', 'conjunction, subordinating or preposition', 'ADP', 'prep'),
 ('when', 'WRB', 'wh-adverb', 'SCONJ', 'advmod'),
 ('he', 'PRP', 'pronoun, personal', 'PRON', 'nsubj'),
 ('of', 'IN', 'conjunction, subordinating or preposition', 'ADP', 'prep'),
 ('his', 'PRP$', 'pronoun, possessive', 'PRON', 'poss')]

In [None]:
# 1st rule - inversion: auxiliary followed by noun phrase