# Feature Extraction

Now that we know that the sentences in procedures are imperatives and contain verbs in the form of infinitives and gerunds, let's derive features that characterize them.

In [1]:
import pandas as pd
import os
import spacy

# Load the `en_core_web_sm` spaCy pipeline once; every example and
# feature-extraction cell below parses text through this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Relative directory holding the input CSVs; the feature CSVs produced at the
# end of the notebook are written to the same directory.
DATA_PATH = 'data'
# Training split; later cells read its 'Lists' and 'Labels' columns.
train_df = pd.read_csv(os.path.join(DATA_PATH, 'procedure_train_data.csv'), encoding='utf-8')
# NOTE(review): this reads the same 'procedure_train_data.csv' as train_df, so
# test_df is currently a copy of the training data. Presumably a separate test
# CSV was intended -- confirm the file name and point this line at it.
test_df = pd.read_csv(os.path.join(DATA_PATH, 'procedure_train_data.csv'), encoding='utf-8')

## Example for an imperative

In [4]:
# Imperative example: parse one step and print the annotations of each token.
doc = nlp("choose tools - autocorrect , and ensure that while typing is selected.")

# Printed columns, in order: text, lemma, coarse POS, fine-grained tag,
# integer tag id, dependency label, shape, is_alpha, is_stop.
for token in doc:
    print(*(getattr(token, attr) for attr in (
        'text', 'lemma_', 'pos_', 'tag_', 'tag',
        'dep_', 'shape_', 'is_alpha', 'is_stop',
    )))

choose choose VERB VB 14200088355797579614 ROOT xxxx True False
tools tool NOUN NNS 783433942507015291 compound xxxx True False
- - PUNCT HYPH 8214596291009089021 punct - False False
autocorrect autocorrect NOUN NN 15308085513773655218 dobj xxxx True False
, , PUNCT , 2593208677638477497 punct , False False
and and CCONJ CC 17571114184892886314 cc xxx True True
ensure ensure VERB VB 14200088355797579614 conj xxxx True False
that that SCONJ IN 1292078113972184607 mark xxxx True True
while while SCONJ IN 1292078113972184607 mark xxxx True True
typing typing NOUN NN 15308085513773655218 nsubjpass xxxx True False
is be AUX VBZ 13927759927860985106 auxpass xx True True
selected select VERB VBN 3822385049556375858 ccomp xxxx True False
. . PUNCT . 12646065887601541794 punct . False False


## Example for an infinitive

In [5]:
# Infinitive example: the step opens with "to" followed by a base-form verb.
doc = nlp("to only apply the new page style to a single page, select default.")

# Printed columns, in order: text, lemma, coarse POS, fine-grained tag,
# integer tag id, dependency label, shape, is_alpha, is_stop.
for token in doc:
    print(*(getattr(token, attr) for attr in (
        'text', 'lemma_', 'pos_', 'tag_', 'tag',
        'dep_', 'shape_', 'is_alpha', 'is_stop',
    )))

to to PART TO 5595707737748328492 aux xx True True
only only ADV RB 164681854541413346 advmod xxxx True True
apply apply VERB VB 14200088355797579614 advcl xxxx True False
the the DET DT 15267657372422890137 det xxx True True
new new ADJ JJ 10554686591937588953 amod xxx True False
page page NOUN NN 15308085513773655218 compound xxxx True False
style style NOUN NN 15308085513773655218 dobj xxxx True False
to to ADP IN 1292078113972184607 prep xx True True
a a DET DT 15267657372422890137 det x True True
single single ADJ JJ 10554686591937588953 amod xxxx True False
page page NOUN NN 15308085513773655218 pobj xxxx True False
, , PUNCT , 2593208677638477497 punct , False False
select select VERB VB 14200088355797579614 ROOT xxxx True False
default default NOUN NN 15308085513773655218 dobj xxxx True False
. . PUNCT . 12646065887601541794 punct . False False


## Example of a gerund

In [6]:
# Two more parses, separated by a row of asterisks: the first contains a
# gerund-tagged token ("existing"), the second opens with an infinitive.
for index, sentence in enumerate((
    "double-click on an existing title text.",
    "to remove the number and the indent of the paragraph, click the numbering onoff icon on the formatting bar",
)):
    if index:
        print('*'*30)
    doc = nlp(sentence)
    # Printed columns, in order: text, lemma, coarse POS, fine-grained tag,
    # integer tag id, dependency label, shape, is_alpha, is_stop.
    for token in doc:
        print(*(getattr(token, attr) for attr in (
            'text', 'lemma_', 'pos_', 'tag_', 'tag',
            'dep_', 'shape_', 'is_alpha', 'is_stop',
        )))

double double ADJ JJ 10554686591937588953 amod xxxx True False
- - PUNCT HYPH 8214596291009089021 punct - False False
click click NOUN NN 15308085513773655218 ROOT xxxx True False
on on ADP IN 1292078113972184607 prep xx True True
an an DET DT 15267657372422890137 det xx True True
existing exist VERB VBG 1534113631682161808 amod xxxx True False
title title NOUN NN 15308085513773655218 compound xxxx True False
text text NOUN NN 15308085513773655218 pobj xxxx True False
. . PUNCT . 12646065887601541794 punct . False False
******************************
to to PART TO 5595707737748328492 aux xx True True
remove remove VERB VB 14200088355797579614 advcl xxxx True False
the the DET DT 15267657372422890137 det xxx True True
number number NOUN NN 15308085513773655218 dobj xxxx True False
and and CCONJ CC 17571114184892886314 cc xxx True True
the the DET DT 15267657372422890137 det xxx True True
indent indent NOUN NN 15308085513773655218 conj xxxx True False
of of ADP IN 1292078113972184607 pre

Let's extract the features that characterize imperatives, infinitives and gerunds:
- Number of sentences starting with a verb
- Number of sentences without a subject
- Number of infinitive verbs
- Number of gerunds

- Average length of steps in a procedure

In [7]:
from spacy.symbols import nsubj, VERB, PART
from ast import literal_eval
from spacy.attrs import DEP, POS, LENGTH, TAG
# Integer ids of two fine-grained tags, as shown in the `token.tag` column of
# the example outputs above.
# Tag "TO": the infinitival marker "to".
TO = 5595707737748328492
# Tag "VBG": the gerund verb form.
VBG = 1534113631682161808

Each procedure/non-procedure contains the text of its instructions/steps, separated by the delimiter ' <st\> '.

In [9]:
# Show the raw text of the first training row, with its ' <st> ' delimiters.
train_df.loc[0, 'Lists']

'choose view - styles . <st> click the page styles icon. <st> right-click a page style and choose new . the new page style initially gets all properties of the selected page style. <st> on the organizer tab page, type a name for the page style in the name box, for example "my landscape". <st> in the next style box, select the page style that you want to apply to the next page that follows a page with the new style. see the section about the scope of page styles at the end of this help page. <st> click the page tab. <st> under paper format , select \'portrait\' or \'landscape\'. <st> click ok .'

Let's split them to extract each step into a list. 

In [11]:
# Delimiter that separates the individual steps inside a 'Lists' entry.
STEP_DELIMITER = ' <st> '

# Break every procedure into a list of its steps.
train_lists = train_df['Lists'].apply(lambda x: x.split(STEP_DELIMITER)).tolist()
# The test steps must come from test_df; they were previously split from
# train_df, so the "test" features were computed on the training text.
test_lists = test_df['Lists'].apply(lambda x: x.split(STEP_DELIMITER)).tolist()
train_lists[0]

['choose view - styles .',
 'click the page styles icon.',
 'right-click a page style and choose new . the new page style initially gets all properties of the selected page style.',
 'on the organizer tab page, type a name for the page style in the name box, for example "my landscape".',
 'in the next style box, select the page style that you want to apply to the next page that follows a page with the new style. see the section about the scope of page styles at the end of this help page.',
 'click the page tab.',
 "under paper format , select 'portrait' or 'landscape'.",
 'click ok .']

In [12]:
# Five independent, empty accumulators -- one per feature; the extraction loop
# appends one value per procedure to each of them.
list_verb, list_subj, list_gerund, list_infinitive, avg_len = ([] for _ in range(5))

In [13]:
# Append one value per training procedure to each feature list.
for li in train_lists:
    # One array per step: a row per token, columns in the order DEP, POS, TAG.
    doc_arrays = [doc.to_array([DEP, POS, TAG]) for doc in map(nlp, li)]
    # Steps with more than two tokens and no `nsubj` dependency.
    list_subj.append(sum(doc_array.shape[0] > 2 and nsubj not in doc_array[:, 0] for doc_array in doc_arrays))
    # Steps with more than one token whose first token is a VERB. The length
    # check runs first so that an empty step (zero tokens) is skipped instead
    # of raising IndexError on doc_array[0, 1].
    list_verb.append(sum(doc_array.shape[0] > 1 and VERB == doc_array[0, 1] for doc_array in doc_arrays))
    # Steps that contain a VBG tag and a VERB part of speech (checked
    # independently, not necessarily on the same token).
    list_gerund.append(sum(VBG in doc_array[:, 2] and VERB in doc_array[:, 1] for doc_array in doc_arrays))
    # Steps that contain a TO tag and a PART part of speech (checked independently).
    list_infinitive.append(sum(TO in doc_array[:, 2] and PART in doc_array[:, 1] for doc_array in doc_arrays))
    # Mean number of tokens per step in this procedure.
    avg_len.append(sum(doc_array.shape[0] for doc_array in doc_arrays) / len(li))

In [18]:
# Put the training features next to the raw text and labels, one row per
# procedure, and write the table to the data directory.
df1 = pd.DataFrame(data={
    "Lists": train_df['Lists'],
    "Sents-No Subject": list_subj,
    "Sents-Starts with Verb": list_verb,
    "Avg Length": avg_len,
    "Gerunds": list_gerund,
    "Infinitives": list_infinitive,
    "Labels": train_df['Labels'],
})
df1.to_csv(path_or_buf=os.path.join(DATA_PATH, "dense_features_train_procedures.csv"))

In [19]:
# Recompute the same features for the test split. The accumulators are reset
# first so that nothing appended during the training pass carries over.
list_verb = []
list_subj = []
list_gerund = []
list_infinitive = []
avg_len = []
for li in test_lists:
    # One array per step: a row per token, columns in the order DEP, POS, TAG.
    doc_arrays = [doc.to_array([DEP, POS, TAG]) for doc in map(nlp, li)]
    # Steps with more than two tokens and no `nsubj` dependency.
    list_subj.append(sum(doc_array.shape[0] > 2 and nsubj not in doc_array[:, 0] for doc_array in doc_arrays))
    # Steps with more than one token whose first token is a VERB. The length
    # check runs first so that an empty step (zero tokens) is skipped instead
    # of raising IndexError on doc_array[0, 1].
    list_verb.append(sum(doc_array.shape[0] > 1 and VERB == doc_array[0, 1] for doc_array in doc_arrays))
    # Steps that contain a VBG tag and a VERB part of speech (checked
    # independently, not necessarily on the same token).
    list_gerund.append(sum(VBG in doc_array[:, 2] and VERB in doc_array[:, 1] for doc_array in doc_arrays))
    # Steps that contain a TO tag and a PART part of speech (checked independently).
    list_infinitive.append(sum(TO in doc_array[:, 2] and PART in doc_array[:, 1] for doc_array in doc_arrays))
    # Mean number of tokens per step in this procedure.
    avg_len.append(sum(doc_array.shape[0] for doc_array in doc_arrays) / len(li))

# Put the test features next to the raw text and labels and write them out.
df1 = pd.DataFrame({"Lists":test_df['Lists'], "Sents-No Subject":list_subj, "Sents-Starts with Verb":list_verb, "Avg Length":avg_len, "Gerunds": list_gerund, "Infinitives": list_infinitive, "Labels": test_df['Labels']})
df1.to_csv(os.path.join(DATA_PATH, "dense_features_test_procedures.csv"))