# RE19- Linguistic datasets generation

This notebook takes a list of datasets as input and enriches them with linguistic features.

In [0]:
# Install the extra packages this notebook imports (skope-rules, benepar) plus their build prerequisites.
# NOTE(review): the installs are unpinned; the recorded output of this cell shows
# skope-rules 1.0.0 and benepar 0.1.2 being installed - consider pinning those versions.
!pip3 install cython numpy
!pip3 install skope-rules
!pip3 install benepar[cpu]

Collecting skope-rules
[?25l  Downloading https://files.pythonhosted.org/packages/56/b0/b56fb8d186f35089a469dc788c32ac99cf0276eae567736325b179b71db0/skope-rules-1.0.0.tar.gz (2.0MB)
[K    100% |████████████████████████████████| 2.0MB 13.1MB/s 
Building wheels for collected packages: skope-rules
  Building wheel for skope-rules (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/3e/8d/56/464f328ff3200c785626967ee39a6b2efc455469dab615f03e
Successfully built skope-rules
Installing collected packages: skope-rules
Successfully installed skope-rules-1.0.0
Collecting benepar[cpu]
[?25l  Downloading https://files.pythonhosted.org/packages/a0/7b/6cd9c60e1613a5ad388b4f883fa2aeaddcd8a7ad0a8d5ed87e0d23f159d8/benepar-0.1.2.tar.gz (72kB)
[K    100% |████████████████████████████████| 81kB 5.2MB/s 
Building wheels for collected packages: benepar
  Building wheel for benepar (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c6/f5/06/d88543b19a

## 1. Imports

In [0]:
# Import skope-rules
from skrules import SkopeRules


# Import libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve
from matplotlib import cm
import numpy as np
from sklearn.metrics import confusion_matrix
from IPython.display import display
import os


#Import basic NLTK tooling
import nltk
# Resources downloaded for the NLTK calls made later in the notebook
# (presumably the tagger model backs nltk.pos_tag - confirm against the NLTK docs)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps only runs of word characters (\w+), so punctuation is discarded
tokenizer = RegexpTokenizer(r'\w+')

#Import benepar parser
import benepar
# Model loaded later through benepar.Parser("benepar_en2")
benepar.download('benepar_en2')

#Tqdm, for the progress bar
from tqdm import tqdm

#Spacy
import spacy
# Pipeline used for the dependency labels, POS tags and sentence splitting of each requirement
nlp = spacy.load("en_core_web_sm")

# Stand-alone lemmatizer, used to lemmatize the ROOT token of each requirement
# NOTE(review): this import path and constructor look like the spaCy 2.x API - confirm the installed version
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package benepar_en2 to /root/nltk_data...


## 2. Constants and functions definition

In [0]:
def getDependenciesFeaturesSets(type):
  """
  Retrieves the set of linguistic features appropriate, based on the name of the feature set given as input.
  Nine groups of features, calculated in the notebook 04_ling_stats_calculator, are considered in this function:
  1. single dependencies
  2. combinations of 2 dependencies
  3. combinations of 3 dependencies
  4. single branches
  5. combinations of 2 branches
  6. combinations of 3 branches
  7. sequences of POSdep
  8. combinations of 2 sequences of POSdep
  9. combinations of 3 sequences of POSdep
  The following feature sets are defined:
  all: the top 10 features for each of groups 1-6, no features from group 7 (TOGETHER WITH ext IS USED IN THE PAPER AS FS3)
  sd: the single dependencies that appeared at least once in the top 10 features of groups 1-3, no feat from group 7 (USED IN THE PAPER AS FS1)
  sdsb: same as sd but for groups 1-6
  sdsb8sel02: the features with delta>0.2 in sdsb (TOGETHER WITH ext IS USED IN THE PAPER AS FS2)
  seq: the features in sdsb8sel02 + the top 10 features of groups 7-9
  ev: a merge of all the previous ones
  two: only dobj and nummod
  FinalSel: a final selection of 11 features (extended with 4 additional ones, outside this function is USED IN THE PAPER AS THE FINAL SET)
  N.B. the suffix ext refers to additional features added outside of this function
  @param type: one of the names of the feature sets above defined (optionally followed by the ext suffix);
               any name containing 'FinalSel' selects the FinalSel set, any unknown name yields empty lists
  @return: a tuple of seven lists, in this order: single dependencies, pairs of dependencies,
           triples of dependencies, single branches, pairs of branches, triples of branches, POSdep sequences
  """
  # Building blocks shared by several feature sets; each one is defined once so that
  # the sets that reuse them cannot drift apart.
  top_2dependencies = [['ROOT','nummod'], ['aux','nummod'], ['det','nummod'],
                       ['nummod','punct'], ['ROOT','dobj'], ['aux','dobj'],
                       ['nummod','pobj'], ['nsubj','dobj'], ['nsubj','nummod'], ['dobj','pobj']]
  top_3dependencies = [['ROOT','nummod','punct'], ['aux','ROOT','nummod'], ['aux','nummod','punct'],
                       ['det','ROOT','nummod'], ['det','nummod','punct'], ['det','aux','nummod'],
                       ['ROOT','det','dobj'], ['nsubj','det','dobj'], ['aux','det','dobj'],
                       ['nsubj','aux','dobj']]
  # The second element of the fourth pair is 'ROOT_aux' without trailing whitespace:
  # a padded name can never equal a branch path, which would leave that feature always 0.
  top_2branches = [['ROOT_dobj_det','ROOT_nsubj_det'],
                   ['ROOT_aux','ROOT_dobj_det'],
                   ['ROOT_dobj_det','ROOT_punct'],
                   ['ROOT_aux','ROOT_aux'],
                   ['ROOT_punct','ROOT_punct'],
                   ['ROOT_aux','ROOT_dobj_acl_aux'],
                   ['ROOT_dobj_acl_aux','ROOT_dobj_det'],
                   ['ROOT_dobj_acl_aux','ROOT_punct'],
                   ['ROOT_dobj_acl_aux','ROOT_nsubj_det'],
                   ['ROOT_prep_pobj_det','ROOT_punct']]
  top_3branches = [['ROOT_aux','ROOT_dobj_det','ROOT_nsubj_det'],
                   ['ROOT_dobj_det','ROOT_nsubj_det','ROOT_punct'],
                   ['ROOT_aux','ROOT_dobj_det','ROOT_punct'],
                   ['ROOT_aux','ROOT_aux','ROOT_punct'],
                   ['ROOT_aux','ROOT_punct','ROOT_punct'],
                   ['ROOT_aux','ROOT_aux','ROOT_nsubj_det'],
                   ['ROOT_nsubj_det','ROOT_punct','ROOT_punct'],
                   ['ROOT_aux','ROOT_dobj_acl_aux','ROOT_dobj_det'],
                   ['ROOT_aux','ROOT_dobj_acl_aux','ROOT_punct'],
                   ['ROOT_dobj_acl_aux','ROOT_dobj_det','ROOT_punct']]
  top_sequences = ['NNdobj', 'TOaux', 'NNPnsubj', 'RBadvmod', 'VBxcomp', 'VBauxpass', 'CDnummod', 'VBROOT', 'NNdobj_INprep', 'VBROOT_DTdet', 'DTdet_NNdobj', 'MDaux_VBROOT', 'JJacomp_TOaux', 'NNPnsubj_MDaux',
                   'MDaux_VBROOT_DTdet', 'VBROOT_DTdet_NNdobj', 'JJacomp_TOaux_VBxcomp', 'VBROOT_JJacomp_TOaux', 'MDaux_VBROOT_DTdet_NNdobj', 'MDaux_VBROOT_JJacomp_TOaux', 'VBROOT_JJacomp_TOaux_VBxcomp',
                   'MDaux', 'DTdet', 'NNnsubj', 'INprep']
  sd_dependencies = ['dobj', 'nummod', 'acl', 'amod', 'auxpass',
                     'advmod', 'nsubjpass', 'nsubj', 'nmod', 'aux', 'pobj', 'prep', 'det', 'punct']
  sel02_dependencies = ['dobj', 'acl', 'prep', 'det', 'pobj','aux', 'nsubj', 'punct']
  sel02_branches = ['ROOT_aux','ROOT_dobj_det','ROOT_punct','ROOT_nsubj_det']

  # Every group defaults to "no features"; each branch below only fills the groups it uses.
  significant_dependencies = []
  significant_2dependencies = []
  significant_3dependencies = []
  significant_branches = []
  significant_2branches = []
  significant_3branches = []
  significant_sequences = []

  if type in ('all', 'allext'):
    significant_dependencies = ['dobj', 'nummod', 'acl', 'amod', 'auxpass',
                                'advmod', 'nsubjpass', 'nsubj', 'nmod', 'advcl']
    significant_2dependencies = top_2dependencies
    significant_3dependencies = top_3dependencies
    significant_branches = ['ROOT_dobj_det','ROOT_dobj_acl_aux','ROOT_dobj_acl_dobj_det',
                            'ROOT_prep_pobj_det','ROOT_auxpass','ROOT_prep_pobj_compound','ROOT_nsubj',
                            'ROOT_ccomp_aux', 'ROOT_nsubj_nummod', 'ROOT_prep_pobj_nummod']
    significant_2branches = top_2branches
    significant_3branches = top_3branches

  elif type in ('sd', 'sdext'):
    significant_dependencies = sd_dependencies

  elif type in ('sdsb', 'sdsbext'):
    significant_dependencies = sd_dependencies
    # Each branch appears once: a repeated entry would be counted twice by the caller,
    # turning a 0/1 indicator into 0/2.
    significant_branches = ['ROOT_dobj_det','ROOT_dobj_acl_aux','ROOT_prep_pobj_det','ROOT_acomp_xcomp_aux','ROOT_nsubjpass_det',
                            'ROOT_dobj_acl_dobj_det','ROOT_prep_pobj_compound','ROOT_acomp_xcomp_dobj_det','ROOT_nsubj_det']

  elif type in ('sdsb8sel02', 'sdsb8sel02ext'):
    significant_dependencies = sel02_dependencies
    significant_branches = sel02_branches

  elif type in ('seq', 'seqext'):
    significant_dependencies = sel02_dependencies
    significant_branches = sel02_branches
    significant_sequences = top_sequences

  elif type in ('ev', 'evext'):
    significant_dependencies = ['dobj', 'nummod', 'acl', 'amod', 'auxpass',
                                'advmod', 'nsubjpass', 'nsubj', 'nmod', 'advcl', 'prep', 'det','pobj', 'aux', 'punct']
    significant_2dependencies = top_2dependencies
    significant_3dependencies = top_3dependencies
    significant_branches = ['ROOT_dobj_det', 'ROOT_acomp_xcomp_aux','ROOT_nsubjpass_det',
                            'ROOT_acomp_xcomp_dobj_det','ROOT_nsubj_det',
                            'ROOT_dobj_acl_aux',
                            'ROOT_dobj_acl_dobj_det',
                            'ROOT_prep_pobj_det',
                            'ROOT_auxpass',
                            'ROOT_prep_pobj_compound',
                            'ROOT_nsubj',
                            'ROOT_ccomp_aux',
                            'ROOT_nsubj_nummod',
                            'ROOT_prep_pobj_nummod', 'ROOT_aux', 'ROOT_punct']
    significant_2branches = top_2branches
    significant_3branches = top_3branches
    significant_sequences = top_sequences

  elif type=='two':
    significant_dependencies = ['dobj', 'nummod']

  elif 'FinalSel' in type:
    # Substring match on purpose: names such as 'FinalSel_verb' or 'FinalSel7' share this selection.
    significant_dependencies = ['nsubj', 'dobj', 'nummod', 'amod', 'acl', 'nmod', 'auxpass', 'nsubjpass', 'prep', 'pobj', 'advmod']

  return significant_dependencies, significant_2dependencies, significant_3dependencies, significant_branches, significant_2branches, significant_3branches, significant_sequences



def get_all_paths(node, h, max_h):
    """
    Lists the dependency paths (branches) that start at node, each rendered as the
    dependency labels met along the way joined by '_'.
    A path stops at a node without children or as soon as height max_h is reached.
    @param node: the root of the (sub)tree to explore
    @param h: the height of node (typically 0 for the root of the sentence)
    @param max_h: the height at which paths are cut
    @return: a list of strings representing paths
    """
    has_no_children = (node.n_lefts + node.n_rights) == 0
    if has_no_children or h == max_h:
        return [node.dep_]

    # Prefix every path found below each child with this node's own label.
    paths = []
    for child in node.children:
        for sub_path in get_all_paths(child, h + 1, max_h):
            paths.append(node.dep_ + '_' + str(sub_path))
    return paths


def _root_lemmas(req):
    """
    Returns the lemmas of the ROOT tokens of a requirement, after removing apostrophes and
    a few fixed phrases ('be able to', 'be capable of', ...) from its text.
    @param req: the text of the requirement
    @return: a list with one lemma per token whose dependency label is ROOT
    """
    cleaned = req.replace("'", "").replace('be able to', '').replace('be capable of', '').replace('provide the ability to', '').replace('be possible to', '')
    return [lemmatizer(t.orth_, t.pos_)[0] for t in nlp(cleaned) if t.dep_ == 'ROOT']


def _pos_tag_counts(req):
    """
    Tokenizes a requirement and counts its NLTK POS tags.
    @param req: the text of the requirement
    @return: the list of tokens and an nltk.FreqDist mapping each POS tag to its frequency
    """
    tokens = tokenizer.tokenize(req)
    #using nltk here but analogous to universal tags
    tag_counts = nltk.FreqDist(tag for (word, tag) in nltk.pos_tag(tokens))
    return tokens, tag_counts


def createEnrichedDataset(data, new_file_name, dep_feat_type):
    """
    Creates a <new_file_name>.csv file with dataset data enriched with the features in dep_feat_type
    The input dataframe is left untouched: the features are added to a private copy.
    @param data: the original dataset (must contain the column RequirementText)
    @param new_file_name: the name of the new dataset
    @param dep_feat_type: the type of feature sets, see function getDependenciesFeaturesSets for a description;
                          the substrings ext, FinalSel, verb and vlist switch on additional groups of features
    """

    columns_to_keep = ['ProjectID','RequirementText','Class','IsFunctional','IsQuality']
    # Selecting the columns always yields a new frame, so the caller's dataframe never receives
    # the feature columns; resetting the index guarantees that the positional row counter used
    # below with .at addresses existing rows.
    data = data[[c for c in data.columns if c in columns_to_keep]].reset_index(drop=True)

    # the presence of ext in dep_feat_type indicates that we want to extend the features obtained from function getDependenciesFeaturesSets
    # with additional features from literature
    use_ext = "ext" in dep_feat_type
    use_final_sel = "FinalSel" in dep_feat_type

    if use_ext:
        data['Length'] = [len(x) for x in data['RequirementText']]
        data['AdvMod'] = 0
        data['AMod'] = 0
        data['AComp'] = 0
        data['DTreeHeight'] = 0

    if use_final_sel:
        data['AComp'] = 0

    # get the features to use
    significant_dependencies, significant_2dependencies, significant_3dependencies, significant_branches, significant_2branches, significant_3branches, significant_sequences = getDependenciesFeaturesSets(dep_feat_type)

    # init columns of the dataframe for the appropriate features
    # (combinations are stored in a column named after their members joined by '+')
    for d in significant_dependencies:
        data[d] = 0
    for c in significant_2dependencies:
        data['+'.join(c)] = 0
    for t in significant_3dependencies:
        data['+'.join(t)] = 0
    for d in significant_branches:
        data[d] = 0
    for c in significant_2branches:
        data['+'.join(c)] = 0
    for t in significant_3branches:
        data['+'.join(t)] = 0
    for s in significant_sequences:
        data[s] = 0

    # loop for all rows in the original dataset
    for idx, req in enumerate(tqdm(data['RequirementText'], desc='spaCy analysis', position=0)):
        doc = nlp(req)

        # dependency labels of the requirement, and its POS+dependency sequence as a single string
        req_dep = [t.dep_ for t in doc]
        req_tagged_seq = ''.join(t.tag_ + t.dep_ + "_" for t in doc)

        # all the branches of all the sentences; sentences made of the root only are ignored
        dep_br = []
        for sent in doc.sents:
            sent_paths = get_all_paths(sent.root, 0, 15)
            if sent_paths != ['ROOT']:
                dep_br.extend(sent_paths)

        if use_ext or use_final_sel:
            max_height = 1
            for sent in doc.sents:
                for token in sent:
                    if use_ext:
                        # height of a token = number of its ancestors + 1
                        height = 1 + sum(1 for _ in token.ancestors)
                        if height > max_height:
                            max_height = height

                        # TODO: Limit to Root verb?
                        # Pattern 1: verb -> advmod -> adverb
                        if token.dep_ == 'advmod' and token.head.pos_ == 'VERB' and token.pos_ == 'ADV':
                            data.at[idx, 'AdvMod'] += 1

                        # Pattern 2: noun -> amod -> adjective, unless the noun is a subject
                        # Could be made stronger by making the head traversal recursive
                        if (token.dep_ == 'amod' and token.head.pos_ == 'NOUN' and token.pos_ == 'ADJ'
                                and token.head.dep_ != 'nsubj'):
                            data.at[idx, 'AMod'] += 1

                    # Pattern 3: verb -> acomp -> adjective, excluding 'able'
                    if (token.dep_ == 'acomp' and token.head.pos_ == 'VERB' and token.pos_ == 'ADJ'
                            and token.text != 'able'):
                        data.at[idx, 'AComp'] += 1

            if use_ext:
                # Max height of the dependency tree of a sentence of a given requirement
                data.at[idx, 'DTreeHeight'] = max_height

        # presence features: +1 when (all the members of) the feature occur in the requirement
        for d in significant_dependencies:
            if d in req_dep:
                data.at[idx, d] += 1
        for c in significant_2dependencies:
            if c[0] in req_dep and c[1] in req_dep:
                data.at[idx, '+'.join(c)] += 1
        for t in significant_3dependencies:
            if t[0] in req_dep and t[1] in req_dep and t[2] in req_dep:
                data.at[idx, '+'.join(t)] += 1

        for d in significant_branches:
            if d in dep_br:
                data.at[idx, d] += 1
        for c in significant_2branches:
            if c[0] in dep_br and c[1] in dep_br:
                data.at[idx, '+'.join(c)] += 1
        for t in significant_3branches:
            if t[0] in dep_br and t[1] in dep_br and t[2] in dep_br:
                data.at[idx, '+'.join(t)] += 1

        # sequences are matched as substrings of the tagged sequence
        for s in significant_sequences:
            if s in req_tagged_seq:
                data.at[idx, s] += 1

    if use_ext:
        parser = benepar.Parser("benepar_en2")
        data['Modal'] = 0
        data['Adjective'] = 0
        data['Noun'] = 0
        data['Adverb'] = 0
        data['Cardinal'] = 0
        data['CompSupAdj'] = 0
        data['CompSupAdv'] = 0
        data['Words'] = 0
        data['TreeHeight'] = 0
        data['SubTrees'] = 0
        for idx, req in enumerate(tqdm(data['RequirementText'], desc='Parse trees', position=0)):
            tokens, fd = _pos_tag_counts(req)
            data.at[idx, 'Words'] = len(tokens)
            data.at[idx, 'Modal'] = fd.get('MD', 0)
            # Adjectives and nouns span several tags (JJ/JJR/JJS, NN/NNS/NNP/NNPS): their
            # frequencies are summed, otherwise only one of the tags would be counted.
            data.at[idx, 'Adjective'] = sum(value for key, value in fd.items() if key.startswith("JJ"))
            data.at[idx, 'Noun'] = sum(value for key, value in fd.items() if key.startswith("NN"))
            data.at[idx, 'Adverb'] = fd.get('RB', 0)
            data.at[idx, 'Cardinal'] = fd.get('CD', 0)
            data.at[idx, 'CompSupAdj'] = fd.get('JJR', 0) + fd.get('JJS', 0)
            data.at[idx, 'CompSupAdv'] = fd.get('RBR', 0) + fd.get('RBS', 0)
            tree = parser.parse(req)
            data.at[idx, 'TreeHeight'] = tree.height()
            data.at[idx, 'SubTrees'] = len(tree)

    if use_final_sel:
        # Only POS tag counts are needed here, so no benepar parser is loaded.
        data['Modal'] = 0
        data['Adverb'] = 0
        data['Cardinal'] = 0
        for idx, req in enumerate(tqdm(data['RequirementText'], desc='Parse trees', position=0)):
            tokens, fd = _pos_tag_counts(req)
            data.at[idx, 'Modal'] = fd.get('MD', 0)
            data.at[idx, 'Adverb'] = fd.get('RB', 0)
            data.at[idx, 'Cardinal'] = fd.get('CD', 0)

    # enrichment with features for root verbs (one feature per verb)
    if "verb" in dep_feat_type:
        # third (and current) version of the list; the two earlier versions were subsets of it
        verbs_features = ['be', 'use', 'ensure', 'interface', 'handle', 'take', 'comply', 'run',
                          'allow', 'display', 'send', 'track', 'include', 'notify', 'shall', 'add', 'assign', 'generate', 'request',
                          'create', 'define', 'record', 'indicate', 'save'
                         ]
        for verb in verbs_features:
            data[verb] = 0

        for idx, req in enumerate(tqdm(data['RequirementText'], desc='Analyzing verbs', position=0)):
            for req_root in _root_lemmas(req):
                if req_root in verbs_features:
                    data.at[idx, req_root] += 1

    # boolean features for root verbs (each feature takes val 1 if the req contains at least one verb in the corresponding list)
    # USED IN THE PAPER AS LAST FEATURE SET
    if "vlist" in dep_feat_type:
        Fverbs = ['allow', 'display', 'send', 'track', 'include', 'notify', 'shall', 'add', 'assign', 'generate', 'request', 'create', 'define', 'record', 'indicate', 'save']
        Qverbs = ['be', 'use', 'ensure', 'interface', 'handle', 'take', 'comply', 'run']

        data['hasFverb'] = 0
        data['hasQverb'] = 0

        for idx, req in enumerate(tqdm(data['RequirementText'], desc='Analyzing verbs', position=0)):
            for req_root in _root_lemmas(req):
                if req_root in Fverbs:
                    data.at[idx, 'hasFverb'] = 1
                if req_root in Qverbs:
                    data.at[idx, 'hasQverb'] = 1

    #finally save the enriched datasetfile
    data.to_csv(new_file_name, encoding='utf-8')

## 3. Dataset enrichment

In [0]:
folder_source_datasets = './' #can be a URL
folder_dest_datasets = './ling/'

# Make sure the folder that will contain the enriched datasets exists.
# The success message is printed only when the folder is really created, and a failure
# is re-raised so that the run stops here (exit() is not meant for notebook cells).
if os.path.isdir(folder_dest_datasets):
    print ("The directory %s already exists" % folder_dest_datasets)
else:
    try:
        os.makedirs(folder_dest_datasets)
    except OSError:
        print ("Creation of the directory %s failed" % folder_dest_datasets)
        raise
    else:
        print ("Successfully created the directory %s " % folder_dest_datasets)


dataset_names = ['promise-reclass', 'ds2', 'ds3', 'dronology', 'wasp', 'esa-eucl-est', 'leeds', 'reqview', 'INDcombined', '8combined']
datasets = [pd.read_csv(folder_source_datasets+dataset_name+'.csv', engine='python') for dataset_name in dataset_names]

#the features to use to enrich the datasets
possible_dependencies_feature_sets = ['FinalSel_vlist', 'FinalSel_verb', 'two', 'all', 'allext', 'sd', 'sdext', 'sdsb','sdsbext', 'sdsb8sel02', 'sdsb8sel02ext', 'seqext', 'evext', 'FinalSel', 'FinalSel7']

#create all enriched datasets: one csv file per (dataset, feature set) pair
for dataset_name, dataset in zip(dataset_names, datasets):
    print('Dataset: '+dataset_name)
    for feature_set in possible_dependencies_feature_sets:
        print(feature_set)
        createEnrichedDataset(dataset, folder_dest_datasets+dataset_name+'-ling-'+feature_set+'.csv', feature_set)

spaCy analysis:   1%|          | 5/625 [00:00<00:14, 44.01it/s]

Successfully created the directory ./ling/ 
Dataset: promise-reclass
FinalSel_verb


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 45.99it/s]
Parse trees: 100%|██████████| 625/625 [00:00<00:00, 923.49it/s]
Analyzing verbs: 100%|██████████| 625/625 [00:12<00:00, 51.57it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:12, 49.46it/s]

two


spaCy analysis: 100%|██████████| 625/625 [00:12<00:00, 48.90it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 41.94it/s]

all


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.35it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 42.00it/s]

allext


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.24it/s]
Parse trees: 100%|██████████| 625/625 [03:11<00:00,  2.08it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 43.57it/s]

sd


spaCy analysis: 100%|██████████| 625/625 [00:12<00:00, 48.34it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:13, 45.57it/s]

sdext


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 37.12it/s]
Parse trees: 100%|██████████| 625/625 [03:10<00:00,  2.08it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 43.71it/s]

sdsb


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.58it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:13, 46.00it/s]

sdsbext


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.43it/s]
Parse trees: 100%|██████████| 625/625 [03:12<00:00,  2.03it/s]
spaCy analysis:   1%|          | 4/625 [00:00<00:16, 37.75it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 38.59it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 41.83it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.44it/s]
Parse trees: 100%|██████████| 625/625 [03:03<00:00,  2.18it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 43.78it/s]

seqext


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 46.95it/s]
Parse trees: 100%|██████████| 625/625 [03:04<00:00,  2.16it/s]
spaCy analysis:   1%|          | 4/625 [00:00<00:15, 39.83it/s]

evext


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 46.14it/s]
Parse trees: 100%|██████████| 625/625 [03:04<00:00,  2.19it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 44.27it/s]

FinalSel


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.95it/s]
Parse trees: 100%|██████████| 625/625 [00:00<00:00, 913.15it/s]
spaCy analysis:   1%|          | 5/625 [00:00<00:14, 41.91it/s]

FinalSel7


spaCy analysis: 100%|██████████| 625/625 [00:13<00:00, 47.86it/s]
Parse trees: 100%|██████████| 625/625 [00:00<00:00, 914.10it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:04, 41.47it/s]

Dataset: ds2
FinalSel_verb


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 48.19it/s]
Parse trees: 100%|██████████| 172/172 [00:00<00:00, 840.28it/s]
Analyzing verbs: 100%|██████████| 172/172 [00:03<00:00, 51.40it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 44.50it/s]

two


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 49.24it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 45.24it/s]

all


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 46.56it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 42.62it/s]

allext


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 46.37it/s]
Parse trees: 100%|██████████| 172/172 [00:55<00:00,  3.51it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 44.12it/s]

sd


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 48.48it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 47.49it/s]

sdext


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 48.08it/s]
Parse trees: 100%|██████████| 172/172 [00:55<00:00,  3.52it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 43.05it/s]

sdsb


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 48.33it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 44.20it/s]

sdsbext


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 48.12it/s]
Parse trees: 100%|██████████| 172/172 [00:59<00:00,  3.25it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 42.30it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 48.68it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 46.04it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 46.41it/s]
Parse trees: 100%|██████████| 172/172 [00:55<00:00,  3.59it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:04, 41.66it/s]

seqext


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 46.68it/s]
Parse trees: 100%|██████████| 172/172 [00:55<00:00,  3.49it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:04, 40.42it/s]

evext


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 45.71it/s]
Parse trees: 100%|██████████| 172/172 [00:54<00:00,  3.55it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:03, 44.34it/s]

FinalSel


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 46.15it/s]
Parse trees: 100%|██████████| 172/172 [00:00<00:00, 865.08it/s]
spaCy analysis:   3%|▎         | 5/172 [00:00<00:04, 41.60it/s]

FinalSel7


spaCy analysis: 100%|██████████| 172/172 [00:03<00:00, 45.59it/s]
Parse trees: 100%|██████████| 172/172 [00:00<00:00, 738.05it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 53.29it/s]

Dataset: ds3
FinalSel_verb


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 51.39it/s]
Parse trees: 100%|██████████| 138/138 [00:00<00:00, 1097.55it/s]
Analyzing verbs: 100%|██████████| 138/138 [00:02<00:00, 54.00it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 58.01it/s]

two


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 52.24it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 57.60it/s]

all


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 51.52it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 57.10it/s]

allext


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 50.97it/s]
Parse trees: 100%|██████████| 138/138 [00:42<00:00,  3.96it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 54.30it/s]

sd


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 53.34it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 59.69it/s]

sdext


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 53.04it/s]
Parse trees: 100%|██████████| 138/138 [00:42<00:00,  3.90it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 55.59it/s]

sdsb


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 52.73it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 56.35it/s]

sdsbext


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 52.11it/s]
Parse trees: 100%|██████████| 138/138 [00:41<00:00,  4.13it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 53.24it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 51.98it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 53.87it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 51.58it/s]
Parse trees: 100%|██████████| 138/138 [00:42<00:00,  4.03it/s]
spaCy analysis:   4%|▍         | 6/138 [00:00<00:02, 52.68it/s]

seqext


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 52.02it/s]
Parse trees: 100%|██████████| 138/138 [00:44<00:00,  3.69it/s]
spaCy analysis:   4%|▎         | 5/138 [00:00<00:02, 47.80it/s]

evext


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 48.05it/s]
Parse trees: 100%|██████████| 138/138 [00:41<00:00,  4.11it/s]
spaCy analysis:   4%|▎         | 5/138 [00:00<00:02, 46.69it/s]

FinalSel


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 47.19it/s]
Parse trees: 100%|██████████| 138/138 [00:00<00:00, 982.13it/s]
spaCy analysis:   4%|▎         | 5/138 [00:00<00:02, 46.44it/s]

FinalSel7


spaCy analysis: 100%|██████████| 138/138 [00:02<00:00, 50.13it/s]
Parse trees: 100%|██████████| 138/138 [00:00<00:00, 1084.63it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 59.13it/s]

Dataset: dronology
FinalSel_verb


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.76it/s]
Parse trees: 100%|██████████| 97/97 [00:00<00:00, 1024.34it/s]
Analyzing verbs: 100%|██████████| 97/97 [00:01<00:00, 54.13it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 57.20it/s]

two


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 53.38it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 59.56it/s]

all


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.48it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 59.07it/s]

allext


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 51.14it/s]
Parse trees: 100%|██████████| 97/97 [00:32<00:00,  3.78it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 52.40it/s]

sd


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 50.63it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 59.17it/s]

sdext


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.82it/s]
Parse trees: 100%|██████████| 97/97 [00:33<00:00,  3.67it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 55.75it/s]

sdsb


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.27it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 59.29it/s]

sdsbext


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.93it/s]
Parse trees: 100%|██████████| 97/97 [00:33<00:00,  3.39it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 53.73it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 48.94it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 55.29it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 48.50it/s]
Parse trees: 100%|██████████| 97/97 [00:32<00:00,  3.80it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 53.87it/s]

seqext


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.86it/s]
Parse trees: 100%|██████████| 97/97 [00:32<00:00,  3.87it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 54.91it/s]

evext


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 51.17it/s]
Parse trees: 100%|██████████| 97/97 [00:32<00:00,  3.81it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 58.90it/s]

FinalSel


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 53.58it/s]
Parse trees: 100%|██████████| 97/97 [00:00<00:00, 1020.61it/s]
spaCy analysis:   6%|▌         | 6/97 [00:00<00:01, 58.63it/s]

FinalSel7


spaCy analysis: 100%|██████████| 97/97 [00:01<00:00, 52.57it/s]
Parse trees: 100%|██████████| 97/97 [00:00<00:00, 1017.87it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 46.31it/s]

Dataset: wasp
FinalSel_verb


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.80it/s]
Parse trees: 100%|██████████| 62/62 [00:00<00:00, 776.76it/s]
Analyzing verbs: 100%|██████████| 62/62 [00:01<00:00, 50.52it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 47.08it/s]

two


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 48.67it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 47.70it/s]

all


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.51it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 46.29it/s]

allext


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 46.26it/s]
Parse trees: 100%|██████████| 62/62 [00:24<00:00,  3.14it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 44.06it/s]

sd


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.76it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 46.45it/s]

sdext


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.35it/s]
Parse trees: 100%|██████████| 62/62 [00:24<00:00,  2.97it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 42.92it/s]

sdsb


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 46.48it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 46.58it/s]

sdsbext


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.51it/s]
Parse trees: 100%|██████████| 62/62 [00:24<00:00,  3.14it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 44.78it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.52it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 47.57it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.07it/s]
Parse trees: 100%|██████████| 62/62 [00:24<00:00,  3.14it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 44.84it/s]

seqext


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.42it/s]
Parse trees: 100%|██████████| 62/62 [00:24<00:00,  3.10it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 42.95it/s]

evext


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 45.98it/s]
Parse trees: 100%|██████████| 62/62 [00:25<00:00,  2.95it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 44.43it/s]

FinalSel


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 47.26it/s]
Parse trees: 100%|██████████| 62/62 [00:00<00:00, 773.60it/s]
spaCy analysis:   8%|▊         | 5/62 [00:00<00:01, 45.93it/s]

FinalSel7


spaCy analysis: 100%|██████████| 62/62 [00:01<00:00, 48.54it/s]
Parse trees: 100%|██████████| 62/62 [00:00<00:00, 750.29it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:05, 45.27it/s]

Dataset: esa-eucl-est
FinalSel_verb


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 44.65it/s]
Parse trees: 100%|██████████| 236/236 [00:00<00:00, 664.03it/s]
Analyzing verbs: 100%|██████████| 236/236 [00:05<00:00, 45.33it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 47.02it/s]

two


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 42.85it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 46.98it/s]

all


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 43.13it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 49.79it/s]

allext


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 44.91it/s]
Parse trees: 100%|██████████| 236/236 [01:23<00:00,  3.24it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:05, 45.86it/s]

sd


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 44.83it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 49.22it/s]

sdext


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 44.66it/s]
Parse trees: 100%|██████████| 236/236 [01:24<00:00,  3.30it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 46.77it/s]

sdsb


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 45.17it/s]
spaCy analysis:   3%|▎         | 6/236 [00:00<00:04, 51.40it/s]

sdsbext


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 45.02it/s]
Parse trees: 100%|██████████| 236/236 [01:24<00:00,  3.22it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:05, 44.57it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 43.26it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 47.73it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 43.88it/s]
Parse trees: 100%|██████████| 236/236 [01:25<00:00,  2.98it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:05, 41.57it/s]

seqext


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 43.15it/s]
Parse trees: 100%|██████████| 236/236 [01:25<00:00,  3.18it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:05, 45.76it/s]

evext


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 43.47it/s]
Parse trees: 100%|██████████| 236/236 [01:25<00:00,  3.24it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:04, 46.85it/s]

FinalSel


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 44.49it/s]
Parse trees: 100%|██████████| 236/236 [00:00<00:00, 658.17it/s]
spaCy analysis:   2%|▏         | 5/236 [00:00<00:05, 45.55it/s]

FinalSel7


spaCy analysis: 100%|██████████| 236/236 [00:05<00:00, 44.60it/s]
Parse trees: 100%|██████████| 236/236 [00:00<00:00, 658.08it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 53.32it/s]

Dataset: leeds
FinalSel_verb


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 51.17it/s]
Parse trees: 100%|██████████| 85/85 [00:00<00:00, 863.91it/s]
Analyzing verbs: 100%|██████████| 85/85 [00:01<00:00, 51.68it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 53.69it/s]

two


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 53.02it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 56.33it/s]

all


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.85it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 55.25it/s]

allext


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 50.22it/s]
Parse trees: 100%|██████████| 85/85 [00:30<00:00,  4.01it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 51.93it/s]

sd


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 50.60it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 54.17it/s]

sdext


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.24it/s]
Parse trees: 100%|██████████| 85/85 [00:29<00:00,  4.10it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 53.69it/s]

sdsb


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 52.99it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 54.38it/s]

sdsbext


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 51.39it/s]
Parse trees: 100%|██████████| 85/85 [00:29<00:00,  4.09it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 50.82it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.43it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 55.10it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.66it/s]
Parse trees: 100%|██████████| 85/85 [00:30<00:00,  3.90it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 51.21it/s]

seqext


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.27it/s]
Parse trees: 100%|██████████| 85/85 [00:30<00:00,  3.96it/s]
spaCy analysis:   6%|▌         | 5/85 [00:00<00:01, 48.08it/s]

evext


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.66it/s]
Parse trees: 100%|██████████| 85/85 [00:30<00:00,  3.93it/s]
spaCy analysis:   6%|▌         | 5/85 [00:00<00:01, 49.14it/s]

FinalSel


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 49.17it/s]
Parse trees: 100%|██████████| 85/85 [00:00<00:00, 872.68it/s]
spaCy analysis:   7%|▋         | 6/85 [00:00<00:01, 51.51it/s]

FinalSel7


spaCy analysis: 100%|██████████| 85/85 [00:01<00:00, 50.20it/s]
Parse trees: 100%|██████████| 85/85 [00:00<00:00, 855.95it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 54.57it/s]

Dataset: reqview
FinalSel_verb


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.24it/s]
Parse trees: 100%|██████████| 87/87 [00:00<00:00, 888.01it/s]
Analyzing verbs: 100%|██████████| 87/87 [00:01<00:00, 54.83it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 56.55it/s]

two


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.14it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 55.65it/s]

all


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.23it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 58.66it/s]

allext


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.35it/s]
Parse trees: 100%|██████████| 87/87 [00:28<00:00,  3.95it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 57.56it/s]

sd


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 53.49it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 54.48it/s]

sdext


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 53.22it/s]
Parse trees: 100%|██████████| 87/87 [00:28<00:00,  3.82it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 54.54it/s]

sdsb


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 51.41it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 51.43it/s]

sdsbext


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.72it/s]
Parse trees: 100%|██████████| 87/87 [00:28<00:00,  3.93it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 57.36it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.88it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 56.40it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.71it/s]
Parse trees: 100%|██████████| 87/87 [00:29<00:00,  3.90it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 55.41it/s]

seqext


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 51.43it/s]
Parse trees: 100%|██████████| 87/87 [00:30<00:00,  3.62it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 52.92it/s]

evext


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 51.18it/s]
Parse trees: 100%|██████████| 87/87 [00:28<00:00,  3.85it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 54.33it/s]

FinalSel


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.80it/s]
Parse trees: 100%|██████████| 87/87 [00:00<00:00, 922.87it/s]
spaCy analysis:   7%|▋         | 6/87 [00:00<00:01, 55.18it/s]

FinalSel7


spaCy analysis: 100%|██████████| 87/87 [00:01<00:00, 52.92it/s]
Parse trees: 100%|██████████| 87/87 [00:00<00:00, 918.81it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:19, 45.04it/s]

Dataset: INDcombined
FinalSel_verb


spaCy analysis: 100%|██████████| 877/877 [00:17<00:00, 49.07it/s]
Parse trees: 100%|██████████| 877/877 [00:01<00:00, 846.31it/s]
Analyzing verbs: 100%|██████████| 877/877 [00:17<00:00, 55.96it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:18, 47.58it/s]

two


spaCy analysis: 100%|██████████| 877/877 [00:17<00:00, 49.55it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:20, 43.21it/s]

all


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 46.36it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:19, 43.97it/s]

allext


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 52.64it/s]
Parse trees: 100%|██████████| 877/877 [04:16<00:00,  3.85it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:21, 41.18it/s]

sd


spaCy analysis: 100%|██████████| 877/877 [00:17<00:00, 49.44it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:18, 46.67it/s]

sdext


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 53.33it/s]
Parse trees: 100%|██████████| 877/877 [04:17<00:00,  3.78it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:21, 40.89it/s]

sdsb


spaCy analysis: 100%|██████████| 877/877 [00:17<00:00, 48.75it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:19, 45.66it/s]

sdsbext


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 48.45it/s]
Parse trees: 100%|██████████| 877/877 [04:08<00:00,  3.99it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:20, 42.20it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 53.51it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:20, 42.65it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 877/877 [00:17<00:00, 53.01it/s]
Parse trees: 100%|██████████| 877/877 [04:10<00:00,  3.98it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:20, 41.53it/s]

seqext


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 51.29it/s]
Parse trees: 100%|██████████| 877/877 [04:14<00:00,  3.97it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:21, 40.17it/s]

evext


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 47.17it/s]
Parse trees: 100%|██████████| 877/877 [04:12<00:00,  4.00it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:21, 41.21it/s]

FinalSel


spaCy analysis: 100%|██████████| 877/877 [00:18<00:00, 52.99it/s]
Parse trees: 100%|██████████| 877/877 [00:01<00:00, 842.57it/s]
spaCy analysis:   1%|          | 5/877 [00:00<00:19, 45.15it/s]

FinalSel7


spaCy analysis: 100%|██████████| 877/877 [00:17<00:00, 48.86it/s]
Parse trees: 100%|██████████| 877/877 [00:01<00:00, 851.64it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:33, 45.35it/s]

Dataset: 8combined
FinalSel_verb


spaCy analysis: 100%|██████████| 1502/1502 [00:30<00:00, 48.62it/s]
Parse trees: 100%|██████████| 1502/1502 [00:01<00:00, 866.52it/s]
Analyzing verbs: 100%|██████████| 1502/1502 [00:29<00:00, 50.54it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:32, 45.90it/s]

two


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 48.15it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:36, 41.23it/s]

all


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 50.16it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:36, 40.78it/s]

allext


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 52.26it/s]
Parse trees: 100%|██████████| 1502/1502 [07:07<00:00,  3.80it/s]
spaCy analysis:   0%|          | 4/1502 [00:00<00:37, 39.86it/s]

sd


spaCy analysis: 100%|██████████| 1502/1502 [00:32<00:00, 46.00it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:32, 46.76it/s]

sdext


spaCy analysis: 100%|██████████| 1502/1502 [00:55<00:00, 26.83it/s]
Parse trees: 100%|██████████| 1502/1502 [07:49<00:00,  3.89it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:33, 44.14it/s]

sdsb


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 47.44it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:34, 43.82it/s]

sdsbext


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 47.35it/s]
Parse trees: 100%|██████████| 1502/1502 [09:45<00:00,  3.54it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:35, 42.41it/s]

sdsb8sel02


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 51.81it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:33, 45.04it/s]

sdsb8sel02ext


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 52.83it/s]
Parse trees: 100%|██████████| 1502/1502 [07:17<00:00,  3.87it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:37, 40.08it/s]

seqext


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 47.52it/s]
Parse trees: 100%|██████████| 1502/1502 [07:17<00:00,  3.64it/s]
spaCy analysis:   0%|          | 4/1502 [00:00<00:38, 39.10it/s]

evext


spaCy analysis: 100%|██████████| 1502/1502 [00:32<00:00, 50.80it/s]
Parse trees: 100%|██████████| 1502/1502 [07:14<00:00,  3.94it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:34, 43.48it/s]

FinalSel


spaCy analysis: 100%|██████████| 1502/1502 [00:30<00:00, 54.47it/s]
Parse trees: 100%|██████████| 1502/1502 [00:01<00:00, 877.26it/s]
spaCy analysis:   0%|          | 5/1502 [00:00<00:33, 44.94it/s]

FinalSel7


spaCy analysis: 100%|██████████| 1502/1502 [00:31<00:00, 48.02it/s]
Parse trees: 100%|██████████| 1502/1502 [00:01<00:00, 864.62it/s]


In [0]:
#to save all in once, since jupyter does not allow to download multiple files together
!zip ALL_DATASETS.zip -r ling/*

  adding: ling/8combined-ling-all.csv (deflated 83%)
  adding: ling/8combined-ling-allext.csv (deflated 81%)
  adding: ling/8combined-ling-evext.csv (deflated 82%)
  adding: ling/8combined-ling-FinalSel7.csv (deflated 76%)
  adding: ling/8combined-ling-FinalSel.csv (deflated 76%)
  adding: ling/8combined-ling-FinalSel_verb.csv (deflated 80%)
  adding: ling/8combined-ling-FinalSel_vlist.csv (deflated 76%)
  adding: ling/8combined-ling-sd.csv (deflated 76%)
  adding: ling/8combined-ling-sdext.csv (deflated 75%)
  adding: ling/8combined-ling-sdsb8sel02.csv (deflated 76%)
  adding: ling/8combined-ling-sdsb8sel02ext.csv (deflated 75%)
  adding: ling/8combined-ling-sdsb.csv (deflated 78%)
  adding: ling/8combined-ling-sdsbext.csv (deflated 76%)
  adding: ling/8combined-ling-seqext.csv (deflated 78%)
  adding: ling/8combined-ling-two.csv (deflated 74%)
  adding: ling/dronology-ling-all.csv (deflated 81%)
  adding: ling/dronology-ling-allext.csv (deflated 79%)
  adding: ling/dronology-ling-eve