# Linguistic Features Stats Calculator
This notebook computes statistics on linguistic features (dependency labels, dependency branches, sequences of POS tag + dependency label, and root verbs) found in the requirements of a set of input datasets.

Section 1. Imports and util functions

Section 2. Stats calculation

## 1. Imports and Util functions

In [0]:
import spacy
from collections import defaultdict
from collections import OrderedDict
import pandas as pd
from tqdm import tqdm #progress bar
import numpy as np
import itertools
from nltk import Tree
from copy import copy, deepcopy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
# Rule-based lemmatizer; calc_stats uses it to lemmatize the ROOT tokens when
# root-verb stats are requested.
# NOTE(review): the `spacy.lemmatizer` import and the LEMMA_* tables look like
# spaCy 2.x-only APIs - confirm the pinned spaCy version before upgrading.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

# load the english language for spacy
# NOTE(review): the 'en' shortcut name presumably requires spaCy 2.x - confirm.
nlp = spacy.load('en')

# constants used to determine the type of linguistic feature to analyze
# calc_stats compares its stats_type argument against DEP_TYPE, BR_TYPE and
# SEQ_TYPE; any other value (e.g. VERB_TYPE) falls through to the root-verb
# analysis.
DEP_TYPE = 'Dependencies'
BR_TYPE = 'Branches'
SEQ_TYPE = 'Sequences'
VERB_TYPE = 'Verbs'


def count_req_types_frequencies(data):
    """
    Counts how many requirements of each type the dataset contains.

    A requirement is functional when its 'IsFunctional' value equals 1 and
    quality when its 'IsQuality' value equals 1. The returned counters are:
    num_tot: the nr. of req in the dataset (the size of the dataset)
    num_F: the nr. of req annotated as functional
    num_Q: the nr. of req annotated as quality
    num_FandQ: the nr. of req annotated as both functional and quality
    num_FnotQ: the nr. of req annotated as functional but not quality
    num_QnotF: the nr. of req annotated as quality but not functional
    num_I: the nr. of req annotated neither as quality nor as functional

    The rows are read positionally, so the DataFrame may carry any index
    (e.g. after filtering or concatenation), not only the default 0..n-1.

    @param data: the dataset (a DataFrame with columns 'IsFunctional' and 'IsQuality')
    @return: the tuple (num_tot, num_F, num_Q, num_FandQ, num_FnotQ, num_QnotF, num_I),
    all floats so that later divisions yield ratios
    """
    num_tot = 0.
    num_F = 0.
    num_Q = 0.
    num_FandQ = 0.
    num_FnotQ = 0.
    num_QnotF = 0.
    num_I = 0.

    # Walk the two label columns in lockstep instead of looking rows up by
    # index label, which only worked for a default RangeIndex.
    for functional_flag, quality_flag in zip(data['IsFunctional'], data['IsQuality']):
        is_functional = functional_flag == 1
        is_quality = quality_flag == 1

        num_tot += 1
        if is_functional:
            num_F += 1
        if is_quality:
            num_Q += 1

        if is_functional and is_quality:
            num_FandQ += 1
        elif is_functional:
            num_FnotQ += 1
        elif is_quality:
            num_QnotF += 1
        else:
            num_I += 1

    return num_tot, num_F, num_Q, num_FandQ, num_FnotQ, num_QnotF, num_I
     
        
        
def pprint_dep_stats(header_length, d):
    """
    Prints a table with the stats of the most discriminating features.

    Features are ranked by descending abs(cov_FnotQ - cov_QnotF), where
    cov_FnotQ (position 9 of a stats row) and cov_QnotF (position 11) are the
    coverage of the feature among the req F but not Q and among the req Q but
    not F. Rows whose cov_FnotQ and cov_QnotF are both not above 0.1 are
    skipped; at most min(header_length, len(d)) rows are printed.

    @param header_length: the maximum number of features to print
    @param d: a dictionary feature -> list of 14 values (count, coverage) x 7 req types
    """
    row_layout = "{:<12} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12}"
    print(row_layout.format('Feat', 'Tot (cov)', 'Tot F (cov)', 'Tot Q (cov)',
                            'FandQ (cov)', 'FnotQ (cov)', 'QnotF (cov)', 'I (cov)'))

    # Ascending sort walked backwards (rather than reverse=True) so that
    # features with equal scores keep the same relative order as before.
    # Alternative rankings that were tried: cov_FnotQ/cov_QnotF and cov_F/cov_QnotF.
    ranked = sorted(d.items(), key=lambda item: abs(item[1][11] - item[1][9]))
    max_rows = min(header_length, len(ranked))

    printed = 0
    for feature, stats in reversed(ranked):
        if printed >= max_rows:
            break
        if not (stats[9] > 0.1 or stats[11] > 0.1):
            continue  # negligible coverage in both exclusive classes

        # One "count (coverage)" cell per requirement type.
        cells = [
            str(stats[pos]) + ' (' + str(round(stats[pos + 1], 1)) + ')'
            for pos in range(0, 14, 2)
        ]
        print(row_layout.format(feature, *cells))
        printed += 1

        
        
def calc_stats(data, stats_type, n, seq_length=1, max_height=15):
    """
    Calculates stats about linguistic features in the requirements in dataset data

    Each requirement is annotated with spaCy, turned into a list of features
    (depending on stats_type), and every n-combination of those features is
    joined with '+' into a feature key that is then counted.

    The rows are read positionally, so the DataFrame may carry any index
    (e.g. after filtering or concatenation), not only the default 0..n-1.

    @param data: the dataset (a DataFrame with columns 'RequirementText', 'IsFunctional', 'IsQuality')
    @param stats_type: a value DEP_TYPE, BR_TYPE or SEQ_TYPE that determines the type of stats to calculate,
    (respectively the dependencies in the requirement, the branches of dependencies and the sequences of POSdep)
    if none of the three before mentioned types is expressed, stats for ROOT VERBS are calculated
    @param n: the combinations of features to consider
    (e.g., n=2 with stats_type=DEP_TYPE means that the stats will concern combinations of 2 dependencies in the req)
    @param seq_length: (optional) the length of the sequences of POSdep considered in case of stats_type=SEQ_TYPE
    @param max_height: (optional) the maximum height considered for the branches in case of stats_type=BR_TYPE
    @return: a tuple of 7 dictionaries (all, F, Q, FandQ, FnotQ, QnotF, I), each containing stats per feature
    for one type of req. Each dictionary maps a feature key f to a list of two values:
    the number of occurrences of f in the req of that type, and the number of req of that type
    containing f at least once (createDict later turns the latter into a coverage ratio)
    """

    # the following dictionaries will be populated with the stats
    dep = defaultdict(lambda: [0, 0])
    dep_F = defaultdict(lambda: [0, 0])
    dep_Q = defaultdict(lambda: [0, 0])
    dep_FandQ = defaultdict(lambda: [0, 0])
    dep_FnotQ = defaultdict(lambda: [0, 0])
    dep_QnotF = defaultdict(lambda: [0, 0])
    dep_I = defaultdict(lambda: [0, 0])

    def tally(stats, key_counts):
        # Adds one requirement's feature occurrences to a stats dictionary:
        # slot 0 accumulates occurrences, slot 1 counts the requirement once.
        for key, occurrences in key_counts.items():
            entry = stats[key]
            entry[0] += occurrences
            entry[1] += 1

    progress = tqdm(data['RequirementText'], desc=str(n)+' '+str(stats_type)+' analysis', position=0)
    # Walk text and labels in lockstep instead of looking labels up by index
    # label, which only worked for a default RangeIndex.
    for req, functional_flag, quality_flag in zip(progress, data['IsFunctional'], data['IsQuality']):
        doc = nlp(req.replace("'", ""))  # use spacy to annotate the requirement

        # 1. Create the list of feature combinations in the requirement, depending on the type stats_type

        # DEPENDENCY TYPES
        if stats_type == DEP_TYPE:
            req_dep = [t.dep_ for t in doc]
            dep_comb = list(itertools.combinations(req_dep, n))
            dep_comb.sort()

        # TYPES OF BRANCHES
        elif stats_type == BR_TYPE:
            dep_br = []
            for sent in doc.sents:
                branches = get_all_paths(sent.root, 0, max_height)
                if branches != ['ROOT']:  # skip sentences made of a lone root
                    dep_br.extend(branches)
            dep_br.sort()
            dep_comb = list(itertools.combinations(dep_br, n))

        # SEQUENCES OF POSdep
        elif stats_type == SEQ_TYPE:
            req_dep = [t.tag_ + t.dep_ for t in doc]
            # overlapping windows for single sequences, adjacent windows when combining
            step = seq_length if n > 1 else 1
            # NOTE(review): the range stops at len - seq_length, so the window ending
            # on the last token is never produced; kept as is because it may be a
            # deliberate way to drop the final punctuation - confirm.
            # NOTE(review): each sequence is wrapped in a one-element list, so the
            # resulting feature keys look like "['NN_nsubj']"; kept for compatibility
            # with previously produced results.
            req_seq = [
                ["_".join(req_dep[i:i + seq_length])]
                for i in range(0, len(req_dep) - seq_length, step)
            ]
            req_seq.sort()
            dep_comb = list(itertools.combinations(req_seq, n))

        # ROOT VERBS
        else:
            # strip the boilerplate verb phrases so that the actual verb becomes the root
            newr = req.replace('be able to', '').replace('be capable of', '').replace('provide the ability to', '')
            doc = nlp(newr)
            roots = [lemmatizer(t.orth_, t.pos_)[0] for t in doc if t.dep_ == 'ROOT']
            dep_comb = list(itertools.combinations(roots, n))
            dep_comb.sort()

        # 2. Turn every combination into its feature key and count the occurrences in this requirement
        key_counts = defaultdict(int)
        for c in dep_comb:
            key_counts['+'.join(str(part) for part in c)] += 1

        # 3. Update the overall stats and the stats of the types the requirement belongs to
        tally(dep, key_counts)
        if functional_flag == 1:  # F
            tally(dep_F, key_counts)
            if quality_flag == 0:  # OnlyF
                tally(dep_FnotQ, key_counts)
            else:  # Q and FandQ
                tally(dep_FandQ, key_counts)
                tally(dep_Q, key_counts)
        elif quality_flag == 1:  # OnlyQ
            tally(dep_QnotF, key_counts)
            tally(dep_Q, key_counts)
        else:  # neither F nor Q
            tally(dep_I, key_counts)

    return dep, dep_F, dep_Q, dep_FandQ, dep_FnotQ, dep_QnotF, dep_I
          
    
def get_all_paths(node, h, max_h):
    """
    Lists the dependency paths (branches) that start at node, following the
    children downwards until a childless token or the height limit is reached.
    The labels along a path are joined with '_'.
    @param node: the token the paths start from (typically the root of a sentence)
    @param h: the height of node (typically 0 for the root)
    @param max_h: the height at which the descent stops
    @return: a list of strings representing paths
    """
    has_no_children = node.n_lefts + node.n_rights == 0
    if has_no_children or h == max_h:
        return [node.dep_]

    paths = []
    for child in node.children:
        for sub_path in get_all_paths(child, h + 1, max_h):
            paths.append(node.dep_ + '_' + str(sub_path))
    return paths
  
  

def createDict(dep_tot, dep_F, dep_Q, dep_FandQ, dep_FnotQ, dep_QnotF, dep_I, 
               num_tot, num_F, num_Q, num_FandQ, num_FnotQ, num_QnotF, num_I):
    """
    Creates a dictionary containing all the stats used in the analysis

    For every feature in dep_tot the result holds a list of 14 values: for each
    of the 7 req types (all, F, Q, FandQ, FnotQ, QnotF, I, in this order) the
    number of occurrences of the feature followed by its coverage, i.e. the
    number of req of that type containing the feature divided by the number of
    req of that type (0 when there is no req of that type).

    Features missing from a per-type dictionary count as [0, 0]; the lookup
    neither requires the inputs to be defaultdicts nor adds keys to them.

    @params: 7 dictionaries, one for each type of requirements, obtained from function calc_stats
    and 7 numbers, obtained from function count_req_types_frequencies
    @return: a dictionary feature -> list of 14 values
    """
    def coverage(req_count, total):
        # share of the requirements of a type that contain the feature
        return req_count / total if total > 0 else 0

    per_type = [
        (dep_F, num_F),
        (dep_Q, num_Q),
        (dep_FandQ, num_FandQ),
        (dep_FnotQ, num_FnotQ),
        (dep_QnotF, num_QnotF),
        (dep_I, num_I),
    ]
    missing = (0, 0)

    d = {}
    for k, v in dep_tot.items():
        row = [v[0], coverage(v[1], num_tot)]
        for stats, total in per_type:
            entry = stats.get(k, missing)
            row.append(entry[0])
            row.append(coverage(entry[1], total))
        d[k] = row
    return d


def addDicts(x, y):
    """
    Adds the values of two isomorphic dictionaries (key -> list of numbers).

    Lists found under the same key are summed position by position; keys that
    appear in only one dictionary are carried over unchanged. The result owns
    fresh lists, so neither x nor y is modified and later in-place updates of
    the result cannot leak back into them.

    @param x: the first dictionary
    @param y: the second dictionary
    @return: a new dictionary with the summed values
    """
    result = {k: list(v) for k, v in x.items()}
    for k, v in y.items():
        if k in result:
            summed = result[k]
            for i, value in enumerate(v):
                summed[i] += value
        else:
            result[k] = list(v)
    return result

## 2. Stats Calculation

In [0]:
# Location of the CSV files (can also be an url prefix)
data_folder = './'
dataset_names = ['promise-reclass', 'ds2', 'ds3', 'dronology', 'wasp', 'esa-eucl-est', 'leeds', 'reqview']

# Read the datasets, keeping only the requirement text and the two annotation columns
datasets = []
for dataset_name in dataset_names:
    frame = pd.read_csv(data_folder + dataset_name + '.csv', engine='python')
    datasets.append(frame[['RequirementText', 'IsFunctional', 'IsQuality']])

# Optional: analyze a single dataset obtained by combining all the others
# datasets_combined = pd.concat(datasets)
# datasets_combined = datasets_combined.reset_index(drop=True)
# # datasets_combined.to_csv(r'4ds_combined.csv')
# datasets = [datasets_combined]


# DEFINING THE EXPERIMENTS
stats_types = [DEP_TYPE, BR_TYPE, SEQ_TYPE, VERB_TYPE]
combinations = [1, 2, 3]
seq_lengths = [1, 2, 3, 4]
VERBOSE = False  # if True prints also the stats for the single datasets instead of only the final macro average
top_n_to_print = 10

for stat_type in stats_types:
    for combination in combinations:
        for seq_l in seq_lengths:
            print(f"\n== Experiment: {stat_type}, {combination} combinations, sequences long (if applies) {seq_l} ==")

            d = {}  # matrix for the macro average
            for dataset_name, data in zip(dataset_names, datasets):
                print('\nDataset ' + dataset_name)
                type_counts = count_req_types_frequencies(data)
                feature_stats = calc_stats(data, stat_type, combination, seq_l)
                data_d = createDict(*feature_stats, *type_counts)
                if VERBOSE:
                    pprint_dep_stats(top_n_to_print, data_d)
                # the first non-empty dataset seeds the accumulator, the others are added to it
                d = addDicts(d, data_d) if d else dict(data_d)

            # macro average: the coverages sit at the odd positions and are averaged
            # over the datasets, while the counts at the even positions stay summed
            n_datasets = len(datasets)
            d = {
                k: [value / n_datasets if pos % 2 else value for pos, value in enumerate(v)]
                for k, v in d.items()
            }

            print("")
            print("All together with cov averaged over the datasets")
            pprint_dep_stats(top_n_to_print, d)

            # only the sequences depend on seq_l: for the other types one pass is enough
            if stat_type != 'Sequences':
                break

1 Dependencies analysis:   0%|          | 7/1502 [00:00<00:24, 60.60it/s]


== Experiment: Dependencies, 1 combinations, sequences long (if applies) 1 ==

Dataset 8combined


1 Dependencies analysis: 100%|██████████| 1502/1502 [00:29<00:00, 53.32it/s]


All together with cov averaged over the datasets
Feat         Tot (cov)    Tot F (cov)  Tot Q (cov)  FandQ (cov)  FnotQ (cov)  QnotF (cov)  I (cov)     
acl          466 (0.3)    365 (0.3)    169 (0.2)    72 (0.2)     293 (0.3)    97 (0.2)     4 (0.2)     
ccomp        195 (0.1)    140 (0.1)    97 (0.1)     42 (0.1)     98 (0.1)     55 (0.1)     0 (0.0)     
dobj         1862 (0.8)   1373 (0.9)   848 (0.7)    378 (0.9)    995 (0.9)    470 (0.6)    19 (0.8)    
advcl        325 (0.2)    235 (0.2)    156 (0.2)    68 (0.2)     167 (0.2)    88 (0.2)     2 (0.1)     
mark         199 (0.1)    140 (0.1)    90 (0.1)     32 (0.1)     108 (0.2)    58 (0.1)     1 (0.1)     
xcomp        434 (0.3)    282 (0.3)    235 (0.3)    84 (0.3)     198 (0.3)    151 (0.2)    1 (0.1)     
nsubj        1842 (0.9)   1187 (0.9)   973 (0.9)    338 (0.9)    849 (0.9)    635 (0.9)    20 (0.9)    
det          4236 (1.0)   2779 (1.0)   2156 (1.0)   737 (1.0)    2042 (1.0)   1419 (1.0)   38 (1.0)    
ROOT         1


