# RE19-classification: reconstruction of Kurtanovic-Maalej

This notebook takes as input the technique presented by Kurtanovic and Maalej at RE'17 (data track), and reconstructs it on the Promise dataset.

## 0. Set up (optional)

Run the following install commands if you are running Jupyter in a cloud environment such as Colaboratory, where libraries cannot be installed permanently and must be reinstalled in every new session.

In [0]:
!pip install cython numpy
!pip install benepar[cpu]

## 1. Import libraries

In [0]:
# Basic numpy, sklearn, pandas libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from IPython.display import display

# Basic NLTK tooling
# The downloads below fetch NLTK resources at run time: the 'punkt' tokenizer
# models, the POS-tagger model used by nltk.pos_tag, and the WordNet corpus
# used by the WordNet lemmatizer in the n-gram cell.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer

# Shared word tokenizer: keeps runs of word characters (\w+), so punctuation
# never becomes a token. Used by every feature-extraction cell below.
tokenizer = RegexpTokenizer(r'\w+')

# The benepar parser -- this is supposed to be a better parser than Stanford's parser used in the RE'17 paper
import benepar
# Fetches the 'benepar_en2' model that benepar.Parser loads in the enrichment cell
benepar.download('benepar_en2')

# Tqdm, for progress bars -- useful to show that the parsing is working
from tqdm import tqdm

## 2. Load data

Loads the re-classified PROMISE data set (`promise-reclass.csv`) from `DATA_FOLDER`.

In [0]:
# Read the re-classified PROMISE requirements from the configured folder
DATA_FOLDER = './'
dataset_path = '{}promise-reclass.csv'.format(DATA_FOLDER)
data = pd.read_csv(dataset_path, engine='python')

# Quick sanity check of the first rows
print(data.head())

## 3. Dataset enrichment

Additional features are added automatically, as per the RE'17 paper.

In [0]:
# Text length: number of characters in each requirement
data['Length'] = data['RequirementText'].map(len)

# POS tags and tree information
parser = benepar.Parser("benepar_en2")

# Feature name -> predicate selecting the POS tags that count towards it.
# The insertion order fixes the order in which the columns are added to `data`.
POS_FEATURES = {
    'Modal': lambda tag: tag == "MD",
    'Adjective': lambda tag: tag.startswith("JJ"),
    'Noun': lambda tag: tag.startswith("NN"),
    # NOTE(review): only the plain "RB" tag is counted, exactly as before;
    # RBR/RBS are ignored -- confirm this is intended.
    'Adverb': lambda tag: tag == "RB",
    'Verb': lambda tag: tag.startswith("VB"),
}

rows = []
for req in tqdm(data['RequirementText'], desc='Parse trees', position=0):
    tokens = tokenizer.tokenize(req)
    tag_counts = nltk.FreqDist(tag for (word, tag) in nltk.pos_tag(tokens))

    row = {}
    for feature, matches in POS_FEATURES.items():
        # Sum over every tag variant of the family (e.g. NN, NNS, NNP).
        # The previous code assigned instead of accumulating, so only the
        # last variant seen survived.
        hits = sum(count for tag, count in tag_counts.items() if matches(tag))
        # Ratio over the number of words; a requirement without any word
        # token gets 0.0 instead of a division by zero.
        row[feature] = hits / len(tokens) if tokens else 0.0

    tree = parser.parse(req)
    row['TreeHeight'] = tree.height()
    # NOTE(review): len(tree) is the number of direct children of the root
    # node. If the intended feature is the total number of subtrees, this
    # should be sum(1 for _ in tree.subtrees()) -- confirm against the paper.
    row['SubTrees'] = len(tree)
    row['Words'] = len(tokens)
    rows.append(row)

# Attach the new columns positionally, so the result does not depend on
# `data` having a default 0..n-1 index.
enriched_columns = list(POS_FEATURES) + ['TreeHeight', 'SubTrees', 'Words']
enriched = pd.DataFrame(rows, columns=enriched_columns)
for column in enriched_columns:
    data[column] = enriched[column].values

print(data[:30])

In [0]:
from nltk.util import ngrams
from collections import Counter
# The private module `sklearn.feature_extraction.stop_words` is no longer
# importable in current scikit-learn; the same list is exposed here.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer

# Corpus-wide counts of word [1, 2, 3]-grams and POS-tag [1, 2, 3]-grams
frequencies = Counter()
frequencies2 = Counter()
frequencies3 = Counter()
pfrequencies = Counter()
pfrequencies2 = Counter()
pfrequencies3 = Counter()

wn_lemmatizer = WordNetLemmatizer()


def text_and_pos_tokens(req):
    """Return (words, pos_tags) for one requirement text.

    `pos_tags` are the POS tags of all lowercased tokens. `words` are the
    lowercased tokens with English stop words removed and the remainder
    lemmatized. Both the generation and the population pass use this single
    function, so every n-gram of a requirement has a matching column.
    """
    lowered = [word.lower() for word in tokenizer.tokenize(req)]
    pos_tags = [tag for (word, tag) in nltk.pos_tag(lowered)]
    words = [wn_lemmatizer.lemmatize(w) for w in lowered if w not in ENGLISH_STOP_WORDS]
    return words, pos_tags


def text_label(gram):
    """Column name of a word n-gram (tuple of words): '_w1_w2_'."""
    return '_' + '_'.join(gram) + '_'


def pos_label(gram):
    """Column name of a POS n-gram (tuple of tags): 'T1_T2'."""
    return '_'.join(gram)


# Generation of [1, 2, 3] textgrams, [1, 2, 3] POSgrams.
# The preprocessed tokens are cached so that tagging runs only once per requirement.
preprocessed = []
for req in tqdm(data['RequirementText'], desc='n-grams generation', position=0):
    words, pos_tags = text_and_pos_tokens(req)
    preprocessed.append((words, pos_tags))
    frequencies.update(words)
    frequencies2.update(ngrams(words, 2))
    frequencies3.update(ngrams(words, 3))
    pfrequencies.update(pos_tags)
    pfrequencies2.update(ngrams(pos_tags, 2))
    pfrequencies3.update(ngrams(pos_tags, 3))

# Labeling of the features: word n-grams first, then POS n-grams, each in
# order of first appearance. dict.fromkeys removes duplicate labels while
# keeping the first position.
feature_labels = list(dict.fromkeys(
    [text_label((w,)) for w in frequencies]
    + [text_label(g) for g in frequencies2]
    + [text_label(g) for g in frequencies3]
    + [pos_label((t,)) for t in pfrequencies]
    + [pos_label(g) for g in pfrequencies2]
    + [pos_label(g) for g in pfrequencies3]
))

print (len(frequencies), len(frequencies2), len(frequencies3), len(pfrequencies), len(pfrequencies2), len(pfrequencies3))

# Populating the n-grams: binary presence matrix, one row per requirement.
# A label -> column-index dictionary replaces the scan over all column names
# that was previously done for every single token.
column_of = {label: j for j, label in enumerate(feature_labels)}
presence = np.zeros((len(data), len(feature_labels)), dtype=np.int64)
for row, (words, pos_tags) in enumerate(tqdm(preprocessed, desc='n-grams population', position=0)):
    for n in (1, 2, 3):
        for gram in ngrams(words, n):
            presence[row, column_of[text_label(gram)]] = 1
        # POS n-grams are built from the tags only. The previous code compared
        # str((word, tag)) with the column names, so POS unigrams never matched.
        for gram in ngrams(pos_tags, n):
            presence[row, column_of[pos_label(gram)]] = 1

# Attach all feature columns at once (avoids thousands of single-column
# inserts). Dropping same-named columns first keeps the cell re-runnable.
ngram_features = pd.DataFrame(presence, columns=feature_labels, index=data.index)
data = pd.concat([data.drop(columns=feature_labels, errors='ignore'), ngram_features], axis=1)

data.columns = data.columns.map(str)

print (data.head())

# The enriched dataset is now saved
data.to_csv('dataset-full.csv', encoding='utf-8')

## 4. Feature reduction

We reduce the dimensionality of the data. Change the *target* variable in the second cell to choose which classifier the features are selected for: functional (`IsFunctional`), quality (`IsQuality`), only functional (`OnlyFunctional`), or only quality (`OnlyQuality`) requirements.


In [0]:
# Creation of an ensemble that uses adaptive boost, gradient boost, extra trees, and random forest
def createTop (nfeatures, data, X_train, y_train, target):
  """Rank features with a four-model ensemble and export the top ones to CSV.

  Fits AdaBoost, gradient boosting, extra trees and random forest classifiers
  on (X_train, y_train), sums their feature importances, and writes the
  `nfeatures` highest-ranked columns of `data` -- together with the
  'RequirementText', 'ProjectID', 'Class' columns and the label columns of the
  chosen target -- to 'promise-km-<count>-<suffix>.csv'.

  Parameters:
    nfeatures: number of top-ranked features to keep. If it exceeds the number
      of columns in X_train, all features are kept and the file name carries
      the number actually kept.
    data: DataFrame holding the metadata, label and feature columns to export.
    X_train: DataFrame of feature columns used for fitting.
    y_train: labels for X_train.
    target: one of 'IsFunctional', 'IsQuality', 'OnlyFunctional', 'OnlyQuality'.

  Raises:
    ValueError: if `target` is not one of the supported names. The check runs
      before any model is trained.
  """
  # target -> (label columns written to the export, file-name suffix)
  exports = {
    'OnlyQuality': (['OnlyQuality', 'IsFunctional'], 'oq'),
    'OnlyFunctional': (['IsQuality', 'OnlyFunctional'], 'of'),
    'IsQuality': (['IsQuality', 'IsFunctional'], 'q'),
    'IsFunctional': (['IsQuality', 'IsFunctional'], 'f'),
  }
  if target not in exports:
    raise ValueError("Unknown target %r; expected one of %s" % (target, sorted(exports)))
  label_columns, appendix = exports[target]

  classifiers = [
    AdaBoostClassifier(random_state=42, n_estimators=30),
    GradientBoostingClassifier(random_state=42, n_estimators=30, max_depth = 5),
    ExtraTreesClassifier(random_state=42, n_estimators=30, max_depth = 5),
    RandomForestClassifier(random_state=42, n_estimators=30, max_depth = 5),
  ]

  # Sorting in order of importance: the importances of the four models are
  # summed, which gives the same ranking as their average
  importances = sum(clf.fit(X_train, y_train).feature_importances_ for clf in classifiers)
  indices = np.argsort(importances)[::-1]

  # Never ask for more features than exist
  nkept = max(0, min(nfeatures, len(indices)))

  # Print the feature ranking
  print("Feature ranking:")

  tokeep = []
  for rank, position in enumerate(indices[:nkept], start=1):
    print("%d. feature %s (%f)" % (rank, X_train.columns[position], importances[position]))
    tokeep.append(X_train.columns[position])

  tokeep.extend(['RequirementText', 'ProjectID', 'Class'])
  tokeep.extend(label_columns)

  reduced = data[tokeep]

  print (reduced.head())
  reduced.to_csv('promise-km-' + str(nkept) + '-' + appendix + '.csv', encoding='utf-8')

In [0]:
# Set the target: choose between IsFunctional, IsQuality, OnlyFunctional, OnlyQuality
target = 'OnlyQuality'

data = pd.read_csv('dataset-full.csv', engine='python')
# The first column is the row index that to_csv wrote when the enriched
# dataset was saved; it is not a feature.
datarep = data.drop(data.columns[0], axis=1)

# Derive the label column (where needed) and list the columns that must not
# be used as features for the chosen target.
if target=='OnlyQuality':
  datarep['OnlyQuality'] = ~datarep['IsFunctional'] & datarep['IsQuality']
  todrop = ['RequirementText', 'Class', 'ProjectID', 'IsFunctional', 'IsQuality']
elif target=='OnlyFunctional':
  datarep['OnlyFunctional'] = datarep['IsFunctional'] & ~datarep['IsQuality']
  todrop = ['RequirementText', 'Class', 'ProjectID', 'IsFunctional', 'IsQuality']
elif target=='IsQuality':
  todrop = ['RequirementText', 'Class', 'ProjectID', 'IsFunctional']
elif target=='IsFunctional':
  todrop = ['RequirementText', 'Class', 'ProjectID', 'IsQuality']
else:
  # Without this branch an unknown target left `todrop` undefined and failed
  # later with a NameError.
  raise ValueError("Unknown target %r; choose between IsFunctional, IsQuality, OnlyFunctional, OnlyQuality" % (target,))


# Remove the features that are not used for the classification
data2 = datarep.drop(todrop, axis = 1)

# Create training and testing set
# === BEGIN REMOVED AFTER CONDITIONAL ACCEPT
# X_train, X_test, y_train, y_test = train_test_split(
#      data2.drop([target], axis=1), data2[target], test_size=0.25, random_state=42)
# print (X_train.columns)
# === END REMOVED AFTER CONDITIONAL ACCEPT


# === BEGIN REMOVED AFTER CONDITIONAL ACCEPT
# createTop (500, datarep, X_train, y_train, target)
# === END REMOVED AFTER CONDITIONAL ACCEPT

# === BEGIN ADDED AFTER CONDITIONAL ACCEPT
# The feature ranking is computed on the whole dataset (no train/test split)
createTop (100, datarep, data2.drop([target], axis=1), data2[target], target)
# === END ADDED AFTER CONDITIONAL ACCEPT
