In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
# Fetch the NLTK resources this notebook relies on: the WordNet data backs
# WordNetLemmatizer and the stopwords corpus backs stopwords.words('english').
# NOTE(review): no cell visible here calls a punkt-based tokenizer (the text is
# split on spaces instead), so the 'punkt' download may be unnecessary -- confirm.
nltk.download('punkt')
nltk.download('wordnet')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/adara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/adara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/adara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its 'text' and 'label' columns.
req_text = pd.read_csv('./datasets/domain_masked.csv')

In [4]:
# Regex clean-up passes, applied in exactly this order. The first three patterns
# match a literal backslash followed by r / n / t (escape sequences stored as
# text), not the control characters themselves.
cleanup_passes = (
    (r'\\r', ' '),
    (r'\\n', ' '),
    (r'\\t', ' '),
    (r'    ', ' '),   # each run of four spaces collapses to one
    (r'""', ''),      # doubled double-quotes are dropped
)

for pattern, replacement in cleanup_passes:
    req_text['text'] = req_text['text'].str.replace(pattern, replacement, regex=True)

In [5]:
# Lower-case every document. The TF-IDF vectorizer further down is created with
# lowercase=False, so this is the only case folding the text receives.
req_text['text'] = req_text['text'].str.lower()

In [6]:
punctuation_signs = list("?:!.,;`'")

# Drop possessive endings while the apostrophe is still present: once the loop
# below has deleted every apostrophe, "user's" would already have become "users"
# and could no longer be told apart from a plural.
req_text['text'] = req_text['text'].str.replace(r"'s\b", '', regex=True)

# regex=False: every sign is meant as a literal character. '?' and '.' are regex
# metacharacters, and the default value of `regex` differs between pandas
# versions, so the intent is stated explicitly.
for punct_sign in punctuation_signs:
    req_text['text'] = req_text['text'].str.replace(punct_sign, '', regex=False)

In [7]:
# NOTE(review): the pattern "'s'" needs an apostrophe on both sides of the "s",
# and the punctuation loop in the previous cell has already deleted every
# apostrophe from the text, so this replacement cannot match anything here.
req_text['text'] = req_text['text'].str.replace("'s'","", regex=True)

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()
nrows = len(req_text)

# Lemmatize each document word by word, treating every token as a verb
# (pos="v"). Tokens come from splitting on single spaces, so consecutive spaces
# produce empty tokens.
#
# The column is iterated directly rather than looked up with req_text.loc[row]
# for row in range(nrows): that lookup only works when the index is exactly
# 0..nrows-1 and builds a whole row Series per document just to read one cell.
lemmatized_text_list = [
    " ".join(wordnet_lemmatizer.lemmatize(word, pos="v") for word in text.split(" "))
    for text in req_text['text']
]

In [9]:
# Overwrite the 'text' column with the lemmatized documents (one list entry per
# row, in row order).
req_text['text'] = lemmatized_text_list

In [10]:
# English stop-word list taken from the NLTK stopwords corpus.
stop_words = list(stopwords.words('english'))

In [11]:
# Delete every stop word that appears as a whole word (\b...\b). Only the word
# itself is removed; the spaces around it stay in place.
for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    # regex=True must be explicit: pandas >= 2.0 defaults to regex=False, which
    # would search for the literal text "\bword\b" and remove nothing.
    req_text['text'] = req_text['text'].str.replace(regex_stopword, '', regex=True)

In [12]:
# Learn the label <-> integer mapping from the 'label' column. fit() returns the
# encoder itself, so construction and fitting chain into one expression.
le = LabelEncoder().fit(req_text['label'])

# Shown as the cell output: the distinct labels, in encoded order.
le.classes_

array(['AVAILABILITY', 'FAULT TOLERANCE', 'MAINTAINABILITY',
       'PERFORMANCE', 'SCALABILITY', 'SECURITY', 'USABILITY'],
      dtype=object)

In [13]:
# X: the cleaned documents handed to the vectorizer.
# y: the labels as integers, using the mapping fitted on the 'label' column.
X = req_text['text']
y = le.transform(req_text['label'])

In [14]:
# Settings handed to the TF-IDF vectorizer when it is built.
max_features = 300      # cap on the vocabulary size
max_df = 1.0            # float: proportion of documents; 1.0 applies no upper cut-off
min_df = 10             # int: a term must occur in at least 10 documents
ngram_range = (1, 2)    # unigrams and bigrams

In [15]:
# Build the vectorizer from the settings defined earlier. lowercase=False and
# stop_words=None because case folding and stop-word removal were already done
# on the text column by hand.
tfidf = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,
    stop_words=None,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=True,
    norm='l2',
)

# Fit on every document and densify the sparse result into a NumPy array.
tfidf_matrix = tfidf.fit_transform(X)
features_train = tfidf_matrix.toarray()
labels_train = y
print(features_train.shape)

(630, 148)


In [17]:
from tabulate import tabulate
from IPython.display import Markdown, display

features = []

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back only on older releases.
# The vocabulary does not depend on the label, so it is read once, outside the loop.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

for label in range(len(le.classes_)):
    # One-vs-rest chi-squared test: this class against all the others.
    features_chi2 = chi2(features_train, labels_train == label)
    # argsort is ascending, so the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    # One table row per class: label, top-5 unigrams, top-2 bigrams.
    features.append(["**{}**".format(le.inverse_transform([label])[0]),
                     "`{}`".format('`, `'.join(unigrams[-5:])),
                     "`{}`".format('`, `'.join(bigrams[-2:]))])

In [18]:
# Render the per-class rows as a GitHub-flavoured Markdown table. The frame's
# integer index becomes the unlabelled first column of the rendered table.
data = pd.DataFrame(features)
markdown_table = tabulate(
    data,
    headers=['Quality Attribute', 'Unigrams', 'Bigrams'],
    tablefmt="github",
    numalign="right",
)
display(Markdown(markdown_table))

|    | Quality Attribute   | Unigrams                                                   | Bigrams                          |
|----|---------------------|------------------------------------------------------------|----------------------------------|
|  0 | **AVAILABILITY**    | `failure`, `achieve`, `hours`, `availability`, `available` | `system must`, `shall available` |
|  1 | **FAULT TOLERANCE** | `eg`, `control`, `result`, `failure`, `operate`            | `within system`, `system shall`  |
|  2 | **MAINTAINABILITY** | `maintain`, `design`, `new`, `update`, `maintenance`       | `use system`, `user able`        |
|  3 | **PERFORMANCE**     | `status`, `result`, `less`, `response`, `fast`             | `less fast`, `response time`     |
|  4 | **SCALABILITY**     | `manner`, `capable`, `support`, `handle`, `number`         | `shall support`, `shall capable` |
|  5 | **SECURITY**        | `authorize`, `password`, `security`, `encrypt`, `access`   | `user system`, `authorize user`  |
|  6 | **USABILITY**       | `use`, `content`, `navigation`, `easy`, `page`             | `shall easy`, `use system`       |

In [18]:
import pickle

In [19]:
# Hold out 15% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the TF-IDF vectorizer was fitted on every row before this split,
# so its vocabulary and IDF weights were computed with the test rows included.
X_train, X_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.15, random_state=7)

In [20]:
# Persist each artefact to its own pickle file in the working directory,
# written in the order listed.
artefacts = {
    'X_train.pickle': X_train,
    'y_train.pickle': y_train,
    'X_test.pickle': X_test,
    'y_test.pickle': y_test,
    'features_train.pickle': features_train,
    'labels_train.pickle': labels_train,
    'label_encoder.pickle': le,
}

for filename, payload in artefacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)

In [21]:
# NOTE(review): `bigrams` is the loop variable left over from the last iteration
# of the chi-squared cell, i.e. the bigrams for the final class only, ordered by
# ascending chi-squared score. Looks like a debugging leftover -- confirm.
bigrams

['system shall',
 'must able',
 'user access',
 'system must',
 'shall available',
 'shall able',
 'authorize user',
 'user able',
 'user user',
 'user shall',
 'shall easy',
 'use system']