In [1]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
#nltk.download()
#nltk.download('punkt')
from nltk.corpus import stopwords
#!pip install gensim
from gensim.models.word2vec import Word2Vec

def load_dataset(name, nrows=None):
    """Load one of the known datasets from the ``data_nfr`` directory.

    Parameters
    ----------
    name : str
        Dataset key: 'unlabeled_train', 'labeled_train' or 'test'.
    nrows : int, optional
        Maximum number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The parsed file. The row count is also printed as a side effect.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known dataset keys.
    """
    datasets = {
        'unlabeled_train': 'nfr.csv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(
            'Unknown dataset {!r}; expected one of {}'.format(name, sorted(datasets)))
    data_file = os.path.join('data_nfr', datasets[name])
    # Choose the delimiter from the file extension: the .tsv files are
    # tab-separated, so reading them with sep=',' would not split their
    # columns correctly. The .csv file keeps the original comma separator.
    sep = '\t' if data_file.lower().endswith('.tsv') else ','
    df = pd.read_csv(data_file, sep=sep, escapechar='\\', nrows=nrows,
                     encoding='ISO-8859-1')
    print('Number of requirements: {}'.format(len(df)))
    return df

'''
First load in the unlabeled dataset.
This will be used for training the word vectors.
'''
# 'unlabeled_train' maps to nfr.csv inside load_dataset; the RequirementText
# column of this frame is what gets tokenized into sentences further down.
df = load_dataset('unlabeled_train')
# Preview the first rows to confirm the expected columns were parsed.
print(df.head())

'''
Next, clean the text as in part 1 and split it into sentences.
This time, however, stopwords are not removed. Sentences are split using the NLTK punkt tokenizer.
The result is a list of sentences drawn from all the requirements combined; each sentence is
a list of cleaned words (still including stopwords).
'''


# In[11]:


# English stopwords from NLTK, stored as a set so the membership checks in
# clean_text are cheap. Presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand -- TODO confirm for a fresh environment.
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    """Turn a raw text fragment into a list of lowercase alphabetic words.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, and the result is lowercased and
    split on whitespace.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, words found in ``eng_stopwords`` are dropped.

    Returns
    -------
    list of str
        The cleaned words, in their original order.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    tokens = re.sub(r'[^a-zA-Z]', ' ', plain).lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

# English Punkt sentence tokenizer, loaded once at module level and reused by
# split_sentences. Needs the 'punkt' resource to be available locally (see the
# commented-out nltk.download('punkt') next to the imports).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    """Decorator that prints a progress message every 1000 calls of ``f``.

    A message is printed on the 1st, 1001st, 2001st, ... call. Every call is
    forwarded to ``f`` unchanged and its return value is passed back.

    Parameters
    ----------
    f : callable
        The function to wrap.

    Returns
    -------
    callable
        A wrapper with the same call signature as ``f``; thanks to
        ``functools.wraps`` it also keeps ``f``'s ``__name__`` and ``__doc__``.
    """
    import functools  # local import keeps this block self-contained

    n = 0  # number of calls seen so far, shared with the closure below

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        # "== 1" (not "== 0") so that the very first call is reported too.
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped

@print_call_counts
def split_sentences(req):
    """Split one requirement text into cleaned, tokenized sentences.

    The stripped text is cut into sentences by the module-level ``tokenizer``;
    each non-empty sentence is then passed through ``clean_text`` with its
    default arguments (stopwords are kept).

    Parameters
    ----------
    req : str
        The raw requirement text.

    Returns
    -------
    list of list of str
        One word list per sentence.
    """
    cleaned = []
    for raw_sentence in tokenizer.tokenize(req.strip()):
        if raw_sentence:
            cleaned.append(clean_text(raw_sentence))
    return cleaned

%time sentences = sum(df.RequirementText.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

#get_ipython().magic('time sentences = sum(df.review.apply(split_sentences), [])')
#print('{} reviews -> {} sentences'.format(len(df), len(sentences)))


'''
Here we will train the word vector model
Default logging setup and parameters taken from the tutorial.
'''

import logging
# INFO-level logging lets gensim report vocabulary and training progress.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# The file name encodes the main hyper-parameters so saved models are
# distinguishable on disk.
model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

print('Training model...')
# NOTE(review): `size=` and `init_sims` below look like the gensim 3.x API;
# newer gensim releases are believed to use `vector_size=` instead -- confirm
# against the installed version before upgrading.
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
# Create the output directory first so that save() does not fail on a fresh
# checkout where 'models_nfr' does not exist yet.
model_dir = 'models_nfr'
os.makedirs(model_dir, exist_ok=True)
model.save(os.path.join(model_dir, model_name))

'''
Some examples of how the model works
'''


# In[20]:


#print(model.doesnt_match("pool load secure heartbeat".split()))
#print(model.doesnt_match('secure session authorize authenticate'.split()))


# In[21]:


#model.most_similar("pooling")


# In[22]:


#model.most_similar("authenticate")


# In[23]:


#model.most_similar("load")


Using TensorFlow backend.


Number of requirements: 625
   ProjectID                                    RequirementText class
0          1  'The system shall refresh the display every 60...    PE
1          1  'The application shall match the color of the ...    LF
2          1  ' If projected  the data must be readable.  On...    US
3          1  ' The product shall be available during normal...     A
4          1  ' If projected  the data must be understandabl...    US
method split_sentences called 1 times


2017-09-19 00:36:39,035 : INFO : collecting all words and their counts
2017-09-19 00:36:39,038 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-19 00:36:39,044 : INFO : collected 1624 word types from a corpus of 12152 raw words and 771 sentences
2017-09-19 00:36:39,048 : INFO : Loading a fresh vocabulary
2017-09-19 00:36:39,054 : INFO : min_count=10 retains 191 unique words (11% of original 1624, drops 1433)
2017-09-19 00:36:39,059 : INFO : min_count=10 leaves 8779 word corpus (72% of original 12152, drops 3373)
2017-09-19 00:36:39,060 : INFO : deleting the raw counts dictionary of 1624 items
2017-09-19 00:36:39,063 : INFO : sample=0.001 downsamples 73 most-common words
2017-09-19 00:36:39,064 : INFO : downsampling leaves estimated 4138 word corpus (47.1% of prior 8779)
2017-09-19 00:36:39,064 : INFO : estimated required memory for 191 words and 300 dimensions: 553900 bytes
2017-09-19 00:36:39,066 : INFO : resetting layer weights
2017-09-19 00:36:39,07

CPU times: user 130 ms, sys: 10 ms, total: 140 ms
Wall time: 144 ms
625 reviews -> 771 sentences
Training model...


'\nSome examples of how the model works\n'