# Pre-Processing

In [46]:
import string

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /home/jude/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jude/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenization
The steps outlined in the paper:
1. Replace all punctuation with spaces
2. Remove non-printable characters
3. Convert all letters to lowercase

In [45]:
# Translation table that maps every ASCII punctuation mark to a single space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Printable ASCII characters: letters, digits, punctuation and whitespace.
_PRINTABLE_ASCII = frozenset(string.printable)


def tokenize(data):
    """Normalise a raw text string and split it into word tokens.

    Steps:
      1. Drop every character that is not printable ASCII. Whitespace such as
         tabs and newlines is kept so that adjacent words are not merged.
      2. Convert all letters to lowercase.
      3. Replace every ASCII punctuation mark with a space.
      4. Split the result with ``nltk.word_tokenize``.

    NOTE(review): "non-printable" is interpreted here as "not in
    ``string.printable``", which also drops non-ASCII letters -- confirm
    this matches the paper's definition.

    Args:
        data: The text to tokenize (a ``str``).

    Returns:
        A list of lowercase word tokens, in order of appearance.
    """
    # Remove non-printable characters (ASCII only).
    printable_only = ''.join(char for char in data if char in _PRINTABLE_ASCII)

    # Convert all letters to lowercase.
    lowercase_string = printable_only.lower()

    # Replace all punctuation with spaces in a single pass.
    without_punctuation = lowercase_string.translate(_PUNCT_TO_SPACE)

    # Return word tokens.
    return nltk.word_tokenize(without_punctuation)

In [14]:
# Sanity check: '.' and ':' become spaces and the text is lowercased before
# being split into word tokens.
# NOTE(review): the saved output of this cell also shows two intermediate
# strings that the current tokenize() does not print -- restart the kernel
# and re-run to refresh it.
print(tokenize('Build 5.3: Unitialized Variables'))

build 5.3: unitialized variables
build 5 3  unitialized variables
['build', '5', '3', 'unitialized', 'variables']


## Stop Word Removal
In the original paper, 262 stop words were used. NLTK's English stop word list contains only 179, so the set of tokens removed here may differ slightly from the paper's.

In [40]:
def remove_stopwords(tokens, stop_words=None):
    """Filter stop words out of a sequence of tokens.

    Args:
        tokens: Iterable of word tokens. The comparison is exact, so tokens
            are expected to be lowercased already.
        stop_words: Optional iterable of words to remove. When ``None``
            (the default), NLTK's English stop word list is used. Pass a
            custom list to reproduce a different stop list.

    Returns:
        A new list with the tokens that are not stop words, in their
        original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')

    # A set gives constant-time membership tests instead of scanning the
    # whole stop word list for every token.
    stop_set = frozenset(stop_words)

    return [token for token in tokens if token not in stop_set]

In [41]:
# Sanity check: 'to' and 'the' are dropped from the tokenized sample string.
print(remove_stopwords(tokenize('L3-SFS-887 incomplete traceability to the L4 requirements')))

l3-sfs-887 incomplete traceability to the l4 requirements
l3 sfs 887 incomplete traceability to the l4 requirements
['l3', 'sfs', '887', 'incomplete', 'traceability', 'l4', 'requirements']


## Stemming
Porter stemmer was used.

In [43]:
def stem_words(no_stop_tokens):
    """Reduce every token to its Porter stem.

    Args:
        no_stop_tokens: Iterable of word tokens (stop words already removed).

    Returns:
        A list holding the stem of each token, in the same order as the input.
    """
    # One stemmer instance is shared by all tokens in this call.
    stem = PorterStemmer().stem
    return [stem(word) for word in no_stop_tokens]

In [44]:
# Sanity check of the chain so far: tokenize -> remove stop words -> stem.
print(stem_words(remove_stopwords(tokenize('L3-SFS-887 incomplete traceability to the L4 requirements'))))

l3-sfs-887 incomplete traceability to the l4 requirements
l3 sfs 887 incomplete traceability to the l4 requirements
['l3', 'sf', '887', 'incomplet', 'traceabl', 'l4', 'requir']


## Tf*Idf
**TODO:** Come back to this once the data is loaded and a list of documents (feature-summary) is available.

In [53]:
def apply_tfidf(documents):
    """Fit a tf-idf model on a corpus and print its vocabulary.

    Args:
        documents: Iterable of document strings. Each element is treated as
            one document, so pass a list of strings rather than a single
            string or a list of tokens.

    Returns:
        The tf-idf matrix produced by ``TfidfVectorizer.fit_transform``
        (one row per document, one column per vocabulary term).
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # tolist() converts the returned array into a plain list of str so
        # the printed output keeps the same format as before.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    print(feature_names)
    return tfidf_matrix

In [54]:
# TfidfVectorizer treats each element of its input as one document, so the
# stemmed tokens are joined back into a single string and passed as a
# one-document corpus (passing the token list would make every token its
# own document).
sample_tokens = stem_words(remove_stopwords(tokenize('L3-SFS-887 incomplete traceability to the L4 requirements')))
apply_tfidf([' '.join(sample_tokens)])

['887', 'incomplet', 'l3', 'l4', 'requir', 'sf', 'traceabl']


# Loading Data