# Text Mining
Text mining is used to obtain the _top k words_ appearing in the dataset that have been determined to be the most informative. These _top k words_ are used by the rule learner to generate the rule sets used for prediction. The text mining phase is conducted in the following order:

#### Data Extraction
Load all datasets to be fed into pre-processing

#### 1. Pre-Processing
* **1.1** Tokenization
* **1.2** Stop word removal
* **1.3** Stemming

#### 2. Feature Selection
* **2.1** Tf\*Idf
* **2.2** Info-Gain

#### 3. Resulting Set

| Id  | Word | Weight | Severity |
| --- | ---- | ------ | -------- |
|   1 | pred | 0.5551 |        3 |

In [94]:
import numpy as np
import pandas as pd
from pprint import pprint

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /home/jude/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jude/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading Datasets
* load csv file
* extract needed columns

In [69]:
def load_data(path):
    """Read a CSV file and return two of its columns as a NumPy array.

    The file is parsed as comma-separated text encoded in ISO-8859-1. Only
    the columns at positions 1 and 2 (Subject and Severity Rating) are kept,
    in that order; every other column is discarded.
    """
    frame = pd.read_csv(path, sep=',', encoding='ISO-8859-1')
    table = np.array(frame)

    # Selecting the wanted positions directly yields a new array that holds
    # just the Subject and Severity Rating columns.
    wanted_columns = [1, 2]
    return table[:, wanted_columns]

In [70]:
# Preview the extracted Subject / Severity Rating columns of the PITS A dataset.
pits_a_preview = load_data('../dataset/raw/pitsA.csv')
print(pits_a_preview)

[['Build 5.3: Unitialized Variables' 3]
 ['Build 5.3 FSW: Typecast Mismatch in Memory Deallocation' 3]
 ['Build 5.3 FSW: Parameter Type Mismatch' 3]
 ...
 ['The SIS and SDD are not listed within Table 1-9 of the SDP' 4]
 ['Incorrect and Incomplete traceability between the PSMP and the SDR' 5]
 ['Incorrect and Incomplete traceability between the PSMP and the SDR' 5]]


# 1. Pre-Processing

## 1.1 Tokenization
The steps outlined in the paper:
1. Replace all punctuation with spaces
2. Remove non-printable characters
3. Convert all letters to lowercase

In [20]:
def tokenize(data):
    """Normalise a piece of text and split it into word tokens.

    The text is cleaned character by character before tokenisation:

    * punctuation characters (the set listed below) and any whitespace
      character become a single space, so they act as word separators;
    * characters outside the printable ASCII range are removed;
    * the remaining text is converted to lowercase.

    Args:
        data: The string to tokenize.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize`` on the
        cleaned text.
    """
    # Raw string: the backslash is itself one of the punctuation characters,
    # and a non-raw '\,' is an invalid escape sequence.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    # Collect the cleaned characters in a list and join once, instead of
    # growing a string with repeated concatenation.
    cleaned = []
    for char in data:
        if char in punctuations or char.isspace():
            cleaned.append(' ')
        elif ' ' < char <= '~':
            # printable ASCII that is not a separator: keep as-is
            cleaned.append(char)
        # anything else is non-printable or non-ASCII and is dropped

    # convert all letters to lowercase
    normalised = ''.join(cleaned).lower()

    # return word tokens
    return nltk.word_tokenize(normalised)

In [21]:
# Show the tokens produced for one sample issue subject.
sample_tokens = tokenize('Build 5.3: Unitialized Variables')
print(sample_tokens)

['build', '5', '3', 'unitialized', 'variables']


## 1.2 Stop Word Removal
In the original paper, the number of stop words used is 262. The number of English stop words included in the NLTK library is 179.

In [22]:
def remove_stopwords(tokens):
    """Drop English stop words from a sequence of tokens.

    Args:
        tokens: Iterable of string tokens. They are compared verbatim with
            the entries of NLTK's English stop word list.

    Returns:
        A new list holding the tokens that are not stop words, in their
        original order.
    """
    # The corpus reader returns a list; a set makes each membership test
    # constant-time instead of a scan over the whole stop word list.
    eng_stopwords = set(stopwords.words('english'))

    return [token for token in tokens if token not in eng_stopwords]

In [23]:
# Tokenize a sample subject, then show what is left once stop words are dropped.
sample_tokens = tokenize('L3-SFS-887 incomplete traceability to the L4 requirements')
print(remove_stopwords(sample_tokens))

['l3', 'sfs', '887', 'incomplete', 'traceability', 'l4', 'requirements']


## 1.3 Stemming
Porter stemmer was used.

In [24]:
def stem_words(no_stop_tokens):
    """Return the Porter stem of every token, keeping the input order."""
    stem = PorterStemmer().stem
    return [stem(word) for word in no_stop_tokens]

In [25]:
# Run a sample subject through the whole pre-processing chain, one step per line.
sample_tokens = tokenize('L3-SFS-887 incomplete traceability to the L4 requirements')
sample_without_stopwords = remove_stopwords(sample_tokens)
print(stem_words(sample_without_stopwords))

['l3', 'sf', '887', 'incomplet', 'traceabl', 'l4', 'requir']


# 2. Feature Selection

## 2.1 Tf\*Idf
The original paper experimented with different values for the _top k_ words and settled on k=100.

In [57]:
def apply_tfidf(documents, top_k=100):
    """Select up to ``top_k`` vocabulary terms, ordered by ascending IDF.

    A ``TfidfVectorizer`` with default settings is fitted on ``documents``
    and its vocabulary is ordered by the fitted inverse document frequency,
    smallest first.

    Args:
        documents: Iterable of strings, one per document.
        top_k: Maximum number of terms to return. Defaults to 100, the value
            this function previously hard-coded.

    Returns:
        A list of at most ``top_k`` feature names.
    """
    vectorizer = TfidfVectorizer()
    # Only the fitted IDF vector is consulted below, so the tf-idf matrix
    # itself does not need to be built.
    vectorizer.fit(documents)

    # NOTE(review): the ordering uses the IDF weights alone (lowest first,
    # i.e. the terms found in the most documents); term frequency is not part
    # of the score. Confirm whether a combined tf*idf score was intended.
    indices = np.argsort(vectorizer.idf_)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()

    # create a new list of the features based on sorted indices
    return [features[i] for i in indices[:top_k]]

In [58]:
# Load the PITS A issue reports (subject, severity) from CSV.
pitsA = load_data('../dataset/raw/pitsA.csv')

# Run every subject through tokenization, stop word removal and stemming,
# then re-join the stems into one space-separated string per report.
documents = [
    ' '.join(stem_words(remove_stopwords(tokenize(report[0]))))
    for report in pitsA
]

# Select the top k=100 words with apply_tfidf and display them.
top_k = apply_tfidf(documents)
print(top_k)


['requir', 'sr', 'test', 'l3', 'obc', 'engcntrl', 'sf', 'script', 'miss', 'build', 'code', 'rvm', 'l4', 'dh', 'inconsist', 'projecta', 'link', 'l2', 'ac', 'rqt', 'incomplet', 'trace', 'traceabl', 'sc', 'fsw', 'v1', 'issu', 'cm', 'fsrd', 'analysi', 'fpr', 'address', 'adequ', 'text', 'procedur', 'initi', 'verifi', 'vml', 'incorrect', 'softwar', 'rta', 'unclear', 'srup', 'satisfi', 'intent', 'uplink', 'prd', 'fulli', 'b5', 'mode', 'instanc', 'data', 'variabl', 'function', 'bound', 'control', 'command', 'defin', 'array', 'vm', 'flight', 'ta', 'symbol', 'use', 'possibl', 'pointer', 'exist', 'non', 'rm', 'associ', '11', 'fgi', 'v0', 'req', 'access', 'surom', 'case', 'unnecessari', 'fs', 'definit', 'default', 'line', 'may', 'attitud', 'cast', 'flow', '12', '04', 'unusu', 'within', 'return', 'switch', 'specif', 'memori', 'icd', 'referenc', 'fp', 'design', '10', 'macro']


## 2.2 Info-Gain
Rerank _top k_ words.

The definition of the mutual information method provided by sklearn appears to be equivalent to information gain, so it is used here.

#### Method
1. create feature matrix and target vector
2. generate information gain weights for features in dataset
3. map weight of features to list of top k words
4. sort top k words by weights

In [95]:
def rank_top_k(documents, top_k):
    """Reorder ``top_k`` words by their information gain for the severity.

    Args:
        documents: Sequence of ``[text, severity]`` pairs; ``text`` is the
            pre-processed subject and ``severity`` its class label.
        top_k: Sequence of words to rank.

    Returns:
        The words of ``top_k`` sorted by ascending weight, so the word with
        the lowest information gain comes first and the highest comes last.
        A word that is absent from the vocabulary built from ``documents``
        is given a weight of 0.0.
    """
    # create feature matrix and target vector
    # subjects = text subject, severities = subject severity
    subjects = [document[0] for document in documents]
    severities = [document[1] for document in documents]

    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(subjects)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()

    # store features with info gain weight
    gains = mutual_info_classif(counts, severities, discrete_features=True)
    gain_by_word = dict(zip(features, gains))

    # map weights to top k words; a missing word must not produce None,
    # which np.argsort cannot compare with the float weights
    weights = [gain_by_word.get(word, 0.0) for word in top_k]

    # sort by weight (ascending)
    order = np.argsort(weights)
    return [top_k[i] for i in order]

# Test Implementation

In [96]:
# Load the PITS A issue reports (subject, severity) from CSV.
pitsA = load_data('../dataset/raw/pitsA.csv')

# Pre-process every subject and pair the re-joined stems with its severity.
documents = []
for subject, severity in pitsA:
    stems = stem_words(remove_stopwords(tokenize(subject)))
    documents.append([' '.join(stems), severity])

# Select the top k=100 words from the pre-processed subjects alone.
subjects = [text for text, _ in documents]
top_k = apply_tfidf(subjects)

# Reorder the selected words with info-gain and display the result.
ranked_top_k = rank_top_k(documents, top_k)
pprint(ranked_top_k)


['default',
 'switch',
 'possibl',
 'control',
 'fp',
 'design',
 'flight',
 'flow',
 'may',
 'case',
 'attitud',
 'mode',
 'return',
 'uplink',
 'address',
 'adequ',
 'line',
 'procedur',
 'specif',
 '11',
 'fpr',
 'vm',
 'initi',
 'definit',
 'associ',
 'bound',
 'fulli',
 'memori',
 'command',
 'within',
 'pointer',
 'softwar',
 'access',
 'icd',
 'b5',
 'array',
 'use',
 'symbol',
 '12',
 '10',
 'function',
 'non',
 'fgi',
 'variabl',
 'incorrect',
 'ta',
 'fs',
 'inconsist',
 'macro',
 'unusu',
 'defin',
 'prd',
 'data',
 'cast',
 'verifi',
 'srup',
 'v0',
 'unclear',
 'req',
 '04',
 'unnecessari',
 'referenc',
 'intent',
 'l2',
 'exist',
 'instanc',
 'analysi',
 'surom',
 'vml',
 'rm',
 'satisfi',
 'code',
 'link',
 'fsw',
 'rta',
 'build',
 'incomplet',
 'dh',
 'v1',
 'trace',
 'obc',
 'ac',
 'projecta',
 'fsrd',
 'l4',
 'text',
 'traceabl',
 'sc',
 'sf',
 'issu',
 'cm',
 'sr',
 'l3',
 'miss',
 'rqt',
 'requir',
 'test',
 'engcntrl',
 'script',
 'rvm']
