### Preprocessor
#### The preprocessor takes raw text as input and applies the following transformations in order: 
1. remove punctuation, 2. remove numbers, 3. lower case, 4. tokenization, 5. lemmatization, 6. remove extra white space.
When instantiated with the input (which takes only one line of code), the preprocessor class should return the transformed output as a word-tokenized and lemmatized string.

In [3]:
import nltk
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from string import punctuation
from collections import Counter

In [4]:
# Shared Porter stemmer instance; tokenize_lemmatize calls ps.stem on every token.
ps = PorterStemmer()
# The ten decimal digit characters '0'..'9', which preprocess strips from the text.
numbers = [str(digit) for digit in range(10)]

In [24]:
# Functions that together take in raw text and return a tokenized and lemmatized list of words after applying the
# preprocessing steps mentioned in the paper:
def preprocess(text: str) -> str:
    """
    Strip punctuation and digits from raw text and lower-case the result.

    Only the character-level steps are performed here (1. remove punctuation,
    2. remove numbers, 3. lower case). Whitespace is left untouched and the
    text is neither tokenized nor stemmed by this function.

    :param text: raw input text.
    :return: the cleaned, lower-cased text as a single string.
    """
    # A single translation table deletes ASCII punctuation and the digits 0-9
    # in one pass over the text instead of two separate filtering passes.
    unwanted = str.maketrans('', '', punctuation + '0123456789')
    return text.translate(unwanted).lower()



def tokenize_lemmatize(text: str) -> list:
    """
    Split text into word tokens and reduce each token to its stem.

    :Tokenization is done with nltk's word_tokenize.
    :Each token is stemmed with the module-level Porter stemmer `ps`
     (this is stemming rather than dictionary-based lemmatization).
    :Any space characters inside a stemmed token are removed.
    Return: a list of the standardised, stemmed tokens.
    """
    # Tokenize, stem and strip spaces from each token in a single pass.
    return [ps.stem(token).replace(' ', '') for token in word_tokenize(text)]



def create_lookup_tables(words: iter) -> tuple:
    """
    Build the vocabulary lookup tables for a collection of words.

    :param words: the words to count and index.
    Return: a tuple (word_count, vocab_int, int_vocab) holding the Counter of
    word frequencies, the word -> integer mapping and the integer -> word
    mapping. Integers start at 1 for the most frequent word; 0 is used for
    the '<unk>' entry.
    """
    word_count = Counter(words)
    # Rank the distinct words by how often they occur, most frequent first.
    ranked_words = sorted(word_count, key=lambda word: word_count[word], reverse=True)

    # Fill both directions of the mapping together, numbering from 1.
    vocab_int = {}
    int_vocab = {}
    for index, word in enumerate(ranked_words, start=1):
        vocab_int[word] = index
        int_vocab[index] = word

    # Index 0 is set aside for the unknown-word token.
    vocab_int['<unk>'] = 0
    int_vocab[0] = '<unk>'

    return word_count, vocab_int, int_vocab


def word_to_int(input_text: str, vocab_to_int: dict) -> list:
    """
    Encode text as a list of integers for prediction, assigning unknown words to '<unk>'.

    The text is first passed through tokenize_lemmatize; every resulting token
    is then looked up in vocab_to_int, and tokens that are not in the lookup
    use the '<unk>' entry instead.

    :param input_text: the text to encode.
    :param vocab_to_int: mapping from word to integer; it must contain the key
        '<unk>' whenever the text holds an out-of-vocabulary token.
    Return: list of integers representing words in text
    :raises KeyError: if a token is unknown and vocab_to_int has no '<unk>' key.
    """
    standardised_text = tokenize_lemmatize(input_text)
    # Membership is tested against the dict itself (constant time) rather than
    # a freshly built list of its keys; unknown tokens fall back to '<unk>'.
    return [
        vocab_to_int[word if word in vocab_to_int else '<unk>']
        for word in standardised_text
    ]

def pad_features(reviews_ints: iter, seq_length: int) -> np.ndarray:
    '''
    Turn integer-encoded reviews into a fixed-width numpy feature array.

    : Each review becomes one row of length 'seq_length'. Reviews shorter than
    seq_length are left-padded with 0's; longer reviews are truncated to their
    first seq_length entries. An empty review yields a row of all 0's.

    : reviews_ints: a sized collection of integer sequences (one per review).
    : seq_length: the number of columns in the returned array.
    Return: integer array of shape (len(reviews_ints), seq_length).
    '''

    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    for i, row in enumerate(reviews_ints):
        kept = np.array(row)[:seq_length]
        # A slice starting at -0 would select the whole row and make the
        # assignment of an empty array fail, so empty reviews stay all zeros.
        if kept.size:
            features[i, -kept.size:] = kept

    return features