## Libs and Parameters

In [137]:
import numpy as np
import os
import torch
from sklearn.feature_extraction import stop_words
from importlib import reload

import IMDBDatum as imdb_data
import ngrams

from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange
import pickle

#### Hyperparameters

In [135]:
# --- n-gram representation -------------------------------------------------
NGRAM_SIZE = 2        # (1, 2, 3, 4); largest n-gram order that gets extracted
VOC_SIZE = 10000      # takes top n word from the vocab
EMBEDDING_DIM = 100   # dimension size for the ngram embeddings
NGRAM_MODE = 'naive'  # tokenizer mode: 'naive' splits on single spaces, 'spacy' uses the spaCy tokenizer

# --- optimisation ----------------------------------------------------------
LEARNING_RATE = 1e-3
TRAINING_EPOCHS = 5
BATCH_SIZE = 32

#### Other params

In [39]:
# Index reserved for the padding token
PAD_IDX = 0

# Location of the extracted aclImdb corpus and of its two labelled splits
data_dir = r'./data/aclImdb/'
train_dir, test_dir = (os.path.join(data_dir, split) for split in ("train", "test"))

# Sizes handed to imdb_data.construct_dataset for each split
TRAIN_SIZE, VALIDATION_SIZE, TEST_SIZE = 20000, 5000, 25000

### I. Data Loading

In [40]:
!tree -d

.
|-- __pycache__
`-- data
    `-- aclImdb
        |-- test
        |   |-- neg
        |   `-- pos
        `-- train
            |-- neg
            |-- pos
            `-- unsup

10 directories


In [41]:
# Load Dataset - should take less than 1 min
# The training and the validation split are both read from train_dir; the validation
# call additionally passes offset=int(TRAIN_SIZE/2). The test split comes from test_dir.
# NOTE(review): presumably construct_dataset splits the requested size evenly between the
# pos/neg folders and `offset` skips the files already used for training, so the two
# splits do not overlap — confirm against IMDBDatum.construct_dataset.
train_set = imdb_data.construct_dataset(train_dir, TRAIN_SIZE)
validation_set = imdb_data.construct_dataset(train_dir, VALIDATION_SIZE, offset=int(TRAIN_SIZE/2))
test_set = imdb_data.construct_dataset(test_dir, TEST_SIZE)

### Scratch - trying the ngrams code

In [43]:
# Scratch: run the n-gram extraction on the first TRIAL_SIZE training reviews only.
# NOTE: extract_ngram_from_text is defined in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
dataset = train_set
n = NGRAM_SIZE
TRIAL_SIZE = 100

# The per-review counter is bound to `ngram_counts` rather than `ngrams`: at top level
# the name `ngrams` is the module imported in the first cell, and rebinding it here would
# silently replace that module with a Counter.
# The range is clamped so a dataset shorter than TRIAL_SIZE does not raise IndexError.
for i in tqdm(range(0, min(TRIAL_SIZE, len(dataset)))):
    text_datum = dataset[i].raw_text
    ngram_counts, tokens = extract_ngram_from_text(text_datum, n)
    dataset[i].set_ngram(ngram_counts)
    dataset[i].set_tokens(tokens)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [140]:
import spacy
import string
from collections import Counter
from sklearn.feature_extraction import stop_words
from tqdm import tqdm_notebook as tqdm

# spaCy pipeline used by tokenize(mode='spacy'); it is called on the raw sentence and the
# .text of each resulting token is kept.
tokenizer = spacy.load('en_core_web_sm')
# string.punctuation is one str, so `token not in punctuations` is a substring test
# (it also rejects the empty string), not a per-character set lookup.
punctuations = string.punctuation

# Reserved vocabulary entries: index 0 is the padding token, index 1 replaces any ngram
# that is not in the indexer.
PAD_TOKEN, PAD_IDX = '<pad>', 0
UNK_TOKEN, UNK_IDX = '<unk>', 1


def tokenize(sent, remove_stopwords=True, remove_punc=True, mode='spacy'):
    """
    Split a sentence into lower-cased tokens, optionally dropping stop words and punctuation.

    :param sent: input sentence
    :param remove_stopwords: whether to remove stopwords (sklearn's ENGLISH_STOP_WORDS)
    :param remove_punc: whether to remove punctuation
    :param mode: 'spacy' to use the module-level spaCy tokenizer, 'naive' to split on
        single spaces
    :return: list of lower-cased tokens
    :raises ValueError: if mode is neither 'spacy' nor 'naive'
    """
    if mode == 'spacy':
        tokens = [token.text for token in tokenizer(sent)]
    elif mode == 'naive':
        tokens = sent.split(" ")
    else:
        # Previously an unknown mode fell through and failed later with an
        # UnboundLocalError on `tokens`; fail early with a clear message instead.
        raise ValueError("unknown tokenizer mode %r; expected 'spacy' or 'naive'" % (mode,))

    # Lower-case BEFORE filtering: the stop-word list is lower case, so filtering the raw
    # tokens let capitalised stop words ("The", "And") through, only for them to be
    # returned as "the" / "and" afterwards.
    tokens = [token.lower() for token in tokens]

    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words.ENGLISH_STOP_WORDS]

    if remove_punc:
        # `punctuations` is a str, so this is a substring test: it drops single punctuation
        # marks and also the empty tokens that split(" ") produces for repeated spaces.
        tokens = [token for token in tokens if token not in punctuations]

    # returns lower case, scrubbed tokens
    return tokens


def extract_ngram_from_text(text, n, remove_stopwords=True, remove_punc=True, mode='spacy'):
    """
    Function that retrieves all k-grams (1 <= k <= n) from the input string
    @param text: raw string
    @param n: integer that tells the model to retrieve all k-gram where k<=n
    @param remove_stopwords: whether or not to remove stopwords from lib
    @param remove_punc: whether or not to remove punctuation from lib
    @param mode: tokenizer mode forwarded to tokenize ('spacy' or 'naive')
    @return ngram_counter: a counter that maps n-gram to its frequency
    @return all_ngrams: the list of extracted ngrams in document order; unigrams are
        plain strings, higher orders are tuples of strings
    """
    tokens = tokenize(text, remove_stopwords=remove_stopwords, remove_punc=remove_punc, mode=mode)
    num_tokens = len(tokens)
    all_ngrams = []
    # Visit EVERY start position. The previous bound, range(0, len(tokens) - n), stopped
    # n positions early: the last n-gram and all k-grams near the end of the text were
    # never emitted, and texts with at most n tokens produced nothing at all.
    for start in range(num_tokens):
        # Near the end of the text only the shorter k-grams still fit.
        longest = min(n, num_tokens - start)
        for k in range(1, longest + 1):
            all_ngrams.append(get_n_gram_at_position_i(k, start, tokens))
    ngram_counter = Counter(all_ngrams)
    return ngram_counter, all_ngrams
    
    
def construct_ngram_indexer(ngram_counter_list, topk):
    """
    Merge per-document ngram counters and index the topk most frequent ngrams.
    index 0 reserved for <pad>
    index 1 reserved for <unk>
    @param ngram_counter_list: list of counters
    @param topk, int: # of words to keep in the vocabulary - not counting pad/unk
    @return ngram2idx: a dictionary that maps ngram to an unique index
    @return final_count: Counter with the summed frequency of every ngram seen
    """
    # Sum the per-document counts into one corpus-wide counter.
    final_count = Counter()
    for doc_counter in tqdm(ngram_counter_list):
        final_count.update(doc_counter)

    # Most frequent first; regular entries are numbered after the two reserved slots.
    ranked_ngrams = [ngram for ngram, _ in final_count.most_common(topk)]
    ngram2idx = {PAD_TOKEN: PAD_IDX, UNK_TOKEN: UNK_IDX}
    for idx, ngram in enumerate(tqdm(ranked_ngrams), start=2):
        ngram2idx[ngram] = idx

    return ngram2idx, final_count  # length topk + 2


def get_n_gram_at_position_i(n, i, tokens):
    """
    Return the ngram of size n that starts at position i (0 indexed) of tokens.
    :param n: ngram size
    :param i: ith position
    :param tokens: full list of tokens
    :return: the bare token when n == 1, otherwise a tuple of n tokens
    """
    # A unigram is returned as the token itself, not as a 1-tuple.
    if n == 1:
        return tokens[i]
    # Element-wise indexing (rather than slicing) so that running past the end of
    # tokens raises IndexError instead of silently yielding a shorter tuple.
    return tuple(tokens[pos] for pos in range(i, i + n))


def token_to_index(tokens, ngram_indexer):
    """
    Map each ngram in tokens to its index, falling back to the <unk> index.
    index 0 reserved for <pad>
    index 1 reserved for <unk>
    @param tokens: list of ngram
    @param ngram_indexer: a dictionary that maps ngram to an unique index
    @return: list of indices, one per entry of tokens, in the same order
    """
    indices = []
    for ngram in tokens:
        if ngram in ngram_indexer:
            indices.append(ngram_indexer[ngram])
        else:
            # ngram is not part of the vocabulary
            indices.append(UNK_IDX)
    return indices


def process_text_dataset(dataset, n, topk=None, ngram_indexer=None,
                         remove_stopwords=True, remove_punc=True, mode='spacy'):
    """
    Top level function that encodes each datum into a list of ngram indices.
    Every datum is updated in place through set_ngram, set_tokens and set_token_idx.
    @param dataset: list of IMDBDatum
    @param n: n in "n-gram"
    @param topk: number of ngrams kept when a new indexer is built (unused when
        ngram_indexer is passed)
    @param ngram_indexer: a dictionary that maps ngram to an unique index; built from
        dataset when None
    @param remove_stopwords: forwarded to extract_ngram_from_text
    @param remove_punc: forwarded to extract_ngram_from_text
    @param mode: tokenizer mode forwarded to extract_ngram_from_text
    @return: (dataset, ngram_indexer, ngram_counter); ngram_counter is None when an
        existing ngram_indexer was passed in
    """
    # Step 1: extract the ngrams of every datum.
    print("extracting ngrams ...")
    for position in tnrange(len(dataset), desc='extract ngrams'):
        datum = dataset[position]
        ngram_counts, ngram_list = extract_ngram_from_text(
            datum.raw_text, n, remove_stopwords, remove_punc, mode)
        datum.set_ngram(ngram_counts)
        datum.set_tokens(ngram_list)

    # Step 2: reuse the caller's indexer, or build one from this dataset's counters.
    corpus_counter = None
    if ngram_indexer is not None:
        print("already have a passed ngram_indexer ...")
    else:
        print("constructing ngram_indexer ...")
        ngram_indexer, corpus_counter = construct_ngram_indexer(
            [datum.ngram for datum in dataset], topk)

    # Step 3: turn every datum's ngram list into a list of indices.
    print("setting each dataset's token indexes")
    for position in tnrange(len(dataset), desc='token to index'):
        datum = dataset[position]
        datum.set_token_idx(token_to_index(datum.tokens, ngram_indexer))
    return dataset, ngram_indexer, corpus_counter

In [141]:
# reload(ngrams)
# The indexer is built from the training set only (VOC_SIZE most frequent ngrams) and is
# then passed to the validation and test calls, which skip the construction step.
# This keeps the ngram -> index mapping identical across the three splits; ngrams that
# the training vocabulary does not contain are mapped to the <unk> index.
# Only the first call returns a counter; for the other two the third return value is None.
train_data, train_ngram_indexer, ngram_counter = process_text_dataset(train_set, NGRAM_SIZE, VOC_SIZE, 
                                                       mode=NGRAM_MODE)

validation_data, _, _ = process_text_dataset(validation_set, NGRAM_SIZE, ngram_indexer=train_ngram_indexer,
                                          mode=NGRAM_MODE)

test_data, _, _ = process_text_dataset(test_set, NGRAM_SIZE, ngram_indexer=train_ngram_indexer, 
                                    mode=NGRAM_MODE)

extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=20000), HTML(value='')))

constructing ngram_indexer ...


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=20000), HTML(value='')))

extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=5000), HTML(value='')))

already have a passed ngram_indexer ...
setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=5000), HTML(value='')))

extracting ngrams ...


HBox(children=(IntProgress(value=0, description='extract ngrams', max=25000), HTML(value='')))

already have a passed ngram_indexer ...
setting each dataset's token indexes


HBox(children=(IntProgress(value=0, description='token to index', max=25000), HTML(value='')))