In [None]:
#DS-GA 1011 HW1
#Author: Chris Rogers
#Code adapted from DS-GA 1011 Lab 3


#load all of the reviews
import os
import random
import nltk
from collections import Counter
import numpy as np
import string
import pickle

#start out the same every time
#NOTE: this only seeds Python's `random` module (used for the shuffles below);
#no torch seed is set in the visible cells, so model initialisation may still vary
random.seed(12345)

#should we run each function as we define it, mostly used for testing initially
run_inline = False

#assumes the aclImdb directory is present locally
root_dir = "aclImdb"
train_dir = "train"
test_dir = "test"
pos_dir = "pos"
neg_dir = "neg"

#split our "train" set 20000/5000 with validation
train_split = 20000

#optional scheduler object; run_model() calls scheduler.step() once per epoch
#when this is not None (presumably a torch learning-rate scheduler - TODO confirm)
scheduler = None

def loadDirectory(reviewList, directory):
    """
    Append the text of every file found in `directory` to `reviewList` (in place).

    Files are visited in sorted filename order. os.listdir() returns entries in
    arbitrary, platform-dependent order, so without sorting a caller that
    shuffles the result with a fixed seed would still not get the same
    ordering from one machine (or run) to the next.

    @param reviewList: list that receives one string per file; it is mutated
    @param directory: path of the directory whose files are read as UTF-8 text
    @return: None
    """
    for file_name in sorted(os.listdir(directory)):
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as review_file:
            reviewList.append(review_file.read())

            
# ---- training set: positive reviews are read first, negative ones appended after ----
train_data = []
loadDirectory(train_data, os.path.join(root_dir, train_dir, pos_dir))
# everything loaded so far is a positive review -> label 1
train_target_data = [1 for _ in train_data]
loadDirectory(train_data, os.path.join(root_dir, train_dir, neg_dir))
# whatever has been appended since then is negative -> label 0
train_target_data.extend([0] * (len(train_data) - len(train_target_data)))

# ---- test set: same layout ----
test_data = []
loadDirectory(test_data, os.path.join(root_dir, test_dir, pos_dir))
test_target_data = [1 for _ in test_data]
loadDirectory(test_data, os.path.join(root_dir, test_dir, neg_dir))
test_target_data.extend([0] * (len(test_data) - len(test_target_data)))

# shuffle reviews and labels together so each pair stays aligned
# (train is shuffled before test to keep the seeded random stream in the same order)
combined = list(zip(train_data, train_target_data))
random.shuffle(combined)
train_data[:], train_target_data[:] = zip(*combined)

combined = list(zip(test_data, test_target_data))
random.shuffle(combined)
test_data[:], test_target_data[:] = zip(*combined)

# first `train_split` shuffled examples train the model, the remainder validate it
train_reviews = train_data[:train_split]
val_reviews = train_data[train_split:]
train_targets = np.array(train_target_data[:train_split])
val_targets = np.array(train_target_data[train_split:])

# the test split is used as-is; aliases keep the naming parallel with train/val
test_reviews = test_data
test_targets = np.array(test_target_data)

# sanity check: every split must have as many targets as reviews
for _caption, _items in (
    ("train reviews:", train_reviews),
    ("train targets:", train_targets),
    ("val reviews:", val_reviews),
    ("val targets:", val_targets),
    ("test reviews:", test_reviews),
    ("test targets:", test_targets),
):
    print(_caption, len(_items))

            

In [None]:

#create a tokenized version of a string, with the specified pre-processing and n-grams
def tokenize(string_list, removePunct = True, removeNumbers=False, replaceNumbers=True, toLower=True, ngrams=1):
    """
    Tokenize every string in `string_list`, optionally appending n-grams.

    Pre-processing is applied in this order: lower-casing, digit removal,
    punctuation removal, nltk word tokenization, then replacement of all-digit
    tokens with the placeholder 'NMBR'.

    For ngrams > 1 the result for each string is its unigram tokens followed
    by the n-grams (as tuples) of size `ngrams`, then `ngrams`-1, ... down to 2.

    Fixes over the previous version:
      * the `ngrams` argument is no longer decremented, so every string gets
        n-grams (before, only the first string in the list did);
      * n-grams are always built from the plain unigram list (before, the
        unigram list was aliased and extended in place, so lower-order n-grams
        were built over a list that already contained higher-order tuples).

    @param string_list: iterable of strings to tokenize
    @param removePunct: strip all characters in string.punctuation
    @param removeNumbers: strip all characters in string.digits
    @param replaceNumbers: replace tokens made only of digits with 'NMBR'
    @param toLower: lower-case the text before tokenizing
    @param ngrams: largest n-gram size to include (1 = unigrams only)
    @return: list with one token list per input string
    """
    remove_digits = str.maketrans('', '', string.digits)
    remove_punct = str.maketrans('', '', string.punctuation)
    token_list = []

    for string_val in string_list:
        if toLower:
            string_val = string_val.lower()

        if removeNumbers:
            string_val = string_val.translate(remove_digits)

        if removePunct:
            string_val = string_val.translate(remove_punct)

        tokens = nltk.word_tokenize(string_val)

        if replaceNumbers:
            tokens = ['NMBR' if token.isdigit() else token for token in tokens]

        # copy the unigrams so extending the result never changes the list
        # the n-grams are generated from
        full_tokens = list(tokens)
        # iterate with a local counter; the `ngrams` argument must stay intact
        # for the remaining strings
        for n in range(ngrams, 1, -1):
            full_tokens.extend(nltk.ngrams(tokens, n))

        token_list.append(full_tokens)

    return token_list


In [None]:
#for the given set of pre-processing, go through and tokenize our three datasets
def tokenize_data(td_replaceNumbers=True, td_removeNumbers = False, td_toLower=True, td_removePunct=True, td_ngrams=1):
    """
    Tokenize the train, validation and test reviews with one shared set of options.

    Reads the module-level train_reviews, val_reviews and test_reviews.

    @return: (train_tokens, val_tokens, test_tokens, all_train_tokens) where
        all_train_tokens is every training token flattened into a single list
    """
    # the same pre-processing options are forwarded to every split
    options = dict(
        replaceNumbers=td_replaceNumbers,
        toLower=td_toLower,
        removePunct=td_removePunct,
        ngrams=td_ngrams,
        removeNumbers=td_removeNumbers,
    )

    train_tokens, val_tokens, test_tokens = [
        tokenize(reviews, **options)
        for reviews in (train_reviews, val_reviews, test_reviews)
    ]

    # flatten the per-review token lists of the training split
    all_train_tokens = [token for toklist in train_tokens for token in toklist]

    return train_tokens, val_tokens, test_tokens, all_train_tokens

if run_inline:
    train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data()
    # taking a look at the most frequent training tokens; printed explicitly
    # because a bare expression nested inside an `if` body is not auto-displayed
    # as cell output
    token_counter = Counter(all_train_tokens)
    print(token_counter.most_common(10))


In [None]:
#Here we'll use our max vocab size to build an indexed vocabulary

# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = 30000):
    """
    Build an indexed vocabulary from the `max_vocab_size` most frequent tokens.

    Indices 0 and 1 are reserved for '<pad>' and '<unk>'; real tokens start
    at index 2, ordered by descending frequency.

    @param all_tokens: iterable of hashable tokens (strings or n-gram tuples);
        may be empty, in which case only the two special tokens are returned
    @param max_vocab_size: maximum number of real tokens to keep
    @return: (token2id, id2token)
        token2id: dict mapping each token to its index
        id2token: list where id2token[i] is the token with index i
    """
    token_counter = Counter(all_tokens)
    # a comprehension (rather than zip(*pairs)) keeps this working when
    # all_tokens is empty
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = dict(zip(vocab, range(2, 2 + len(vocab))))
    # the special tokens are assigned last so they always win their reserved slots
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

if run_inline:
    token2id, id2token = build_vocab(all_train_tokens)

    # spot-check the vocabulary: pick one random index and confirm that the
    # id -> token and token -> id mappings agree with each other
    random_token_id = random.randint(0, len(id2token)-1)
    random_token = id2token[random_token_id]

    print ("Token id {} ; token {}".format(random_token_id, random_token))
    print ("Token {}; token id {}".format(random_token, token2id[random_token]))

In [None]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    """
    Map every token in every token list to its vocabulary index.

    Uses the module-level token2id mapping; tokens that are not in it are
    mapped to UNK_IDX.

    @param tokens_data: iterable of token lists
    @return: list of index lists, parallel to tokens_data
    """
    return [
        [token2id.get(token, UNK_IDX) for token in tokens]
        for tokens in tokens_data
    ]

if run_inline:
    # convert every split from tokens to vocabulary indices
    train_data_indices, val_data_indices, test_data_indices = [
        token2index_dataset(split_tokens)
        for split_tokens in (train_tokens, val_tokens, test_tokens)
    ]

    # double checking: the sizes should match the split sizes printed at load time
    print ("Train dataset size is {}".format(len(train_data_indices)))
    print ("Val dataset size is {}".format(len(val_data_indices)))
    print ("Test dataset size is {}".format(len(test_data_indices)))

    # raw text of the first review next to its index encoding
    print(train_data[0])
    print(train_data_indices[0])


In [None]:
#define our dataset for our investigations
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset

    Each item is a fixed-length vector of token indices (truncated or
    right-padded with 0), the un-padded length, and the target label.
    (The class name is historical; in this notebook it wraps the review data.)
    """

    def __init__(self, data_list, target_list, max_sentenance_length):
        """
        @param data_list: list of token-index lists, one per example
        @param target_list: sequence of targets, parallel to data_list
        @param max_sentenance_length: every example is truncated / padded to this length
        """
        self.data_list = data_list
        self.target_list = target_list
        self.max_sentenance_length = max_sentenance_length
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [token_idx, token_len, label] where token_idx is an int64
            numpy array of length max_sentenance_length, token_len is the
            number of real (non-padding) entries, and label is target_list[key]
        """
        token_idx = self.data_list[key][:self.max_sentenance_length]
        token_len = len(token_idx)

        # Force int64: np.array([]) would otherwise be float64 for an empty
        # example, giving that item a different dtype from all the others.
        token_idx = np.pad(np.array(token_idx, dtype=np.int64),
                           pad_width=(0, self.max_sentenance_length - token_len),
                           mode="constant", constant_values=0)  # 0 is the padding index

        label = self.target_list[key]
        return [token_idx, token_len, label]

        
if run_inline:
    BATCH_SIZE = 32
    MAX_SENTENANCE_LENGTH = 400

    # wrap each index-encoded split in a dataset that pads / truncates to a fixed length
    train_dataset = NewsGroupDataset(train_data_indices, train_targets, MAX_SENTENANCE_LENGTH)
    val_dataset = NewsGroupDataset(val_data_indices, val_targets, MAX_SENTENANCE_LENGTH)
    test_dataset = NewsGroupDataset(test_data_indices, test_targets, MAX_SENTENANCE_LENGTH)

    # train and validation batches are shuffled; test batches keep their order
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

#this defines our bag of words model, which uses a linear combination of embeddings
class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the embeddings of the tokens in
    an example and feeds the average through one linear layer with 2 outputs.
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0: index 0 embeds to a zero vector that is never updated,
        # so padded positions add nothing to the sum in forward()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,2)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, 2)
        """
        out = self.embed(data.long())
        out = torch.sum(out, dim=1)
        # Mean over the real tokens. The denominator is clamped to 1 so an
        # empty example (length 0) yields a zero vector instead of NaN.
        denom = length.view(-1, 1).float().clamp(min=1.0)
        out = out / denom

        # return logits
        return self.linear(out.float())



In [None]:

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        labels = labels.long()
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct = correct + predicted.long().eq(labels.view_as(predicted.long())).sum().item()
    return (100 * correct / total)


def run_model(verbose = True, num_epochs = 10):
    """
    Train the current model and record its accuracy after every epoch.

    Works on module-level state set up by the calling cell: model, optimizer,
    criterion, train_loader, val_loader and (optionally) scheduler.

    @param verbose: if True, print the validation accuracy every 300 training steps
    @param num_epochs: number of passes over train_loader
    @return: list with one [train_acc, val_acc] pair (percentages) per epoch
    """
    results = []
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(train_loader):
            # test_model() switches to eval mode, so re-enable train mode on every step
            model.train()
            data_batch, length_batch, label_batch = data, lengths, labels.long()
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
            # validate every 300 iterations
            if i > 0 and i % 300 == 0 and verbose:
                val_acc = test_model(val_loader, model)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                               epoch+1, num_epochs, i+1, len(train_loader), val_acc))
        # Step the scheduler only after this epoch's optimizer updates; stepping
        # it first would skip the initial value of the learning-rate schedule.
        if (scheduler):
            scheduler.step()
        train_acc = test_model(train_loader, model)
        val_acc = test_model(val_loader, model)
        results.append([train_acc,val_acc])
    return results

In [None]:
import matplotlib.pylab as plt
import matplotlib.ticker as ticker

#Display a chart of the data, given the label, a list of runs, and the number of 
#columns to display the legend.
#assumes data is in {parameters1:[[E1-trainacc,E1-valacc],[E2-trainacc,E2-valacc]...]}
def show_chart(plot_label, res_run, cols):
    """
    Plot per-epoch train and validation accuracy for a set of runs, save the
    figure to a file named after `plot_label`, and display it.

    @param plot_label: chart title; also used as the file name passed to savefig
    @param res_run: dict mapping a run label to a list of [train_acc, val_acc]
        pairs, one pair per epoch
    @param cols: number of columns used to lay out the legend
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    for key in res_run:
        arun = np.array(res_run[key])
        epochs = range(1, len(arun) + 1)
        # column 0 holds the train accuracies (dashed), column 1 the
        # validation accuracies (solid); str() lets non-string keys be plotted
        ax.plot(epochs, arun[:,0], label = str(key) + ' train', linestyle="--")
        ax.plot(epochs, arun[:,1], label = str(key) + ' validate', linestyle="-")

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    # pass a real boolean; the string 'on' only worked because it is truthy
    ax.grid(True)
    ax.set_title(plot_label)
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Percent Accuracy")
    # one legend, placed below the axes so it does not cover the curves
    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5,-0.2), ncol=cols)
    # the legend sits outside the axes, so it has to be included in the tight bounding box
    fig.savefig(plot_label, bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.show()

In [None]:
#save our data after each test, useful if we need to redo graphs or tables, etc.
import pickle

def save_data(run_name, res_run):
    """
    Pickle the results of a run so charts and tables can be rebuilt later.

    @param run_name: base name of the output file; '.pkl' is appended
    @param res_run: object to serialise (here: the run-results dictionary)
    """
    pickle_path = run_name + '.pkl'
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(res_run, out_file)


In [None]:
#load and build a new chart
def graph_pickle(run_name, slots):
    """
    Reload the results saved as '<run_name>.pkl' and chart them.

    @param run_name: base name of the pickle file; also used as the chart title
    @param slots: number of legend columns passed on to show_chart
    """
    with open(run_name + '.pkl', 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code from the file; only use
        # this on pickles this notebook wrote itself
        res_run = pickle.load(f)
    show_chart(run_name, res_run,slots)

# 'pre-processing.pkl' is only written by the pre-processing experiment further
# down, so on a fresh kernel it may not exist yet; skip the re-plot in that case
# rather than aborting the whole run.
if os.path.exists('pre-processing.pkl'):
    graph_pickle('pre-processing', 5)

In [None]:
#######general format of a run, used as my initial test and the template for subsequent runs ####
'''
#assume data has been loaded

#if we need to retokenize
train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=True, 
                                                                        td_toLower=True, 
                                                                        td_removePunct=True, 
                                                                        td_ngrams=1) 
                                                                        

#if we want to change vocab size
MAX_VOCAB_SIZE = 30000
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = MAX_VOCAB_SIZE)
train_data_indices = token2index_dataset(train_tokens)
val_data_indices = token2index_dataset(val_tokens)
test_data_indices = token2index_dataset(test_tokens)


#if we want to change sentance length or batch size
BATCH_SIZE = 32
MAX_SENTENANCE_LENGTH = 400
train_dataset = NewsGroupDataset(train_data_indices, train_targets, MAX_SENTENANCE_LENGTH)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = NewsGroupDataset(val_data_indices, val_targets, MAX_SENTENANCE_LENGTH)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, test_targets, MAX_SENTENANCE_LENGTH)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Criterion 
criterion = torch.nn.CrossEntropyLoss()


#some more hyper paramaters
emb_dim = 100
learning_rate = 0.001
num_epochs = 10 # number epoch to train

# Create model and optimizer, must redo for any change
model = BagOfWords(len(id2token), emb_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

run_model(verbose=False)

print ("After training for {} epochs".format(num_epochs), ": Val Acc {}".format(test_model(val_loader, model)))
#print ("Test Acc {}".format(test_model(test_loader, model)))
'''

In [None]:
#assume data has been loaded

#Experiment: iterate over various pre-processing parameters.
#Grid = number handling (nothing / replace / remove) x lower-casing x punctuation removal.
#A fresh BagOfWords model is trained for every combination, and the per-epoch
#[train_acc, val_acc] pairs returned by run_model() are stored in run_results.
run_results = {}

for number_scheme in ["nothing", "replace", "remove"]:
    for lower in [True, False]:
        for removePunct in [True, False]:
    
                #translate the number scheme into the two tokenizer flags (at most one is True)
                replaceNumbers = False
                removeNumbers = False

                if number_scheme == "replace":
                    replaceNumbers = True
                elif number_scheme == "remove":
                    removeNumbers = True
                    
                    
                #if we need to retokenize
                train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=replaceNumbers, 
                                                                                        td_removeNumbers=removeNumbers, 
                                                                                        td_toLower=lower, 
                                                                                        td_removePunct=removePunct, 
                                                                                        td_ngrams=1) 
                                                                                        
                #if we want to change vocab size
                #token2index_dataset() reads the module-level token2id, so the vocabulary
                #must be rebuilt before the splits are converted to indices
                MAX_VOCAB_SIZE = 20000
                token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = MAX_VOCAB_SIZE)
                train_data_indices = token2index_dataset(train_tokens)
                val_data_indices = token2index_dataset(val_tokens)
                test_data_indices = token2index_dataset(test_tokens)


                #if we want to change sentence length or batch size
                BATCH_SIZE = 32
                MAX_SENTENANCE_LENGTH = 200
                train_dataset = NewsGroupDataset(train_data_indices, train_targets, MAX_SENTENANCE_LENGTH)
                train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

                val_dataset = NewsGroupDataset(val_data_indices, val_targets, MAX_SENTENANCE_LENGTH)
                val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

                #the test loader is built but not evaluated anywhere in this cell
                test_dataset = NewsGroupDataset(test_data_indices, test_targets, MAX_SENTENANCE_LENGTH)
                test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

                # Criterion 
                criterion = torch.nn.CrossEntropyLoss()


                #some more hyper parameters
                #NOTE(review): num_epochs is set here but not passed to run_model(), which
                #falls back to its own default of 10 (the same value)
                emb_dim = 50
                learning_rate = 0.001
                num_epochs = 10 # number epoch to train

                # Create model and optimizer, must redo for any change
                model = BagOfWords(len(id2token), emb_dim)
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
                #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

                #run_model() trains the module-level model using the loaders, criterion and optimizer set above
                res = run_model(verbose=False)

                #build the legend / dictionary key for this combination
                #NOTE(review): the parts are concatenated without a separator between the case and
                #punctuation pieces (e.g. "lowercaseno_punctuation"), and the two number prefixes
                #end in different characters ("-" vs ":")
                key = ""
                key += "remove_numbers-" if removeNumbers else ""
                key += "replace_numbers:" if replaceNumbers else ""
                key += "lowercase" if lower else "mix_case"
                key += "no_punctuation" if removePunct else "include_punctuation"

                run_results[key] = res
                
                #final accuracies for this combination (re-evaluates the trained model on both splits)
                print("removeNumbers:", removeNumbers, " , replaceNumbers:", replaceNumbers, " , lower:", lower, " , removePunct:", removePunct, " , train Acc {}".format(test_model(train_loader, model)), " , Val Acc {}".format(test_model(val_loader, model)), sep='')
    
#persist the results, then chart them with a 4-column legend
save_data("pre-processing", run_results)
show_chart("pre-processing",run_results,4)


MAX_VOCAB_SIZE = 20000
BATCH_SIZE = 32
MAX_SENTENANCE_LENGTH = 200
emb_dim = 50
learning_rate = 0.001
num_epochs = 10 # number epoch to train

removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True , train Acc 97.135 , Val Acc 87.48
removeNumbers:False , replaceNumbers:False , lower:True , removePunct:False , train Acc 96.195 , Val Acc 87.2
removeNumbers:False , replaceNumbers:False , lower:False , removePunct:True , train Acc 97.085 , Val Acc 87.54
removeNumbers:False , replaceNumbers:False , lower:False , removePunct:False , train Acc 96.545 , Val Acc 86.9
removeNumbers:False , replaceNumbers:True , lower:True , removePunct:True , train Acc 96.945 , Val Acc 87.52
removeNumbers:False , replaceNumbers:True , lower:True , removePunct:False , train Acc 96.2 , Val Acc 87.24
removeNumbers:False , replaceNumbers:True , lower:False , removePunct:True , train Acc 97.13 , Val Acc 87.04
removeNumbers:False , replaceNumbers:True , lower:False , removePunct:False , train Acc 96.46 , Val Acc 86.64
removeNumbers:True , replaceNumbers:False , lower:True , removePunct:True , train Acc 96.795 , Val Acc 87.48
removeNumbers:True , replaceNumbers:False , lower:True , removePunct:False , train Acc 95.905 , Val Acc 86.94
removeNumbers:True , replaceNumbers:False , lower:False , removePunct:True , train Acc 96.87 , Val Acc 87.32
removeNumbers:True , replaceNumbers:False , lower:False , removePunct:False , train Acc 96.195 , Val Acc 87.12


In [None]:
#assume data has been loaded

#not a wide range of differences for the tokenization, we'll pick the best on the last run 
#-removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True
run_results = {}

#Experiment: iterate over the number of n-grams and vocab size.
#Tokenization only depends on the n-gram size, so it is done once per outer
#iteration; the vocabulary, datasets and model are rebuilt for every vocab size.
for grams in (1,2,3,4):
                    
                    
    #if we need to retokenize
    train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=False, 
                                                                            td_removeNumbers=False, 
                                                                            td_toLower=True, 
                                                                            td_removePunct=True, 
                                                                            td_ngrams=grams)
    
    for vocab in (10000, 20000, 30000, 40000, 50000, 100000, 200000):
    
        #token2index_dataset() reads the module-level token2id, so the vocabulary
        #must be rebuilt before the splits are converted to indices
        token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = vocab)
        train_data_indices = token2index_dataset(train_tokens)
        val_data_indices = token2index_dataset(val_tokens)
        test_data_indices = token2index_dataset(test_tokens)


        #if we want to change sentence length or batch size
        BATCH_SIZE = 32
        MAX_SENTENANCE_LENGTH = 200
        train_dataset = NewsGroupDataset(train_data_indices, train_targets, MAX_SENTENANCE_LENGTH)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

        val_dataset = NewsGroupDataset(val_data_indices, val_targets, MAX_SENTENANCE_LENGTH)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

        #the test loader is built but not evaluated anywhere in this cell
        test_dataset = NewsGroupDataset(test_data_indices, test_targets, MAX_SENTENANCE_LENGTH)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

        # Criterion 
        criterion = torch.nn.CrossEntropyLoss()


        #some more hyper parameters
        #NOTE(review): num_epochs is set here but not passed to run_model(), which
        #falls back to its own default of 10 (the same value)
        emb_dim = 50
        learning_rate = 0.001
        num_epochs = 10 # number epoch to train

        # Create model and optimizer, must redo for any change
        model = BagOfWords(len(id2token), emb_dim)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        #run_model() trains the module-level model and returns per-epoch [train_acc, val_acc]
        arun = run_model(verbose=False)
        run_label = str(grams)+ "-gram," + str(vocab) + "vocab size"
        
        run_results[run_label] = arun
        
        #final accuracies for this combination (re-evaluates the trained model on both splits)
        print("n-gram size:", grams, " , vocab size:" , vocab, " , train Acc {}".format(test_model(train_loader, model)), " , Val Acc {}".format(test_model(val_loader, model)), sep='')

#persist the results, then chart them with a 4-column legend
save_data("tokenization-vocab", run_results)
show_chart("tokenization-vocab", run_results,4)


n-gram size:1 , vocab size:10000 , train Acc 94.345 , Val Acc 87.06
n-gram size:1 , vocab size:20000 , train Acc 97.22 , Val Acc 87.4
n-gram size:1 , vocab size:30000 , train Acc 97.965 , Val Acc 87.38
n-gram size:1 , vocab size:40000 , train Acc 98.415 , Val Acc 87.34
n-gram size:1 , vocab size:50000 , train Acc 98.645 , Val Acc 87.2
n-gram size:1 , vocab size:100000 , train Acc 99.075 , Val Acc 87.56
n-gram size:2 , vocab size:10000 , train Acc 92.8 , Val Acc 86.5
n-gram size:2 , vocab size:20000 , train Acc 95.58 , Val Acc 86.4
n-gram size:2 , vocab size:30000 , train Acc 96.83 , Val Acc 87.44
n-gram size:2 , vocab size:40000 , train Acc 97.915 , Val Acc 87.56
n-gram size:2 , vocab size:50000 , train Acc 98.35 , Val Acc 87.32
n-gram size:2 , vocab size:100000 , train Acc 99.285 , Val Acc 87.44
n-gram size:3 , vocab size:10000 , train Acc 93.235 , Val Acc 86.56
n-gram size:3 , vocab size:20000 , train Acc 95.395 , Val Acc 87.72
n-gram size:3 , vocab size:30000 , train Acc 96.75 , Val Acc 87.32
n-gram size:3 , vocab size:40000 , train Acc 97.505 , Val Acc 87.54
n-gram size:3 , vocab size:50000 , train Acc 97.845 , Val Acc 87.58
n-gram size:3 , vocab size:100000 , train Acc 98.725 , Val Acc 88.0
n-gram size:4 , vocab size:10000 , train Acc 93.345 , Val Acc 87.12
n-gram size:4 , vocab size:20000 , train Acc 95.52 , Val Acc 87.46
n-gram size:4 , vocab size:30000 , train Acc 96.65 , Val Acc 87.06
n-gram size:4 , vocab size:40000 , train Acc 97.025 , Val Acc 87.5
n-gram size:4 , vocab size:50000 , train Acc 97.54 , Val Acc 87.56
n-gram size:4 , vocab size:100000 , train Acc 98.12 , Val Acc 87.78

In [None]:
#Settings carried over from the earlier experiments:
#-removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True
# n-gram size:3 , vocab size:40000 
                    
#Experiment: iterate over different max sentence length values
    
#if we need to retokenize
train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=False, 
                                                                        td_removeNumbers=False, 
                                                                        td_toLower=True, 
                                                                        td_removePunct=True, 
                                                                        td_ngrams=3)


#tokens and vocabulary do not depend on the sentence length, so they are built once,
#outside the loop
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = 40000)
train_data_indices = token2index_dataset(train_tokens)
val_data_indices = token2index_dataset(val_tokens)
test_data_indices = token2index_dataset(test_tokens)

run_results = {}

for slength in [50,100,150,200,250,300,350,400,450,500]:
    BATCH_SIZE = 32
    
    #the datasets do the truncation / padding, so they are rebuilt for every length
    train_dataset = NewsGroupDataset(train_data_indices, train_targets, slength)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    val_dataset = NewsGroupDataset(val_data_indices, val_targets, slength)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

    #the test loader is built but not evaluated anywhere in this cell
    test_dataset = NewsGroupDataset(test_data_indices, test_targets, slength)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Criterion 
    criterion = torch.nn.CrossEntropyLoss()


    #some more hyper parameters
    #NOTE(review): num_epochs is set here but not passed to run_model(), which
    #falls back to its own default of 10 (the same value)
    emb_dim = 50
    learning_rate = 0.001
    num_epochs = 10 # number epoch to train

    # Create model and optimizer, must redo for any change
    model = BagOfWords(len(id2token), emb_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    #run_model() trains the module-level model and returns per-epoch [train_acc, val_acc]
    arun = run_model(verbose=False)

    reslabel = str(slength) + " max sentence"
    
    run_results[reslabel] = arun
    
    #final accuracies for this length (re-evaluates the trained model on both splits)
    print("Max Sentence Length:", slength, " , train Acc {}".format(test_model(train_loader, model)), " , Val Acc {}".format(test_model(val_loader, model)), sep='')

#persist the results, then chart them with a 2-column legend
save_data("sentence-length", run_results)
show_chart("sentence-length", run_results,2)
    

Max Sentence Length:50 , train Acc 94.67 , Val Acc 78.34
Max Sentence Length:100 , train Acc 97.075 , Val Acc 83.88
Max Sentence Length:150 , train Acc 97.97 , Val Acc 86.76
Max Sentence Length:200 , train Acc 98.51 , Val Acc 87.58
Max Sentence Length:250 , train Acc 98.925 , Val Acc 88.42
Max Sentence Length:300 , train Acc 98.965 , Val Acc 89.06
Max Sentence Length:350 , train Acc 99.16 , Val Acc 89.24
Max Sentence Length:400 , train Acc 99.135 , Val Acc 89.34
Max Sentence Length:450 , train Acc 99.185 , Val Acc 89.72
Max Sentence Length:500 , train Acc 99.135 , Val Acc 89.6

In [None]:
#Experiment: iterate over embedding sizes.
#Settings carried over from the earlier experiments:
#-removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True
# n-gram size:3 , vocab size:40000 
# Max Sentence Length:500 
#NOTE: the re-tokenization template in this cell is disabled (it is a bare string),
#so the loop relies on train_data_indices / val_data_indices / test_data_indices and
#id2token having been defined by an earlier cell in the same kernel session.

run_results = {}
            
'''
#if we need to retokenize
train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=False, 
                                                                        td_removeNumbers=False, 
                                                                         td_toLower=True, 
                                                                        td_removePunct=True, 
                                                                        td_ngrams=3)


token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = 40000)
train_data_indices = token2index_dataset(train_tokens)
val_data_indices = token2index_dataset(val_tokens)
test_data_indices = token2index_dataset(test_tokens)
'''

#fixed settings for this sweep; only the embedding size varies
slength = 500

batch_size = 32

for emb_dim in (10,25,50,100,150,200):

       
    #NOTE(review): the datasets and loaders do not depend on emb_dim; they are
    #rebuilt on every iteration anyway
    train_dataset = NewsGroupDataset(train_data_indices, train_targets, slength)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size= batch_size, shuffle=True)

    val_dataset = NewsGroupDataset(val_data_indices, val_targets, slength)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size= batch_size, shuffle=True)

    #the test loader is built but not evaluated anywhere in this cell
    test_dataset = NewsGroupDataset(test_data_indices, test_targets, slength)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size= batch_size, shuffle=False)

    # Criterion 
    criterion = torch.nn.CrossEntropyLoss()


    #some more hyper parameters
    #NOTE(review): num_epochs is set here but not passed to run_model(), which
    #falls back to its own default of 10 (the same value)
    learning_rate = 0.001
    num_epochs = 10 # number epoch to train

    # Create model and optimizer, must redo for any change
    model = BagOfWords(len(id2token), emb_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    #run_model() trains the module-level model and returns per-epoch [train_acc, val_acc]
    arun = run_model(verbose=False)

    res_label = str( emb_dim ) + " embedding size"
    run_results[res_label] = arun
    
    #final accuracies for this embedding size (re-evaluates the trained model on both splits)
    print("Embedding Size:",  emb_dim, " , train Acc {}".format(test_model(train_loader, model)), " , Val Acc {}".format(test_model(val_loader, model)), sep='')

#persist the results, then chart them with a 2-column legend
save_data("embeddingsize", run_results)
show_chart("embeddingsize", run_results,2)


In [None]:
#Experiment: learning rate x number of epochs, using the Adam optimizer.
#Settings carried over from the earlier experiments:
#-removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True
# n-gram size:3 , vocab size:40000 
# Max Sentence Length:500 (matches slength below)

#adam

run_results = {}
            

#if we need to retokenize
train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=False, 
                                                                        td_removeNumbers=False, 
                                                                         td_toLower=True, 
                                                                        td_removePunct=True, 
                                                                        td_ngrams=3)


#token2index_dataset() reads the module-level token2id, so the vocabulary
#must be built before the splits are converted to indices
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = 40000)
train_data_indices = token2index_dataset(train_tokens)
val_data_indices = token2index_dataset(val_tokens)
test_data_indices = token2index_dataset(test_tokens)

#fixed settings for this sweep
slength = 500
batch_size = 32
emb_dim = 100
 
#data, loaders and loss do not change between runs, so they are built once
train_dataset = NewsGroupDataset(train_data_indices, train_targets, slength)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size= batch_size, shuffle=True)

val_dataset = NewsGroupDataset(val_data_indices, val_targets, slength)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size= batch_size, shuffle=True)

#the test loader is built but not evaluated anywhere in this cell
test_dataset = NewsGroupDataset(test_data_indices, test_targets, slength)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size= batch_size, shuffle=False)

# Criterion 
criterion = torch.nn.CrossEntropyLoss()

for learning_rate in (0.5, 0.1, 0.05, 0.01, 0.005, 0.001):
    for num_epochs in (2,5,10,15):

        #some more hyper parameters

        # Create model and optimizer, must redo for any change
        #(every learning-rate / epoch combination starts from a freshly initialised model)
        model = BagOfWords(len(id2token), emb_dim)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        #run_model() trains the module-level model and returns per-epoch [train_acc, val_acc]
        arun = run_model(verbose=False, num_epochs = num_epochs)

        res_label = str( learning_rate ) + " learning rate and " + str(num_epochs) + " epochs" 
        run_results[res_label] = arun

        #final accuracies for this combination (re-evaluates the trained model on both splits)
        print("Learning rate:",  learning_rate, " epochs:", num_epochs, " , train Acc {}".format(test_model(train_loader, model)), " , Val Acc {}".format(test_model(val_loader, model)), sep='')

#persist the results, then chart them with a 6-column legend
save_data("learning-epoch", run_results)
show_chart("learning-epoch", run_results,6)


In [None]:
#-removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True
# n-gram size:3 , vocab size:40000 
# Max Sentence Length:500 

#SGD (constant learning rate)


run_results = {}


#if we need to retokenize
train_tokens, val_tokens, test_tokens, all_train_tokens = tokenize_data(td_replaceNumbers=False, 
                                                                        td_removeNumbers=False, 
                                                                         td_toLower=True, 
                                                                        td_removePunct=True, 
                                                                        td_ngrams=3)


# Build the vocabulary from the training tokens (max_vocab_size = 40000) and
# convert every split to index form.
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = 40000)
train_data_indices = token2index_dataset(train_tokens)
val_data_indices = token2index_dataset(val_tokens)
test_data_indices = token2index_dataset(test_tokens)

# Fixed settings shared by every configuration in this sweep.
slength = 500      # passed to NewsGroupDataset as its third argument
batch_size = 32
emb_dim = 100      # passed to BagOfWords as its second argument

train_dataset = NewsGroupDataset(train_data_indices, train_targets, slength)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size= batch_size, shuffle=True)

val_dataset = NewsGroupDataset(val_data_indices, val_targets, slength)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size= batch_size, shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, test_targets, slength)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size= batch_size, shuffle=False)

# Criterion 
criterion = torch.nn.CrossEntropyLoss()

# Grid search: SGD learning rate x number of epochs.
# Each run's result is stored in run_results under a readable label.
for learning_rate in (0.5, 0.1, 0.05, 0.01):
    for num_epochs in (10,100,200,400):

        # A fresh model and optimizer are created for every configuration so
        # that nothing carries over from the previous run.
        model = BagOfWords(len(id2token), emb_dim)
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        # This sweep uses a constant learning rate. `scheduler` is a
        # module-level name that another cell reassigns; reset it here so a
        # stale scheduler (bound to an old optimizer) cannot leak in when cells
        # are re-run out of order.
        # NOTE(review): run_model is defined outside this section and
        # presumably reads the module-level scheduler - confirm.
        scheduler = None

        arun = run_model(verbose=False, num_epochs = num_epochs)

        res_label = f"{learning_rate} learning rate and {num_epochs} epochs"
        run_results[res_label] = arun

        # Evaluate once per split, then report on a single line.
        train_acc = test_model(train_loader, model)
        val_acc = test_model(val_loader, model)
        print(f"Learning rate:{learning_rate} epochs:{num_epochs} , train Acc {train_acc} , Val Acc {val_acc}")

save_data("sgd-learning-epoch", run_results)
show_chart("sgd-learning-epoch", run_results,6)


In [None]:
#-removeNumbers:False , replaceNumbers:False , lower:True , removePunct:True
# n-gram size:3 , vocab size:40000 
# Max Sentence Length:500 

#SGD with a per-epoch learning-rate decay (initial lr * 0.97 ** epoch)


run_results = {}

# Tokenization and vocabulary are NOT rebuilt in this cell: it relies on
# train_data_indices / val_data_indices / test_data_indices and id2token that
# already exist in the kernel. To rebuild them, re-run tokenize_data(...,
# td_ngrams=3), build_vocab(all_train_tokens, max_vocab_size = 40000) and
# token2index_dataset(...) for each split before this point.

# Fixed settings shared by every configuration in this sweep.
slength = 500      # passed to NewsGroupDataset as its third argument
batch_size = 32
emb_dim = 100      # passed to BagOfWords as its second argument

train_dataset = NewsGroupDataset(train_data_indices, train_targets, slength)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size= batch_size, shuffle=True)

val_dataset = NewsGroupDataset(val_data_indices, val_targets, slength)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size= batch_size, shuffle=True)

test_dataset = NewsGroupDataset(test_data_indices, test_targets, slength)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size= batch_size, shuffle=False)

# Criterion 
criterion = torch.nn.CrossEntropyLoss()


# Grid search: initial SGD learning rate x number of epochs, with LR decay.
# Each run's result is stored in run_results under a readable label.
for learning_rate in (1, 0.5 ):
    for num_epochs in (2, 10, 100, 200): 

        # A fresh model, optimizer and scheduler are created for every
        # configuration so that nothing carries over from the previous run.
        model = BagOfWords(len(id2token), emb_dim)
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        # LambdaLR multiplies the initial lr by the returned factor. A single
        # function (rather than a one-element list) applies to every param
        # group, so this does not depend on the optimizer having exactly one.
        # NOTE(review): run_model is defined outside this section and
        # presumably steps the module-level scheduler once per epoch - confirm.
        lambda1 = lambda epoch: 0.97 ** epoch
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

        arun = run_model(verbose=False, num_epochs = num_epochs)

        res_label = f"{learning_rate} learning rate and {num_epochs} epochs"
        run_results[res_label] = arun

        # Evaluate once per split, then report on a single line.
        train_acc = test_model(train_loader, model)
        val_acc = test_model(val_loader, model)
        print(f"Learning rate:{learning_rate} epochs:{num_epochs} , train Acc {train_acc} , Val Acc {val_acc}")

# Clear the module-level scheduler so that cells run after this one do not
# pick up a scheduler bound to this sweep's last optimizer.
scheduler = None

save_data("sgd-linear-learning-epoch", run_results)
show_chart("sgd-linear-learning-epoch", run_results,6)


In [None]:

#show some specific results to look at examples
#shuffle is disabled so that loader order matches val_reviews / val_targets
# NOTE(review): MAX_SENTENANCE_LENGTH and BATCH_SIZE are defined outside this
# section, while the sweeps above build their datasets with slength /
# batch_size - confirm the values agree with how `model` was trained.
val_loader2 = torch.utils.data.DataLoader(dataset=NewsGroupDataset(val_data_indices, val_targets, MAX_SENTENANCE_LENGTH), batch_size=BATCH_SIZE, shuffle=False)


# `model` is whatever model the most recently executed training cell left behind.
model.eval()
runs = 0
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    for data, lengths, labels in val_loader2:
        labels = labels.long()
        outputs = F.softmax(model(data, lengths), dim=1)
        # index of the highest-probability class for each row, shape (batch, 1)
        predicted = outputs.max(1, keepdim=True)[1]

        # `i` is the position INSIDE the current batch, not in the full validation set.
        for i in range(0,len(labels)):
            print( i,predicted[i],"==" if predicted[i] == labels[i] else "!", labels[i])

        # Stop after three batches (runs == 0, 1, 2). After the loop,
        # `predicted` and `labels` hold the last batch that was processed.
        if runs > 1:
            break
        runs += 1


In [None]:
#view a review
# `predicted` and `labels` still hold the LAST batch processed by the loop in
# the previous cell, so `i` is a position inside that batch. That loop breaks
# before incrementing `runs`, so on exit `runs` is the index of that batch
# (assumes the loop ended via `break`, i.e. the loader has at least 3 batches).
# Convert to a position in the unshuffled validation set before looking up the
# review, otherwise the prediction and the text belong to different examples.
i = 27
review_idx = runs * val_loader2.batch_size + i
print(i,predicted[i],"==" if predicted[i] == labels[i] else "!", labels[i])
print(val_targets[review_idx], val_reviews[review_idx])
