In [1]:
import argparse
import time
import math
import numpy as np
import torch
import torch.nn as nn

import data
import model

from utils import batchify, get_batch, repackage_hidden

parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
parser.add_argument('--data', type=str, default='data/clean_wiki/',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (LSTM, QRNN, GRU)')
parser.add_argument('--emsize', type=int, default=400,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=1150,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=3,
                    help='number of layers')
parser.add_argument('--lr', type=float, default=30,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=8000,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=80, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=70,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.4,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--dropouth', type=float, default=0.3,
                    help='dropout for rnn layers (0 = no dropout)')
parser.add_argument('--dropouti', type=float, default=0.65,
                    help='dropout for input embedding layers (0 = no dropout)')
parser.add_argument('--dropoute', type=float, default=0.1,
                    help='dropout to remove words from embedding layer (0 = no dropout)')
parser.add_argument('--wdrop', type=float, default=0.5,
                    help='amount of weight dropout to apply to the RNN hidden to hidden matrix')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--nonmono', type=int, default=5,
                    help='random seed')
parser.add_argument('--cuda', action='store_false', default=False,
                    help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')
randomhash = ''.join(str(time.time()).split('.'))
parser.add_argument('--save', type=str,  default=randomhash+'.pt',
                    help='path to save the final model')
parser.add_argument('--alpha', type=float, default=2,
                    help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
parser.add_argument('--beta', type=float, default=1,
                    help='beta slowness regularization applied on RNN activiation (beta = 0 means no regularization)')
parser.add_argument('--wdecay', type=float, default=1.2e-6,
                    help='weight decay applied to all weights')
parser.add_argument('--resume', type=str,  default='',
                    help='path of model to resume')
parser.add_argument('--optimizer', type=str,  default='sgd',
                    help='optimizer to use (sgd, adam)')
parser.add_argument('--when', nargs="+", type=int, default=[-1],
                    help='When (which epochs) to divide the learning rate by 10 - accepts multiple')
parser.add_argument('-f')
args = parser.parse_args()
args.tied = True

# Set the random seed manually for reproducibility.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)



In [87]:
import os
import torch

from collections import Counter


class Dictionary(object):
    """Bidirectional word <-> integer-id vocabulary with occurrence counts.

    Attributes:
        word2idx: dict mapping each known word to its integer id.
        idx2word: list whose position ``i`` holds the word with id ``i``.
        counter:  Counter keyed by id, counting calls to ``add_word``.
        total:    total number of ``add_word`` calls so far.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = Counter()
        self.total = 0

    def add_word(self, word):
        """Register one occurrence of ``word`` and return its id.

        Unknown words receive the next free id (the current vocabulary size).
        """
        try:
            index = self.word2idx[word]
        except KeyError:
            # First sighting: the new id is the position it gets in idx2word.
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        self.counter[index] += 1
        self.total += 1
        return index

    def __len__(self):
        """Number of distinct words registered."""
        return len(self.idx2word)

    
class Corpus(object):
    """Tokenized train/valid/test splits sharing one vocabulary.

    ``path`` is a directory containing ``train.txt``, ``valid.txt`` and
    ``test.txt``. Each split is stored as a LongTensor produced by
    ``tokenize`` (one padded row per line by default).

    Attributes:
        dictionary: the shared vocabulary (a ``Dictionary``).
        train, valid, test: encoded splits.
    """

    # Token reserved for padding. Padding used to be written as id 0, which is
    # also the id of the first real word added to the vocabulary, so padded
    # positions could not be told apart from that word.
    PAD_TOKEN = '<pad>'

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def _pad_index(self):
        """Return the id of PAD_TOKEN, registering it on first use.

        The token is inserted directly (not via ``add_word``) so that it does
        not contribute to the vocabulary's occurrence statistics.
        """
        vocab = self.dictionary
        if self.PAD_TOKEN not in vocab.word2idx:
            vocab.idx2word.append(self.PAD_TOKEN)
            vocab.word2idx[self.PAD_TOKEN] = len(vocab.idx2word) - 1
        return vocab.word2idx[self.PAD_TOKEN]

    def tokenize(self, path, keep_sentence_boundaries=True):
        """Tokenizes a text file.

        Every line is split on whitespace and terminated with ``'<eos>'``;
        each word is added to ``self.dictionary``.

        Args:
            path: text file to encode.
            keep_sentence_boundaries: when True (default) return a 2-D
                LongTensor with one row per line, right-padded with the
                ``PAD_TOKEN`` id to the longest line of *this* file. When
                False return a 1-D LongTensor of all token ids in file order.

        Returns:
            torch.LongTensor of shape ``(num_lines, max_line_length)`` or
            ``(num_tokens,)``. An empty file yields an empty tensor
            (``(0, 0)`` or ``(0,)``) instead of raising.
        """
        assert os.path.exists(path)

        # Reserve the padding id before any word of this file is added, so a
        # fresh vocabulary always gets padding at id 0.
        pad_id = self._pad_index() if keep_sentence_boundaries else None

        # Single pass: add_word both registers the word and returns its id.
        # utf-8 is requested explicitly so decoding does not depend on the
        # platform's default locale.
        vocab = self.dictionary
        sentences = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                words = line.split() + ['<eos>']
                sentences.append(torch.LongTensor([vocab.add_word(word) for word in words]))

        if not keep_sentence_boundaries:
            if not sentences:
                return torch.zeros(0, dtype=torch.long)
            return torch.cat(sentences)

        max_length = max((sentence.size(0) for sentence in sentences), default=0)
        # Start from an all-padding matrix and copy each sentence into its row.
        ids = torch.full((len(sentences), max_length), pad_id, dtype=torch.long)
        for row, sentence in enumerate(sentences):
            ids[row, :sentence.size(0)] = sentence
        return ids

In [88]:
# Build the corpus with the Corpus class defined in this notebook; the data
# directory is hard-coded here rather than taken from args.data.
corpus = Corpus("../data/clean_wiki")

In [89]:
# With sentence boundaries kept, tokenize returns one row per line of
# valid.txt, padded to the longest line (including its '<eos>').
print(corpus.valid.shape)

torch.Size([15500, 67])


In [84]:
# NOTE(review): this cell carries execution count 84, i.e. it ran before the
# Corpus definition above (In [87]). Corpus.tokenize now already returns the
# stacked tensor, so this extra torch.stack looks like a leftover — confirm
# and remove.
torch.stack(corpus.valid).shape

torch.Size([15500, 67])

In [93]:
def batchify(data, bsz, args):
    """Reshape ``data`` into ``bsz`` parallel columns.

    Leading-dimension entries that do not fill a whole multiple of ``bsz``
    are dropped. The remainder is viewed as ``bsz`` rows, transposed so that
    each column is one contiguous slice of the input, and moved to the GPU
    when ``args.cuda`` is truthy.

    Args:
        data: tensor to split along dimension 0.
        bsz:  number of columns in the result.
        args: object exposing a boolean ``cuda`` attribute.

    Returns:
        Contiguous tensor with ``bsz`` columns.
    """
    # Largest multiple of bsz that fits into the leading dimension.
    usable = (data.size(0) // bsz) * bsz
    trimmed = data.narrow(0, 0, usable)
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.cuda() if args.cuda else columns

def sentence_based_batchify(data, args):
    """Return sentence-level data unchanged in layout, on the configured device.

    Unlike ``batchify`` no reshaping is done: each row of ``data`` stays one
    item. For consistency with ``batchify`` the data is moved to the GPU when
    ``args.cuda`` is truthy (previously the flag was ignored, leaving the
    data on the CPU).

    Args:
        data: the already sentence-aligned data.
        args: object that may expose a boolean ``cuda`` attribute; a missing
            attribute is treated as False.

    Returns:
        ``data`` itself, or the result of ``data.cuda()`` when CUDA is requested.
    """
    if getattr(args, 'cuda', False):
        data = data.cuda()
    return data

In [4]:
###############################################################################
# Load data
###############################################################################

def model_save(fn):
    """Serialize the global ``model``, ``criterion`` and ``optimizer`` to ``fn``.

    The three objects are written together as one list via ``torch.save``.
    """
    with open(fn, 'wb') as handle:
        checkpoint = [model, criterion, optimizer]
        torch.save(checkpoint, handle)

def model_load(fn):
    """Restore the global ``model``, ``criterion`` and ``optimizer`` from ``fn``.

    Expects a file holding a three-element sequence in that order, as written
    by ``model_save``. ``torch.load`` unpickles the file, so only load
    checkpoints from a trusted source.
    """
    global model, criterion, optimizer
    with open(fn, 'rb') as handle:
        restored = torch.load(handle)
    model, criterion, optimizer = restored

import os
import hashlib
# The cache file name depends only on the data path (md5 of args.data).
# NOTE(review): a cached corpus is therefore reused even if the tokenization
# code changed since it was written — delete the file to force a rebuild.
fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
if not os.path.exists(fn):
    # No cache yet: build the corpus via the `data` module and persist it.
    print('Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)
else:
    print('Loading cached dataset...')
    corpus = torch.load(fn)

# Batch sizes used for validation and testing respectively.
eval_batch_size, test_batch_size = 10, 1

val_data = batchify(corpus.valid, eval_batch_size, args)


Loading cached dataset...


In [7]:
# Inspect the validation split of the corpus currently bound to `corpus`.
# NOTE(review): the recorded output is 1-D, unlike the 2-D tensor produced by
# the notebook's own Corpus class; presumably this `corpus` came from the
# cached file loaded in the cell above — confirm.
corpus.valid.shape

torch.Size([218448])

In [32]:

# Scratch check of functional.pad on a 1-D tensor: pad=(0, 1) adds nothing on
# the left and one element on the right; the recorded output shows a 0 appended.
x = torch.nn.functional.pad(torch.LongTensor([1,2]), pad=(0,1))
print(x)
#x = []
#for c in x1:
#    x.append(torch.tensor(c))
#x = torch.nn.utils.rnn.pad_sequence(x1, batch_first=True, padding_value=0)

#X_lengths = [2,1,3]
#X = torch.nn.utils.rnn.pack_padded_sequence(x, X_lengths, batch_first=True)

tensor([ 1,  2,  0])


In [28]:
# NOTE(review): the recorded output (a ZeroPad2d module repr) belongs to an
# earlier execution (In [28]); it does not match the tensor assigned to `x`
# in the In [32] cell above. Re-run or drop this cell.
x

ZeroPad2d(padding=[[1, 2], [1, 2, 3]], value=0)

In [14]:
# Demonstrates that a ragged nested list cannot be turned into a tensor
# directly: this raises the ValueError recorded below, which is why sentences
# have to be padded to a common length first.
x = torch.tensor([[1,2],[1],[1,2,3]])


ValueError: expected sequence of length 2 at dim 1 (got 1)

Collecting spacy
[33m  Cache entry deserialization failed, entry ignored[0m
  Downloading https://files.pythonhosted.org/packages/5c/38/fc37ad63427e9781e4bf5f350f9a1b9e472b3e48bc856ada9ace7fcf1b7d/spacy-2.1.8-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (34.7MB)
[K    100% |████████████████████████████████| 34.7MB 23kB/s eta 0:00:011  7% |██▎                             | 2.5MB 10.4MB/s eta 0:00:04    49% |███████████████▊                | 17.0MB 7.6MB/s eta 0:00:03
[?25hCollecting preshed<2.1.0,>=2.0.1 (from spacy)
[33m  Cache entry deserialization failed, entry ignored[0m
[33m  Cache entry deserialization failed, entry ignored[0m
  Downloading https://files.pythonhosted.org/packages/cb/5a/4b6f2035443e463a326e3a81863f2b4850be76b4538fd4931b1aa63f79b6/preshed-2.0.1-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (148kB)
[K    100% |███████████████████████

In [8]:
from torchtext.data import Field
# Whitespace tokenizer handed to the torchtext Field below.
tokenize = lambda x: x.split()
# Sequential text field: tokens come from `tokenize` and lower=True is set.
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

In [48]:
import torchtext
from torchtext import data
import spacy

# Tokenizer for the torchtext field: lower-cased whitespace tokens.
# (spaCy is imported in this cell but is not used for tokenization.)
def tokenizer(s):
    """Return the lower-cased tokens of ``s``.

    Args:
        s: either a raw string, which is split on whitespace, or an iterable
           of already separated string tokens.

    Returns:
        list of lower-cased string tokens.

    A raw string used to be iterated directly, which produced single
    characters instead of words.
    """
    tokens = s.split() if isinstance(s, str) else s
    return [w.lower() for w in tokens]



# Define the columns that we want to process and how to process them.
# NOTE(review): `data` here is torchtext.data (imported in this cell), which
# rebinds the name `data` imported as a project module at the top of the
# notebook; cells that call data.Corpus will break if run after this one.
# NOTE(review): use_vocab=False is combined with a string tokenizer — confirm
# this is intended.
txt_field = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=False)
# Single-column schema: the CSV column is exposed on examples as `Text`.
train_val_fields = [
    ('Text', txt_field) # process it as text
]

# Load train.csv / valid.csv from the data directory, skipping the header row.
trainds, valds = data.TabularDataset.splits(path='../data/clean_wiki', 
                                            format='csv', 
                                            train='train.csv', 
                                            validation='valid.csv', 
                                            fields=train_val_fields, 
                                            skip_header=True)

In [49]:
# Peek at the first validation example; the recorded output is only the
# default object repr, so it does not show the example's contents.
valds.examples[0]

<torchtext.data.example.Example at 0x120e88160>

In [50]:

# Build length-bucketed iterators over the train and validation datasets.
traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds), # specify train and validation Tabulardataset
                                            batch_sizes=(10,10),  # batch size of train and validation
                                            sort_key=lambda x: len(x.Text), # examples are sorted/bucketed by length of their Text field
                                            device=None, # original note: -1 means CPU, 0 or None means GPU. NOTE(review): depends on the torchtext version — confirm
                                            sort_within_batch=True, 
                                            repeat=False)

In [51]:
# Number of batches in each iterator.
# NOTE(review): the recorded output reports 0 training batches — the training
# dataset appears to be empty; check train.csv and the field definition.
print(len(traindl), len(valdl))

0 1550


In [47]:
# Fetch the first validation batch.
# NOTE(review): the recorded run (In [47], i.e. before the field definition
# cell was last executed as In [48]) failed with the AttributeError below;
# re-run after the field/dataset cells to see whether it still occurs.
batch = next(iter(valdl))

AttributeError: 'Field' object has no attribute 'vocab'

In [91]:
# Batch sizes for evaluation and testing (test_batch_size is not used in this cell).
eval_batch_size = 10
test_batch_size = 1
# batchify views the whole validation tensor as eval_batch_size rows and
# transposes it; for a 2-D padded input this flattens sentences (padding
# included) into ((rows // bsz) * cols, bsz).
val_data = batchify(corpus.valid, eval_batch_size, args)

print(val_data.shape)

torch.Size([103850, 10])


In [94]:
# Sentence-aligned alternative: sentence_based_batchify currently returns its
# input as-is, so the shape equals corpus.valid's (one padded row per sentence).
val_data = sentence_based_batchify(corpus.valid, args)
print(val_data.shape)

torch.Size([15500, 67])
