In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
def flatten(l):
    """Concatenate a list of lists into a single flat list (one level deep)."""
    return [item for sublist in l for item in sublist]


# Seed every RNG the notebook touches. Seeding only `random` leaves torch
# (weight init, dropout, multinomial sampling) and numpy unseeded, so runs
# would not be reproducible.
SEED = 1024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Device configuration: use GPU 0 when CUDA is available, otherwise stay on CPU.
USE_CUDA = torch.cuda.is_available()
gpus = [0]
if USE_CUDA:
    # Selecting a CUDA device on a machine without CUDA raises, so only do it
    # when a GPU is actually present.
    torch.cuda.set_device(gpus[0])

# Tensor constructors bound to the selected device type.
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [None]:
def prepare_sequence(seq, to_index):
    """Encode a token sequence as a LongTensor of vocabulary ids.

    Tokens that are absent from ``to_index`` (or mapped to ``None``) are
    replaced by the id stored under ``"<unk>"``.
    """
    ids = []
    for token in seq:
        mapped = to_index.get(token)
        if mapped is None:
            # The fallback is looked up lazily, only for unknown tokens.
            mapped = to_index["<unk>"]
        ids.append(mapped)
    return LongTensor(ids)

In [None]:
def prepare_ptb_dataset(filename, word2index=None):
    """Read a whitespace-tokenised text file and encode it as token ids.

    Every line is split on whitespace and terminated with a ``'</s>'`` marker;
    all lines are then concatenated into one token stream.

    Args:
        filename: path of the UTF-8 text file to read.
        word2index: existing token -> id mapping. When ``None`` a new mapping
            is built from this file, with ``'<unk>'`` reserved as id 0.

    Returns:
        ``(ids, word2index)`` where ``ids`` is the tensor produced by
        ``prepare_sequence`` for the whole token stream.
    """
    # The context manager closes the file handle even if tokenisation fails.
    with open(filename, 'r', encoding='utf-8') as corpus_file:
        corpus = flatten([line.strip().split() + ['</s>'] for line in corpus_file])

    if word2index is None:
        word2index = {'<unk>': 0}
        # Sort the vocabulary so ids are identical on every run; iterating a
        # bare set depends on string hashing, which varies between processes
        # and would silently invalidate saved vocabularies and checkpoints.
        for token in sorted(set(corpus)):
            if token not in word2index:
                word2index[token] = len(word2index)

    return prepare_sequence(corpus, word2index), word2index

In [None]:
def batchify(data, bsz):
    """Fold ``data`` into ``bsz`` equally long rows.

    Trailing elements along dimension 0 that do not fill a complete column of
    ``bsz`` items are dropped. The result is moved to the GPU when
    ``USE_CUDA`` is set.
    """
    usable = (data.size(0) // bsz) * bsz
    batched = data.narrow(0, 0, usable).view(bsz, -1).contiguous()
    return batched.cuda() if USE_CUDA else batched

In [None]:
def getBatch(data, seq_length):
    """Yield ``(inputs, targets)`` windows of ``seq_length`` columns.

    ``targets`` is the same window shifted one column to the right, so each
    target is the token that follows the corresponding input token. Windows
    start at multiples of ``seq_length`` and stop before the last column so
    that every target window is full length.
    """
    last_start = data.size(1) - seq_length
    for start in range(0, last_start, seq_length):
        stop = start + seq_length
        window = Variable(data[:, start:stop])
        shifted = Variable(data[:, start + 1:stop + 1].contiguous())
        yield (window, shifted)

In [None]:
# Build the vocabulary from the training file, then encode the validation and
# test files with that same mapping (tokens missing from it map to '<unk>').
train_data, word2index = prepare_ptb_dataset('non_spam.txt', word2index=None)
dev_data , _ = prepare_ptb_dataset('valid.txt', word2index)
test_data, _ = prepare_ptb_dataset('test.txt', word2index)

In [None]:
# Vocabulary size (number of entries in the token -> id mapping).
len(word2index)

In [None]:
# Reverse mapping (id -> token), used to turn sampled ids back into text.
index2word = {v:k for k, v in word2index.items()}

In [None]:
# Spot-check: id assigned to the token 'the' (KeyError if it is not in the vocabulary).
word2index['the']

In [None]:
# Spot-check of the reverse mapping; 31562 is a hard-coded id and raises
# KeyError if the vocabulary has no entry with that id.
index2word[31562]

In [None]:
# Disabled code kept inside a string literal: a one-off dump of the two
# vocabulary mappings to w2i.pickle / i2w.pickle. Nothing in this cell runs.
'''
import pickle

pickle_out = open("w2i.pickle","wb")
pickle.dump(word2index, pickle_out)
pickle_out.close()

pickle_out = open("i2w.pickle","wb")
pickle.dump(index2word, pickle_out)
pickle_out.close()
'''

In [None]:
import pickle

# Replace the in-memory vocabulary with the mappings saved on disk.
# NOTE(review): train/dev/test data above were encoded with the vocabulary
# built in this session; these pickles must hold the same mapping or the ids
# will no longer match — confirm.
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# were produced by this notebook.
# Context managers close each file handle once it has been read.
with open("w2i.pickle", "rb") as pickle_in:
    word2index = pickle.load(pickle_in)

with open("i2w.pickle", "rb") as pickle_in:
    index2word = pickle.load(pickle_in)

In [None]:
# Spot-check the reloaded vocabulary: id of the token 'the'.
word2index['the']

In [None]:
# em_sz is the embedding width used when building new_w below.
# NOTE(review): nh and nl are presumably the hidden size and layer count of the
# pretrained wt103 model; they are not referenced in the visible cells — confirm.
em_sz,nh,nl = 400,500,2

In [None]:
#PRE_PATH = PATH/'models'/'wt103'
# Path of the pretrained language-model weights, relative to the working directory.
PRE_LM_PATH = 'fwd_wt103.h5'

In [None]:
# Load the pretrained weights; the map_location callback returns each storage
# unchanged, which keeps the tensors on the CPU.
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [None]:
# Pretrained encoder embedding matrix as a NumPy array.
enc_wgts = (wgts['0.encoder.weight'].numpy())
# Mean over axis 0 (the mean embedding row); used below as the embedding for
# words that the pretrained vocabulary does not contain.
row_m = enc_wgts.mean(0)
#print(row_m)

In [None]:
#import pickle
import collections

# Load the pretrained model's id -> token list and invert it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle once the list has been read.
with open('/home/shivam/Documents/ether_language_model/itos_wt103.pkl', 'rb') as file:
    itos2 = pickle.load(file)
# Token -> pretrained id; tokens unknown to the pretrained vocabulary map to -1.
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [None]:
# Spot-check: pretrained id of 'apexwire' (-1 means it is not in the pretrained
# vocabulary). Indexing a defaultdict also inserts the missing key.
stoi2['apexwire']

In [None]:
# Number of elements in one pretrained embedding row (row 27319 is a hard-coded example).
enc_wgts[27319].size

In [None]:
# Build the embedding matrix for this notebook's vocabulary: row i holds the
# pretrained vector of the token whose id is i, or the mean pretrained vector
# (row_m) when the pretrained vocabulary does not contain that token.
new_w = np.zeros((len(word2index), em_sz), dtype=np.float32)
# Index rows by the token's actual id rather than by dict iteration position,
# so the matrix stays aligned with word2index however the dict was built/loaded.
for w, i in word2index.items():
    # .get avoids inserting every out-of-vocabulary word into the defaultdict.
    r = stoi2.get(w, -1)
    new_w[i] = enc_wgts[r] if r >= 0 else row_m

In [None]:
# Spot-check: True when row 1567 received the fallback mean vector, i.e. that
# word had no pretrained embedding.
np.array_equal(new_w[1567],row_m)

In [None]:
class LanguageModel(nn.Module):
    """LSTM language model: embedding -> (optional dropout) -> LSTM -> linear.

    Inputs are batch-first id tensors; the output of ``forward`` is a matrix of
    unnormalised scores with one row per (batch, step) position.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, n_layers=1, dropout_p=0.5):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding / output classes.
            embedding_size: width of each embedding vector.
            hidden_size: LSTM hidden state width.
            n_layers: number of stacked LSTM layers.
            dropout_p: dropout probability applied to the embeddings.
        """
        super(LanguageModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout_p)

    def init_weight(self, pretrained_embeddings=None):
        """Load pretrained embeddings and re-initialise the output layer.

        Args:
            pretrained_embeddings: array or tensor with the same shape as the
                embedding matrix. When ``None`` the notebook-level ``new_w``
                array is used.

        Raises:
            ValueError: if the supplied matrix does not match the embedding shape.
        """
        if pretrained_embeddings is None:
            pretrained_embeddings = new_w  # notebook-level global
        weights = torch.as_tensor(pretrained_embeddings, dtype=self.embed.weight.dtype)
        if weights.shape != self.embed.weight.shape:
            # copy_ would otherwise broadcast a mismatched matrix silently.
            raise ValueError(
                "pretrained embeddings have shape %s, expected %s"
                % (tuple(weights.shape), tuple(self.embed.weight.shape))
            )
        with torch.no_grad():
            self.embed.weight.copy_(weights)
            self.linear.bias.zero_()
        # In-place initialiser; the non-underscore variant is deprecated.
        nn.init.xavier_uniform_(self.linear.weight)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states for ``batch_size`` sequences.

        The states are created on the same device and with the same dtype as
        the model parameters, so they match wherever the model currently lives.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(*shape, device=reference.device, dtype=reference.dtype)
        context = torch.zeros(*shape, device=reference.device, dtype=reference.dtype)
        return (hidden, context)

    def detach_hidden(self, hiddens):
        """Detach each state tensor so gradients stop at the batch boundary."""
        return tuple(hidden.detach() for hidden in hiddens)

    def forward(self, inputs, hidden, is_training=False):
        """Score the next token for every position of ``inputs``.

        Args:
            inputs: id tensor of shape (batch, steps).
            hidden: ``(hidden, cell)`` tuple as returned by ``init_hidden``.
            is_training: apply dropout to the embeddings when True.

        Returns:
            ``(scores, hidden)`` where ``scores`` has shape
            (batch * steps, vocab_size) and ``hidden`` is the updated state.
        """
        embeds = self.embed(inputs)
        if is_training:
            embeds = self.dropout(embeds)
        out, hidden = self.rnn(embeds, hidden)
        # Flatten (batch, steps, hidden) to (batch * steps, hidden) for the classifier.
        flat = out.contiguous().view(out.size(0) * out.size(1), -1)
        return self.linear(flat), hidden

In [None]:
# Training hyperparameters.
EMBED_SIZE = 400  # must equal the width of the pretrained embedding matrix (em_sz)
HIDDEN_SIZE = 200
NUM_LAYER = 2
LR = 0.001
SEQ_LENGTH = 10 # for bptt
BATCH_SIZE = 32
EPOCH = 20
RESCHEDULED = False  # set to True once the learning rate has been reduced

In [None]:
# Fold each flat id stream into (batch, steps) rows; validation and test use
# half the training batch size.
train_data = batchify(train_data, BATCH_SIZE)
dev_data = batchify(dev_data, BATCH_SIZE//2)
test_data = batchify(test_data, BATCH_SIZE//2)

In [None]:
# Build the model, load the pretrained embeddings / reset the output layer,
# and move it to the GPU when one is available.
model = LanguageModel(len(word2index), EMBED_SIZE, HIDDEN_SIZE, NUM_LAYER, 0.5)
model.init_weight() 
if USE_CUDA:
    model = model.cuda()
# Cross-entropy over the flattened (batch * steps) predictions; Adam optimiser.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Truncated-BPTT training loop: the hidden state is carried across consecutive
# windows but detached so gradients never flow past the current window.
for epoch in range(EPOCH):
    losses = []
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(getBatch(train_data, SEQ_LENGTH)):
        inputs, targets = batch
        hidden = model.detach_hidden(hidden)
        model.zero_grad()
        preds, hidden = model(inputs, hidden, True)

        loss = loss_function(preds, targets.view(-1))
        losses.append(loss.item())
        loss.backward()
        # Gradient clipping; the in-place variant replaces the deprecated clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # Report the running mean loss / perplexity every 500 windows.
        if i > 0 and i % 500 == 0:
            print("[%02d/%d] mean_loss : %0.2f, Perplexity : %0.2f" % (epoch,EPOCH, np.mean(losses), np.exp(np.mean(losses))))
            losses = []

    # Learning-rate annealing: once, halfway through, cut the rate tenfold.
    # NOTE(review): re-creating Adam also resets its moment estimates, and LR /
    # RESCHEDULED are notebook globals, so re-running this cell continues with
    # the reduced rate.
    if not RESCHEDULED and epoch == EPOCH // 2:
        LR *= 0.1
        optimizer = optim.Adam(model.parameters(), lr=LR)
        RESCHEDULED = True

In [None]:
# Serialise the whole module object (not just its state_dict) to model.ckpt.
torch.save(model, 'model.ckpt')

In [None]:
# Sanity check: is a CUDA device visible to this kernel?
torch.cuda.is_available()

In [None]:
# Reload the pickled module saved above; the LanguageModel class must already
# be defined in this session for unpickling to work.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which may reject a whole-module pickle — confirm against the installed version.
model = torch.load("model.ckpt")

In [None]:
# Test-set perplexity: exp of the mean per-step cross-entropy.
total_loss = 0.0
scored_steps = 0
hidden = model.init_hidden(BATCH_SIZE//2)
# No gradients are needed for evaluation; no_grad avoids building autograd
# graphs and makes detaching the hidden state unnecessary.
with torch.no_grad():
    for inputs, targets in getBatch(test_data, SEQ_LENGTH):
        preds, hidden = model(inputs, hidden)
        # The loss is a mean over the window, so weight it by the window length.
        total_loss += inputs.size(1) * loss_function(preds, targets.view(-1)).item()
        scored_steps += inputs.size(1)

if scored_steps == 0:
    raise ValueError("test_data is too short to form a single window of SEQ_LENGTH steps")

# Normalise by the steps actually scored: getBatch never covers the final
# columns, so dividing by test_data.size(1) would understate the mean loss.
total_loss = total_loss / scored_steps
print("Test Perpelexity : %5.2f" % (np.exp(total_loss)))

In [None]:
# Generate num_samples words by repeatedly sampling from the model's
# next-word distribution, and write them to sample.txt.
num_samples=100
# Run on whatever device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device
with torch.no_grad():
    # Explicit encoding: the corpus is read as UTF-8, so write it the same way.
    with open('sample.txt', 'w', encoding='utf-8') as f:

        hidden = model.init_hidden(1)
        # Pick the first word uniformly at random from the vocabulary.
        prob = torch.ones(len(word2index))
        token = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)
        print (token.shape)
        for i in range(num_samples):
            # Forward propagate RNN
            output, hidden = model(token, hidden)

            # Sample a word id. The model emits raw scores; softmax gives the
            # same distribution as normalising exp(scores) but cannot overflow.
            prob = F.softmax(output, dim=1)
            word_id = torch.multinomial(prob, num_samples=1).item()

            # Fill input with sampled word id for the next time step
            token.fill_(word_id)

            # File write: the corpus marks sentence ends with '</s>', so break
            # the line on it ('<eos>' is still accepted for older vocabularies).
            word = index2word[word_id]
            word = '\n' if word in ('</s>', '<eos>') else word + ' '
            f.write(word)

            if (i+1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i+1, num_samples, 'sample.txt'))

In [None]:
# Release cached GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

In [None]:

# Feed a sentence through the model one word at a time and print a word
# sampled from the predicted next-word distribution after each step.
data = ["This is great and auspicious occasion"]
data = flatten([co.strip().split() + ['</s>'] for co in data])
x = prepare_sequence(data,word2index)
x = x.unsqueeze(1)
# A single row holding the whole sentence: shape (1, number of tokens).
x = batchify(x,1)

with torch.no_grad():

    hidden = model.init_hidden(1)
    for batch in getBatch(x, 1):
        inputs,targets = batch
        output, hidden = model(inputs, hidden)
        # The model emits raw scores; softmax gives the same distribution as
        # normalising exp(scores) but cannot overflow to inf.
        prob = F.softmax(output, dim=1)

        word_id = torch.multinomial(prob, num_samples=1).item()
        word = index2word[word_id]
        print (word)
                
