In [2]:
# !pip install tqdm

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/6c/4b/c38b5144cf167c4f52288517436ccafefe9dc01b8d1c190e18a6b154cd4a/tqdm-4.31.1-py2.py3-none-any.whl (48kB)
[K    100% |████████████████████████████████| 51kB 5.2MB/s ta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.31.1
[33mYou are using pip version 19.0.1, however version 19.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
import torch.autograd as autograd
from data_process import DataHandle, get_task_data
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import codecs
import random
print('Libraries imported!')

# we fix the seeds to get consistent results
SEED = 234
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

from gensim.models import Word2Vec
def word2vec_embedding(obj, embed_size=50, min_count=1, window=5):
    sentences = obj.tokenized_corpus
    model = Word2Vec(sentences,min_count=min_count, window=window, size=embed_size)
    # model.build_vocab(sentences)  # prepare the model vocabulary
    # train word vectors
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    # add the first vector as pading
    embed_vectors = np.vstack([np.zeros((1, embed_size)), model.wv.vectors])
    return embed_vectors
def get_model_inputs(tokenized_corpus, word2idx, labels, max_len):
    # we index our sentences
    vectorized_sents = [[word2idx[tok] for tok in sent if tok in word2idx] for sent in tokenized_corpus]
    # we create a tensor of a fixed size filled with zeroes for padding
    sent_tensor = Variable(torch.zeros((len(vectorized_sents), max_len))).long()
    sent_lengths = [len(sent) for sent in vectorized_sents]
    # we fill it with our vectorized sentences
    for idx, (sent, sentlen) in enumerate(zip(vectorized_sents, sent_lengths)):
        sent_tensor[idx, :sentlen] = torch.LongTensor(sent)
    label_tensor = torch.FloatTensor(labels)
    return sent_tensor, label_tensor


class LSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_len, num_classes):
        super(LSTM,self).__init__()
        # embedding (lookup layer) layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # hidden layer
#         self.lstm = nn.LSTM(embedding_dim,hidden_dim,1,bidirectional=True,dropout = 0.1)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,1,bidirectional=True)
        # output layer
        self.hidden_dim=hidden_dim
        self.out = nn.Linear(hidden_dim*2*2, num_classes)
#         self.hidden = self.init_hidden()

    def forward(self, x):
        embedded = self.embedding(x)
#         print(embedded)
        states, hidden = self.lstm(embedded.permute([1, 0, 2]))
        encoding = torch.cat([states[0],states[-1]], dim=1)
        out = self.out(encoding)
#         print(encoding.shape)

#         states=states.permute([1, 0, 2])
#         states=states.reshape(states.shape[0],-1)
#         print(states[20])
#         out = self.out(states[30])
        
        return out
    
def accuracy(output, target):
    predict = torch.round(torch.sigmoid(output))
#     print(predict)
    correct = (predict == target).float()
    acc = correct.sum() / len(correct)
    return acc

def train_lstm(embeddingw,embed_size,train_sent_tensor, train_label_tensor, valid_sent_tensor, valid_label_tensor,
               epochs=10, Vocabulary=0, EMBEDDING_DIM=15, HIDDEN_DIM=8, OUTPUT_DIM=1, max_len=0, lr=0.01, batch=64):

    model = LSTM(EMBEDDING_DIM, HIDDEN_DIM, Vocabulary, max_len, OUTPUT_DIM)
    model.embedding.weight.data.copy_(torch.from_numpy(embeddingw)) #use own embedding
    model.embedding.weight.require_grad = False
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCEWithLogitsLoss()

    num = len(train_label_tensor) // batch
    for epoch in range(1, epochs + 1):
        # to ensure the dropout (exlained later) is "turned on" while training
        # good practice to include even if do not use here
        np.random.seed(SEED)
        p = np.random.permutation(len(train_label_tensor))
        train_sent_tensor, train_label_tensor = train_sent_tensor[p], train_label_tensor[p]
        epoch_loss = 0
        for i in range(num):
            feature = train_sent_tensor[i * batch:(i+1) * batch]
            target = train_label_tensor[i * batch:(i+1) * batch]
#             print(feature)
            model.train()
            # we zero the gradients as they are not removed automatically
            optimizer.zero_grad()
            # queeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
            predictions = model(feature).squeeze(1)
#             print(predictions)
#             print(predictions)
            loss = loss_fn(predictions, target)

            # calculate the gradient of each parameter
            loss.backward()
            # update the parameters using the gradients and optimizer algorithm
            optimizer.step()
            batch_loss = loss.item()
            # print(f'| Epoch: {epoch:02} | Batch: {i: 04} | Train Loss: {batch_loss:.3f}')
        predict = model(train_sent_tensor).squeeze(1)
        predict_val = model(valid_sent_tensor).squeeze(1)
        train_acc = accuracy(predict, train_label_tensor)
        valid_acc = accuracy(predict_val, valid_label_tensor)
#         print(predict_val.shape)
        print(f'Epoch: {epoch: 03} | Train accuracy: {train_acc * 100: .2f}% | Valid acc: {valid_acc * 100: .2f}%')


if __name__ == '__main__':
    import pickle
    with open('data_object.pkl', 'rb') as f:
        obj = pickle.load(f)
        f.close()
#     obj = DataHandle()
    embed_size = 10
    hiddendim=6
    lr=0.02
    epochs=5
    embedding = word2vec_embedding(obj, embed_size=embed_size)
    print('embedding over ...')
    tokenized_corpus = obj.tokenized_corpus
    train, train_labels = get_task_data(obj, train=True, task='a')

    sent_lengths = [len(sent) for sent in tokenized_corpus]
    max_len = np.max(np.array(sent_lengths))

    word2idx = obj.word2idx

    train_sent_tensor, train_label_tensor = get_model_inputs(tokenized_corpus, word2idx, train_labels, max_len)
    print(train_sent_tensor.shape)
    print('lstm embed_size:',embed_size,' hiddendim:',hiddendim)
    train_lstm(embedding,embed_size,train_sent_tensor[:10000], train_label_tensor[:10000], train_sent_tensor[-3000:], train_label_tensor[-3000:],
               epochs=epochs, lr=lr, Vocabulary=len(word2idx),EMBEDDING_DIM=embed_size,HIDDEN_DIM=hiddendim)


Libraries imported!




embedding over ...
---------------Prepare data for task a---------------
---------You are requiring train data!---------
torch.Size([13240, 105])
lstm embed_size: 10  hiddendim: 6
Epoch:  01 | Train accuracy:  66.73% | Valid acc:  66.83%
Epoch:  02 | Train accuracy:  87.62% | Valid acc:  72.37%
Epoch:  03 | Train accuracy:  92.87% | Valid acc:  73.07%
Epoch:  04 | Train accuracy:  94.78% | Valid acc:  74.37%
Epoch:  05 | Train accuracy:  96.30% | Valid acc:  73.40%


In [2]:
import numpy as np
x=np.array([1,2])
y=x
y+=1
print(y,x)


(array([2, 3]), array([2, 3]))
