In [3]:
import pandas as pd

from collections import Counter

from tqdm import tqdm_notebook as tqdm

import numpy as np

import torch
torch.__version__

import sklearn.model_selection as ms

import torchtext.vocab as vb

from torch.utils import data
from torch import nn
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim

import torch.nn.functional as F
from torch.optim.lr_scheduler import ExponentialLR

from ignite.engine import create_supervised_evaluator, create_supervised_trainer
from ignite.metrics import Accuracy, Precision, Recall, Loss

from ignite.contrib.handlers.tqdm_logger import ProgressBar
from ignite.engine import Events
from ignite.handlers import ModelCheckpoint, EarlyStopping

In [4]:
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

In [5]:
torch.cuda.get_device_properties("cuda")

_CudaDeviceProperties(name='GeForce GTX 1050 Ti with Max-Q Design', major=6, minor=1, total_memory=4042MB, multi_processor_count=6)

In [6]:
dataset = pd.read_hdf("../data/small_data.hdf", "df")

In [7]:
dataset.head(4)

Unnamed: 0,x,y
0,"[[START], Hello]","[üíú, <NOE>]"
1,"[[START], So, yesterday, I, got, my, self, a, ...","[<NOE>, <NOE>, <NOE>, <NOE>, <NOE>, <NOE>, <NO..."
2,"[[START], Lord, bustta, I, greet, you, üèø, üèø]","[<NOE>, <NOE>, <NOE>, <NOE>, <NOE>, üôå, üôå, <NOE>]"
3,"[[START], ADELIN, MADE, ME, YOUR, FRIEND]","[<NOE>, <NOE>, <NOE>, <NOE>, <NOE>, üòî]"


In [8]:
print(f"Totally {dataset.shape[0]} examples in dataset")

Totally 419582 examples in dataset


# Complete vocabualry

In [9]:
token_vocab = set()
emoji_vocab = set()

for row in tqdm(dataset.values):
    token_vocab |= frozenset(row[0])
    emoji_vocab |= frozenset(row[1])

HBox(children=(IntProgress(value=0, max=419582), HTML(value='')))




In [10]:
print(f"Totally {len(emoji_vocab)} emojis and {len(token_vocab)} tokens")

Totally 1161 emojis and 115405 tokens


# Splitting into train, dev and test

In [11]:
dataset = dataset.sample(frac=1.0).reset_index(drop=True)

In [12]:
TEST_SIZE = 0.2
MIN_FREQ = 5
MAX_SIZE_TOKENS = 100000
MAX_SIZE_EMOJIS = 1000
VECTORS = None

In [13]:
train_and_dev, test = ms.train_test_split(dataset, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [14]:
train, dev = ms.train_test_split(train_and_dev, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [15]:
print(f"Totally {train.shape[0]} train examples, {dev.shape[0]} dev examples and {test.shape[0]} test examples")

Totally 268532 train examples, 67133 dev examples and 83917 test examples


# Reduced Vocabulary

In [16]:
tokens = Counter()
emojis = Counter()

for row in tqdm(train.values):
    tokens.update(row[0])
    emojis.update(row[1])

HBox(children=(IntProgress(value=0, max=268532), HTML(value='')))




In [17]:
tokens_vocab = vb.Vocab(
    tokens, max_size=MAX_SIZE_TOKENS,
    min_freq=MIN_FREQ, specials=['[START]', '[PAD]'],
    vectors=VECTORS
)
emojis_vocab = vb.Vocab(
    emojis, max_size=MAX_SIZE_EMOJIS,
    min_freq=MIN_FREQ, specials=['<NOE>', '<PAD>'],
    specials_first=True
)

In [18]:
print(f"Totally {len(tokens_vocab)} tokens and {len(emojis_vocab)} emojis")

Totally 20156 tokens and 935 emojis


In [19]:
class PosDataset(data.Dataset):
    def __init__(self, df, tokens_vocab, emojis_vocab, max_len=30):
        super(PosDataset, self).__init__()
        self.examples = df.values
        self.tokens = tokens_vocab
        self.emojis = emojis_vocab
        self.max_len = max_len
        
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        item = self.examples[idx]
        x = item[0]
        y = item[1]
        
        fix_x = torch.ones(self.max_len, dtype=torch.int64)
        fix_y = torch.ones(self.max_len, dtype=torch.int64)
        
        seq_len = np.minimum(len(x), self.max_len)
        for i in range(seq_len):
            fix_x[i] = self.tokens.stoi[x[i]]
            fix_y[i] = self.emojis.stoi[y[i]]
            
        return fix_x, fix_y

In [38]:
def pos2tensor(pos_dataset: PosDataset):
    x_tensors = []
    y_tensors = []
    for i in tqdm(range(len(pos_dataset))):
        #print(i)
        x, y = pos_dataset[i]
        x_tensors.append(x)
        y_tensors.append(y)
        
    x_tensors = torch.stack(x_tensors)
    y_tensors = torch.stack(y_tensors)
    
    return data.TensorDataset(x_tensors, y_tensors)

In [39]:
train_dataset = PosDataset(train, tokens_vocab, emojis_vocab)
dev_dataset   = PosDataset(dev, tokens_vocab, emojis_vocab)
test_dataset  = PosDataset(test, tokens_vocab, emojis_vocab)

In [40]:
tensor_train  = pos2tensor(train_dataset)
tensor_dev    = pos2tensor(dev_dataset)
tensor_test   = pos2tensor(test_dataset)

HBox(children=(IntProgress(value=0, max=268532), HTML(value='')))




HBox(children=(IntProgress(value=0, max=67133), HTML(value='')))




HBox(children=(IntProgress(value=0, max=83917), HTML(value='')))




In [None]:
%%timeit
_ = tensor_train[np.random.randint(0, len(tensor_train), size=1)[0]]

In [None]:
%%timeit
_ = train_dataset[np.random.randint(0, len(tensor_train), size=1)[0]]

In [41]:
class SimplePos(nn.Module):
    def __init__(self, vocab_size, emojis_num, embedding_dim=256, hidden_dim=128, seq_len=30):
        super(SimplePos, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len
        self.emojis_num = emojis_num
        
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim,
            padding_idx=1
        )
        
        self.norm = nn.LayerNorm(embedding_dim)
        self.hidden_layer = nn.Linear(in_features=(seq_len*embedding_dim), out_features=hidden_dim)
        self.output_layer = nn.Linear(in_features=hidden_dim, out_features=(seq_len*emojis_num))
        
        nn.init.kaiming_normal_(self.embedding.weight)
        nn.init.kaiming_normal_(self.hidden_layer.weight)
        nn.init.kaiming_normal_(self.output_layer.weight)
        
    def forward(self, x):
        emb = self.embedding(x)
        emb = self.norm(emb)
        
        hidden = emb.view(-1, self.seq_len*self.embedding_dim)
        hidden = F.leaky_relu(self.hidden_layer(hidden))
        
        output = self.output_layer(hidden).view(-1, self.seq_len, self.emojis_num)
        
        return output

In [42]:
class SequentialCrossEntropy(nn.Module):
    def __init__(self, base_criterion=nn.CrossEntropyLoss(ignore_index=1)):
        super(SequentialCrossEntropy, self).__init__()
        self.criterion = base_criterion
        
    def forward(self, input: torch.Tensor, target: torch.Tensor):
        batch_size = input.size(0)
        seq_len = input.size(1)
        
        return self.criterion(input=input.view(batch_size*seq_len, -1), target=target.view(batch_size*seq_len))

In [149]:
model_params = {
    'embedding_dim': 300,
    'hidden_dim': 128,
    'vocab_size': len(tokens_vocab),
    'emojis_num': len(emojis_vocab),
    'seq_len': 30
}

params = {
    'lr': 0.0001,
    'batch_size': 64,
    'log_interval': 1000
}

writer = SummaryWriter()

device = torch.device('cuda:0')

In [150]:
model: nn.Module = SimplePos(**model_params)

optimizer = optim.Adam(model.parameters(), lr=params['lr'])
criterion = SequentialCrossEntropy()

metrics = {
    'avg_loss': Loss(criterion)
}

train_loader = data.DataLoader(tensor_train, batch_size=params['batch_size'])
valid_loader = data.DataLoader(tensor_dev, batch_size=params['batch_size'])

In [151]:
trainer   = create_supervised_trainer(model, optimizer, criterion, device)
evaluator = create_supervised_evaluator(model, metrics, device=device)

In [152]:
@trainer.on(Events.ITERATION_COMPLETED)
def log_train_loss(engine):
    iteration = (engine.state.iteration - 1) % len(train_loader) + 1
    writer.add_scalar('Train Loss', engine.state.output, global_step=engine.state.iteration)
    if iteration % params['log_interval'] == 0:
        print(f"Epoch[{engine.state.epoch}] " +
              f"Iteration[{iteration}/{len(train_loader)}] " + 
              f" Loss: {engine.state.output:.4f}")

In [153]:
@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_val_metrics(engine):
    epoch = engine.state.epoch
    print("Compute validation metrics...")
    metrics = evaluator.run(valid_loader).metrics
    writer.add_scalar('Validation Loss', metrics['avg_loss'], global_step=engine.state.epoch)
    print(f"Validation Results - Epoch: {engine.state.epoch}  Average Loss: {metrics['avg_loss']:.4f}")

In [154]:
lr_scheduler = ExponentialLR(optimizer, gamma=0.8)

@trainer.on(Events.EPOCH_COMPLETED)
def update_lr_scheduler(engine):
    lr_scheduler.step()
    # –í—ã–≤–æ–¥ –∑–Ω–∞—á–µ–Ω–∏–π —Å–∫–æ—Ä–æ—Å—Ç–∏ –æ–±—É—á–µ–Ω–∏—è:
    lr = float(optimizer.param_groups[0]['lr'])
    writer.add_scalar("Learning Rate", lr, global_step=engine.state.epoch)
    print("Learning rate: {}".format(lr)) 

In [155]:
def score_function(engine):
    val_avg_loss = engine.state.metrics['avg_loss']
    return val_avg_loss

In [156]:
best_model_saver = ModelCheckpoint(
    "../models/simple-pos",  
    filename_prefix="simple-pos",
    score_name="val_loss",  
    score_function=score_function,
    n_saved=3,
    save_as_state_dict=True,
    create_dir=True,
    require_empty=False
)

In [157]:
evaluator.add_event_handler(
    Events.COMPLETED,
    best_model_saver, 
    {"best_model": model}
)

In [158]:
training_saver = ModelCheckpoint(
    "../models/checkpoints",
    filename_prefix="simplepos",
    save_interval=1000,
    n_saved=1,
    save_as_state_dict=True,
    create_dir=True,
    require_empty=False
)

In [159]:
to_save = {"model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler} 
trainer.add_event_handler(Events.ITERATION_COMPLETED, training_saver, to_save)

In [160]:
early_stopping = EarlyStopping(
    patience=10, 
    score_function=score_function, 
    trainer=trainer
)
evaluator.add_event_handler(Events.EPOCH_COMPLETED, early_stopping)

In [161]:
max_epochs = 100
output = trainer.run(train_loader, max_epochs=max_epochs)

Epoch[1] Iteration[1000/4196]  Loss: 0.9095
Epoch[1] Iteration[2000/4196]  Loss: 0.6344
Epoch[1] Iteration[3000/4196]  Loss: 0.6530
Epoch[1] Iteration[4000/4196]  Loss: 0.5822
Compute validation metrics...
Validation Results - Epoch: 1  Average Loss: 0.6169
Learning rate: 8e-05
Epoch[2] Iteration[1000/4196]  Loss: 0.6649
Epoch[2] Iteration[2000/4196]  Loss: 0.5486
Epoch[2] Iteration[3000/4196]  Loss: 0.5553
Epoch[2] Iteration[4000/4196]  Loss: 0.5080
Compute validation metrics...
Validation Results - Epoch: 2  Average Loss: 0.5782
Learning rate: 6.400000000000001e-05
Epoch[3] Iteration[1000/4196]  Loss: 0.6007
Epoch[3] Iteration[2000/4196]  Loss: 0.5050
Epoch[3] Iteration[3000/4196]  Loss: 0.4964
Epoch[3] Iteration[4000/4196]  Loss: 0.4685
Compute validation metrics...
Validation Results - Epoch: 3  Average Loss: 0.5593
Learning rate: 5.120000000000001e-05
Epoch[4] Iteration[1000/4196]  Loss: 0.5519
Epoch[4] Iteration[2000/4196]  Loss: 0.4732
Epoch[4] Iteration[3000/4196]  Loss: 0.4490

ERROR:ignite.engine.engine.Engine:Current run is terminating due to exception: .
ERROR:ignite.engine.engine.Engine:Engine run is terminating due to exception: .


KeyboardInterrupt: 