In [1]:
import numpy as np
import pandas as pd

import tqdm
import pickle
import flake8

from sklearn.metrics import roc_auc_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from utils.data import Vocabulary
from utils.mtl.data import CreatorDataset
from utils.mtl.model import Doc2Vec, BiTaskLSTMModel, MultiTaskLossWrapper
from utils.mtl.train import Trainer

In [2]:
%matplotlib inline

%load_ext pycodestyle_magic
%flake8_on

**Загрузка данных**

In [3]:
with open('data/data.pickle', 'rb') as f:
    df = pickle.load(f)

**Инициализация словаря**

In [4]:
corpus = df['actions'].values
targets = df[['target1', 'target2']].values

voc = Vocabulary(max_vocab_size=1000, min_freq=2)
corpus_voc = voc.transform(corpus)
corpus_voc = [doc for doc in corpus_voc if np.any(doc)]

**Инициализация датасета и даталоадера**

In [6]:
train_size = 0.6
test_size = 0.2

idx_train = int(len(corpus_voc) * train_size)
idx_test = int(len(corpus_voc) * train_size) + int(len(corpus_voc) * test_size)

batch_size = 4

ds_train = CreatorDataset(corpus=corpus_voc[:idx_train],
                          targets=targets[:idx_test], maxlen=20)
ds_val = CreatorDataset(corpus=corpus_voc[idx_train:idx_test],
                        targets=targets[idx_train:idx_test], maxlen=20)
ds_test = CreatorDataset(corpus=corpus_voc[idx_test:],
                         targets=targets[idx_test:], maxlen=20)

dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=batch_size, shuffle=True)
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False)

**Инициализация модели doc2vec, функции ошибки и оптимизатора**

In [7]:
dim_embeddings = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Doc2Vec(voc.vocab_size, dim_embeddings=dim_embeddings)
model = model.to(device)

criterion = MultiTaskLossWrapper()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

**Обучение модели**

In [8]:
trainer = Trainer(model, criterion, device, learning_rate)
losses = trainer.train(dl_train, dl_val, n_epochs=3, gap=3, verbose=True)

100%|██████████| 15/15 [00:00<00:00, 190.52it/s]
100%|██████████| 15/15 [00:00<00:00, 270.14it/s]
100%|██████████| 15/15 [00:00<00:00, 248.40it/s]


epoch: 0| train loss: 0.6457, test loss: 0.6056
epoch: 1| train loss: 0.6096, test loss: 0.5733
epoch: 2| train loss: 0.5854, test loss: 0.5530


Качество модели

In [9]:
preds = trainer.predict(dl_test)

for i in range(2):
    print('score {} task: {:.4f}'.format(i, roc_auc_score(ds_test.targets[:, i], preds[:, i])))

score 0 task: 0.9286
score 1 task: 0.6400


**Инициализация и обучение модели LSTM на 2 задачи**

In [12]:
model = BiTaskLSTMModel(voc.vocab_size, dim_embeddings=dim_embeddings)
model = model.to(device)

criterion = MultiTaskLossWrapper()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

In [13]:
trainer = Trainer(model, criterion, device, learning_rate)
losses = trainer.train(dl_train, dl_val, n_epochs=3, gap=3, verbose=True)

100%|██████████| 15/15 [00:12<00:00,  1.23it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

epoch: 0| train loss: 0.6859, test loss: 0.6271


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

epoch: 1| train loss: 0.5857, test loss: 0.5292


100%|██████████| 15/15 [00:10<00:00,  1.39it/s]


epoch: 2| train loss: 0.4828, test loss: 0.4414


In [14]:
preds = trainer.predict(dl_test)

for i in range(2):
    print('score {} task: {:.4f}'.format(i, roc_auc_score(ds_test.targets[:, i], preds[:, i])))

score 0 task: 0.7619
score 1 task: 0.6800


**Низкое и нестабильное качество моделей связано с очень малым объемом данных**