# Задача 2. Классификация даты документа.
### Михаил Селюгин

Устанавливаем и импортируем необходимые библиотеки

In [None]:
!pip install bigartm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import artm

from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from google.colab import drive
from bisect import bisect_left

drive.mount('/content/drive')

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Copy the provided topic-model data from Google Drive into the local workspace.

import shutil
# dirs_exist_ok=True lets this cell be re-run: without it copytree raises
# FileExistsError as soon as the destination directory already exists.
shutil.copytree(
    src='/content/drive/MyDrive/Colab Notebooks/topicmodeling',
    dst='/content/topicmodel',
    dirs_exist_ok=True,
)

## Предобработка данных и установка предобученной модели

In [None]:
# Load the lenta-ru CSV from the local copy of the provided data.
data = pd.read_csv('topicmodel/lenta-ru-proccess.csv')
data  # last expression of the cell: displays the DataFrame

In [None]:
# Take the third '-'-separated field of `date` as the integer year.
# NOTE(review): this presumes dates look like DD-MM-YYYY — confirm against the CSV.
# The vectorised string accessor replaces a row-wise `apply(axis=1)`, which
# builds a Series per row and is much slower on a large frame.
data['year'] = data['date'].str.split('-').str[2].astype(int)

# Sorted distinct years; positions in this list are used as class indices later on.
all_years = sorted(data['year'].unique())
print(all_years)

data

In [None]:
# Text files that the train/test split writes and the batch vectorizers later read
# (passed to BigARTM with data_format='vowpal_wabbit').
file_train = 'topicmodel/vw_data/texts_train.vw.txt'
file_test = 'topicmodel/vw_data/texts_test.vw.txt'

In [None]:
# Reproducible split: one fixed-seed uniform draw per row; rows with a draw
# below 0.9 go to the training part, the rest to the test part.
train_mask = np.random.default_rng(1).random(len(data)) < 0.9
split_regexp = re.compile(r'\W+')

# Year labels, kept in the same order as the documents written to each file
# (the document title `doc_<n>` is the index into the matching list).
year_train = []
year_test = []

# Explicit UTF-8 so the output does not depend on the platform's default encoding.
with open(file_train, 'w', encoding='utf-8') as ftrain, \
        open(file_test, 'w', encoding='utf-8') as ftest:
    # Walk the columns positionally, in lockstep with `train_mask`. The mask is
    # positional, so label-based `data.loc[i]` lookups would only line up with it
    # for a default RangeIndex (and they rebuild a whole row Series per access).
    rows = zip(data['text'], data['year'], train_mask)
    for raw_text, year, is_train in tqdm(rows, total=len(data)):
        # Replace every run of non-word characters with a space, then tokenize.
        tokens = split_regexp.sub(' ', raw_text).split()
        if len(tokens) < 100:
            # Documents shorter than 100 tokens are dropped from both parts.
            continue
        text = ' '.join(tokens)

        if is_train:
            ftrain.write(f'doc_{len(year_train)} {text}\n')
            year_train.append(year)
        else:
            ftest.write(f'doc_{len(year_test)} {text}\n')
            year_test.append(year)

In [None]:
# Debug output: shows whatever the preprocessing loop last left in `text`.
text

Батчи либо формируем, либо берем уже сформированные

In [None]:
#  bv_train = artm.BatchVectorizer(data_path='topicmodel/batches_train',
#                                         data_format='batches')
#  bv_test = artm.BatchVectorizer(data_path='topicmodel/batches_test',
#                                         data_format='batches')

In [None]:
# Options shared by both vectorizers: the input format and the batch size.
vw_batch_options = dict(data_format='vowpal_wabbit', batch_size=10000)

# Convert the training text file into batches stored under 'topicmodel/batches_train'.
bv_train = artm.BatchVectorizer(
    data_path=file_train,
    target_folder='topicmodel/batches_train',
    **vw_batch_options,
)

# Same conversion for the test text file, stored under 'topicmodel/batches_test'.
bv_test = artm.BatchVectorizer(
    data_path=file_test,
    target_folder='topicmodel/batches_test',
    **vw_batch_options,
)

In [None]:
# Collect the dictionary from the training batches.
dictionary = artm.Dictionary()
# The batches live in 'topicmodel/batches_train' (the BatchVectorizer target folder);
# the previous 'topicmodeling/...' path does not match any folder this notebook creates.
dictionary.gather(data_path='topicmodel/batches_train')

### Инициализация моделей без регуляризации и с ней (загружаем предобученные)

In [None]:
# Restore the saved ARTM model from the 'without_regular' directory on Drive
# (judging by the path, the variant trained without regularizers).
model_without = artm.load_artm_model(
    './drive/MyDrive/Colab Notebooks/topicmodeling/model/without_regular/'
)

# Register a perplexity score bound to `dictionary`; overwrite=True is passed so an
# already-present score with the same name does not block the call.
perplexity_without = artm.PerplexityScore(name='perplexity', dictionary=dictionary)
model_without.scores.add(perplexity_without, overwrite=True)

In [None]:
# Restore the saved ARTM model from the 'last_model' directory on Drive
# (presumably the regularized variant — confirm against how it was trained).
model_last = artm.load_artm_model(
    './drive/MyDrive/Colab Notebooks/topicmodeling/model/last_model/'
)

# Register a perplexity score bound to `dictionary`; overwrite=True is passed so an
# already-present score with the same name does not block the call.
perplexity_last = artm.PerplexityScore(name='perplexity', dictionary=dictionary)
model_last.scores.add(perplexity_last, overwrite=True)

### Сравним скоры моделей по perplexity

In [None]:
# Draw the tracked perplexity values of both ARTM models on one set of axes.
_, perplexity_ax = plt.subplots(figsize=(16, 8))
perplexity_curves = (
    (model_without, 'model1_perplexity'),
    (model_last, 'model2_perplexity'),
)
for tracked_model, curve_label in perplexity_curves:
    perplexity_ax.plot(tracked_model.score_tracker['perplexity'].value, label=curve_label)

perplexity_ax.legend(loc='best')
perplexity_ax.set_xlabel('iteration')
perplexity_ax.set_ylabel('score')
plt.show()

## Векторизация по темам

In [None]:
def make_dataset(artm: artm.ARTM, bv: artm.BatchVectorizer, years: list):
    """Pair each document's topic vector with a one-hot encoding of its year.

    Args:
        artm: topic model whose ``transform`` yields the document-topic values.
            (The parameter name shadows the ``artm`` module inside this function.)
        bv: batch vectorizer holding the documents to transform.
        years: year of every document, in the same order as the documents.

    Returns:
        A list of ``(topic_vector, year_onehot)`` tuples, one per document;
        ``year_onehot`` is a float32 array of length ``len(all_years)``.

    Raises:
        ValueError: if the number of documents differs from ``len(years)``, or a
            year is not present in the module-level ``all_years``.
    """
    # Transposed so that each row is the vector of one document.
    # NOTE(review): assumes the columns returned by transform() follow the order in
    # which the documents were written to the source file — confirm for BigARTM.
    values = np.array(artm.transform(bv).values).T
    if len(values) != len(years):
        raise ValueError(
            f'got {len(values)} document vectors but {len(years)} year labels'
        )

    # Explicit year -> class index mapping. A bisect lookup would silently return
    # a neighbouring class (or an out-of-range index) for an unknown year.
    year_to_class = {year: idx for idx, year in enumerate(all_years)}

    dataset = []
    for topic_vector, year in zip(values, years):
        if year not in year_to_class:
            raise ValueError(f'year {year!r} is not present in all_years')
        year_onehot = np.zeros(len(all_years), dtype='float32')
        year_onehot[year_to_class[year]] = 1.0
        dataset.append((topic_vector, year_onehot))
    return dataset


In [None]:
# with open('topicmodel/vw_data/y_valid.vw.txt') as f:
#     y_valid = f.read().split('\n')

# with open('topicmodel/vw_data/y_train.vw.txt') as f:
#     y_train = f.read().split('\n')

In [None]:
# Train/test datasets built from the topic vectors of `model_without`.
train_dataset1 = make_dataset(model_without, bv_train, year_train)
test_dataset1 = make_dataset(model_without, bv_test, year_test)

In [None]:
# Train/test datasets built from the topic vectors of `model_last`.
train_dataset2 = make_dataset(model_last, bv_train, year_train)
test_dataset2 = make_dataset(model_last, bv_test, year_test)

In [None]:
# X_train_pd = model_without.transform(bv_train)

# X_train = []
# for i in tqdm(range(len(y_train))):
#     X_train.append(X_train_pd[i].values)

# X_train = np.array(X_train)
# y_train = np.array(y_train)

# from sklearn.utils import shuffle
# X_train, y_train = shuffle(X_train, y_train)
# X_train = X_train[:20000]
# y_train = y_train[:20000]

In [None]:
# Report the sizes of both dataset variants (previously the first one was printed twice).
print("Train data length:", len(train_dataset1), len(train_dataset2))
print("Test data length:", len(test_dataset1), len(test_dataset2))

## Инициализация классификатора (многослойной нейронной сети)

In [None]:
class Model(nn.Module):
    """Fully connected classifier built from Linear -> BatchNorm1d -> ReLU blocks.

    The network has one input block (``input_size`` -> ``hidden_size``), then
    ``n_layers`` hidden blocks (``hidden_size`` -> ``hidden_size``), then a final
    linear layer producing ``output_size`` raw scores (no activation applied).
    """

    def __init__(self, input_size, output_size, hidden_size, n_layers):
        super().__init__()

        # Input width of every block: the first sees the raw features,
        # the remaining n_layers blocks see the hidden representation.
        block_inputs = [input_size] + [hidden_size] * n_layers

        stack = []
        for in_features in block_inputs:
            stack.extend((
                nn.Linear(in_features, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
            ))
        stack.append(nn.Linear(hidden_size, output_size))

        self.layers = nn.ModuleList(stack)

    def forward(self, x: torch.Tensor):
        """Apply every registered layer to ``x`` in order and return the result."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

In [None]:
class Trainer():
    """Runs training and validation passes for a model using cross-entropy loss.

    Args:
        model: the ``nn.Module`` to train; moved to CUDA when available, else CPU.
        optimizer: optimizer used to update the model (expected to already hold
            the model's parameters).
    """

    def __init__(self, model, optimizer):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.optimizer = optimizer
        self.criterion = nn.CrossEntropyLoss()

    def train_epoch(self, train_loader, epoch, writer):
        """Run one optimization pass over ``train_loader``.

        Args:
            train_loader: sized iterable of ``(x, y)`` batches.
            epoch: zero-based epoch number, used for the global step and the
                progress-bar text.
            writer: object with ``add_scalar(tag, value, step)`` that receives the
                per-batch loss under ``'loss/train'``; ``None`` disables logging.
        """
        self.model.train()
        loop = tqdm(train_loader)
        for step, (x, y) in enumerate(loop):
            x = x.to(self.device)
            y = y.to(self.device)
            self.optimizer.zero_grad()
            loss = self.criterion(self.model(x), y)
            # Log a plain float rather than the tensor that is attached to the graph.
            loss_value = loss.item()
            if writer is not None:
                curr_step = epoch * len(loop) + step
                writer.add_scalar('loss/train', loss_value, curr_step)
            loop.set_description(f'{epoch}. train_loss: {loss_value:.3f}')
            loss.backward()
            self.optimizer.step()

    @torch.no_grad()
    def validate(self, val_loader, epoch, writer):
        """Evaluate the model on ``val_loader`` and log the mean batch loss.

        Args:
            val_loader: iterable of ``(x, y)`` batches.
            epoch: epoch number used as the logging step.
            writer: object with ``add_scalar(tag, value, step)`` that receives the
                mean loss under ``'loss/valid'``; ``None`` disables logging.

        Returns:
            The mean of the per-batch losses, or ``nan`` when the loader yields
            no batches (nothing is logged in that case).
        """
        self.model.eval()
        total_loss = 0.0
        n_batches = 0
        loop = tqdm(val_loader)
        for x, y in loop:
            x = x.to(self.device)
            y = y.to(self.device)
            loss = self.criterion(self.model(x), y).item()
            total_loss += loss
            n_batches += 1
            loop.set_description(f'{epoch}. Val_loss: {loss:.3f}')
        if n_batches == 0:
            # Avoid dividing by zero on an empty validation set.
            return float('nan')
        mean_loss = total_loss / n_batches
        if writer is not None:
            # Log the epoch average, not the loss of the last batch only.
            writer.add_scalar('loss/valid', mean_loss, epoch)
        return mean_loss

## Обучение и тестирование

In [None]:
def train(train_dataset,
          test_dataset, 
          n_epochs=10,
          hidden_size=256, 
          n_layers=5,
          writer=None):
    """Train a fresh ``Model`` on ``train_dataset`` and validate it after each epoch.

    Args:
        train_dataset: indexable dataset of ``(features, target)`` pairs; the
            length of the first feature vector defines the model input size.
        test_dataset: dataset of the same form used for validation.
        n_epochs: number of passes over the training data.
        hidden_size: width of the hidden layers.
        n_layers: number of hidden blocks passed to ``Model``.
        writer: TensorBoard ``SummaryWriter`` for the loss curves. When ``None``,
            a writer with the default log directory is created and closed here.

    Returns:
        The trained ``Model`` instance.
    """
    # The loss is logged through the writer on every step, so the default of
    # ``None`` needs a real writer behind it.
    owns_writer = writer is None
    if owns_writer:
        writer = SummaryWriter()

    model = Model(input_size=len(train_dataset[0][0]), output_size=len(all_years),
                          hidden_size=hidden_size, n_layers=n_layers)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    dl_train = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
    dl_valid = torch.utils.data.DataLoader(test_dataset, batch_size=64)

    trainer = Trainer(model, optimizer)
    for epoch in range(n_epochs):
        trainer.train_epoch(dl_train, epoch, writer)
        trainer.validate(dl_valid, epoch, writer)

    if owns_writer:
        # Flush and release the event file of the writer created above.
        writer.close()
    # Hand the trained network back so it can be evaluated or saved by the caller.
    return model

In [None]:
# TensorBoard writer whose event files are stored under the local 'runs' directory.
writer = SummaryWriter(log_dir='runs')

### Зависимость качества предсказания от модели ARTM (с регуляризацией или без)

In [None]:
# Give each run its own sub-directory of 'runs'. Both runs log to the same tags
# ('loss/train', 'loss/valid') with the same step numbers, so a single shared
# writer would put the two curves on top of each other and hide the comparison.
writer_without = SummaryWriter(log_dir='runs/model_without')
train(train_dataset=train_dataset1, test_dataset=test_dataset1, writer=writer_without)
writer_without.close()

writer_last = SummaryWriter(log_dir='runs/model_last')
train(train_dataset=train_dataset2, test_dataset=test_dataset2, writer=writer_last)
writer_last.close()

In [None]:
%reload_ext tensorboard
%tensorboard --logdir "runs"

In [None]:
from google.colab import files
!zip -r task2.zip ./runs/
files.download('task2.zip')

## Выводы

Судя по большим значениям лосса, классификация получилась не слишком успешная, как с регуляризацией, так и без нее.

С чем это связано? Вряд ли с классификатором — скорее с исходной ARTM-моделью и особенностями её работы.