ivan.smurov@abbyy.com

Дедлайн: утро 9 марта


# ДЗ #1

### Подготовка данных

In [1]:
%%capture
!pip install razdel
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!gzip -d lenta-ru-news.csv.gz
!head -n 2 lenta-ru-news.csv

In [2]:
import pandas as pd
import re
import datetime as dt
from razdel import tokenize, sentenize
from string import punctuation

def get_date(url):
    dates = re.findall(r"\d\d\d\d\/\d\d\/\d\d", url)
    return next(iter(dates), None)

dataset = pd.read_csv("lenta-ru-news.csv", sep=',', quotechar='\"', escapechar='\\', encoding='utf-8', header=0)
dataset["date"] = dataset["url"].apply(lambda x: dt.datetime.strptime(get_date(x), "%Y/%m/%d"))
dataset = dataset[dataset["date"] > "2017-01-01"]
dataset["text"] = dataset["text"].apply(lambda x: x.replace("\xa0", " "))
dataset["title"] = dataset["title"].apply(lambda x: x.replace("\xa0", " "))
train_dataset = dataset[dataset["date"] < "2018-04-01"]
test_dataset = dataset[dataset["date"] > "2018-04-01"]

In [3]:
def get_texts(dataset):
    texts = []
    for text in dataset["text"]:
        for sentence in sentenize(text):
            texts.append([token.text.lower() for token in tokenize(sentence.text) if token.text not in punctuation])
    
    for title in dataset["title"]:
        texts.append([token.text.lower() for token in tokenize(title) if token.text not in punctuation])
    return texts

In [4]:
texts = get_texts(train_dataset)
test_texts = get_texts(test_dataset)

In [5]:
assert len(texts) == 827217
assert len(texts[0]) > 0
assert texts[0][0].islower()

In [6]:
from collections import Counter


class Vocabulary:
    def __init__(self):
        self.word2index = {
            "<unk>": 0
        }
        self.index2word = ["<unk>"]

    def build(self, texts, min_count=10):
        words_counter = Counter(token for tokens in texts for token in tokens)
        for word, count in words_counter.most_common():
            if count >= min_count:
                self.word2index[word] = len(self.word2index)
        self.index2word = [word for word, _ in sorted(self.word2index.items(), key=lambda x: x[1])]
    
    @property
    def size(self):
        return len(self.index2word)
    
    def top(self, n=100):
        return self.index2word[1:n+1]
    
    def get_index(self, word):
        return self.word2index.get(word, 0)
    
    def get_word(self, index):
        return self.index2word[index]

In [7]:
vocabulary = Vocabulary()
vocabulary.build(texts)

In [8]:
assert vocabulary.word2index[vocabulary.index2word[10]] == 10

In [9]:
def build_contexts(tokenized_texts, vocabulary, window_size):
    contexts = []
    for tokens in tokenized_texts:
        for i in range(len(tokens)):
            central_word = vocabulary.get_index(tokens[i])
            context = [vocabulary.get_index(tokens[i + delta]) for delta in range(-window_size, window_size + 1) 
                       if delta != 0 and i + delta >= 0 and i + delta < len(tokens)]
            if len(context) != 2 * window_size:
                continue

            contexts.append((central_word, context))
            
    return contexts

In [10]:
contexts = build_contexts(texts, vocabulary, window_size=2)

## Задание 1: Самописный CBoW

Генератор батчей: оставляем такой же как и для skip-gramm

In [11]:
import random
import numpy as np
import torch

def get_next_batch(contexts, window_size, batch_size, epochs_count):
    assert batch_size % (window_size * 2) == 0
    central_words, contexts = zip(*contexts)
    batch_size //= (window_size * 2)
    
    for epoch in range(epochs_count):
        # сколько всего контекстов
        indices = np.arange(len(contexts))
        # в каждой эпохе перемешиваем индексы
        np.random.shuffle(indices)
        batch_begin = 0
        while batch_begin < len(contexts):
            # индексы 
            batch_indices = indices[batch_begin: batch_begin + batch_size]
            batch_contexts, batch_centrals = [], []
            for data_ind in batch_indices:
                # получаем набор данных для каждого центрального слова в батче
                central_word, context = central_words[data_ind], contexts[data_ind]
                batch_contexts.extend(context)
                batch_centrals.extend([central_word] * len(context))
                
            batch_begin += batch_size
            yield torch.cuda.LongTensor(batch_contexts), torch.cuda.LongTensor(batch_centrals)

Модель: архитектура как и в skip-gramm

In [12]:
import torch.nn as nn
import torch.optim as optim 
import time

class CBoWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32):
        super().__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        projections = self.embeddings.forward(inputs)
        output = self.out_layer.forward(projections)
        return output

In [13]:
model = CBoWModel(vocabulary.size, 32)

In [14]:
device = torch.device("cuda")

In [15]:
model = model.to(device)

In [16]:
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [17]:
loss_function = nn.CrossEntropyLoss().cuda()

In [18]:
def fit_model(model, contexts, loss_function, optimizer, loss_every_nsteps=1000):
    total_loss = 0

    start_time = time.time()

    for step, (batch_contexts, batch_centrals) in enumerate(get_next_batch(contexts, window_size=2, batch_size=256, epochs_count=10)):
        logits = model(batch_contexts)
        loss = loss_function(logits, batch_centrals)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        if step != 0 and step % loss_every_nsteps == 0:
            print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / loss_every_nsteps, time.time() - start_time))
            total_loss = 0
            start_time = time.time()

In [19]:
# fit_model(model, contexts, loss_function, optimizer)

In [20]:
# embeddings = model.embeddings.weight.cpu().data.numpy()

In [21]:
!wget https://github.com/shitkov/courses/raw/master/sber_nlp_course/embeddings_cbow_v0.npy

--2022-02-22 12:32:48--  https://github.com/shitkov/courses/raw/master/sber_nlp_course/embeddings_cbow_v0.npy
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/shitkov/courses/master/sber_nlp_course/embeddings_cbow_v0.npy [following]
--2022-02-22 12:32:49--  https://raw.githubusercontent.com/shitkov/courses/master/sber_nlp_course/embeddings_cbow_v0.npy
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9111936 (8.7M) [application/octet-stream]
Saving to: ‘embeddings_cbow_v0.npy’


2022-02-22 12:32:49 (138 MB/s) - ‘embeddings_cbow_v0.npy’ saved [9111936/9111936]



In [22]:
import numpy as np
embeddings = np.load('/content/embeddings_cbow_v0.npy')

In [23]:
embeddings.shape

(71186, 32)

## Базовые проверки

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def most_similar(embeddings, vocabulary, word):
    word_emb = embeddings[vocabulary.get_index(word)]
    
    similarities = cosine_similarity([word_emb], embeddings)[0]
    top10 = np.argsort(similarities)[-10:]
    
    return [vocabulary.get_word(index) for index in reversed(top10)]

most_similar(embeddings, vocabulary, 'путин')

['путин',
 'омелян',
 'мединский',
 'чижов',
 'кожин',
 'жириновский',
 'колычев',
 'городецкий',
 'президент',
 'тюрин']

In [25]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    output_notebook()
    
    if isinstance(color, str): 
        color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: 
        pl.show(fig)
    return fig


def get_tsne_projection(word_vectors):
    tsne = TSNE(n_components=2)
    return scale(tsne.fit_transform(word_vectors))

def get_pca_projection(word_vectors):
    pca = PCA(n_components=2)
    return scale(pca.fit_transform(word_vectors))
    
    
def visualize_embeddings(embeddings, vocabulary, word_count, method="pca"):
    word_vectors = embeddings[1: word_count + 1]
    words = vocabulary.top(word_count)
    get_projections = get_pca_projection if method == "pca" else get_tsne_projection
    projections = get_projections(word_vectors)
    draw_vectors(projections[:, 0], projections[:, 1], color='green', token=words)

In [26]:
visualize_embeddings(embeddings, vocabulary, 500, method="tsne")

  "Numerical issues were encountered "
  "Numerical issues were encountered "


## Задача рубрикации

In [27]:
def get_text_embedding(embeddings, vocabulary, phrase):
    embeddings = np.array([embeddings[vocabulary.get_index(word.text.lower())] for word in tokenize(phrase)])
    return np.mean(embeddings, axis=0)

target_labels = set(train_dataset["topic"].dropna().tolist())
target_labels -= {"69-я параллель", "Крым", "Культпросвет ", "Оружие", "Бизнес", "Путешествия"}
target_labels = list(target_labels)
print(target_labels)

pattern = r'(\b{}\b)'.format('|'.join(target_labels))

train_with_topics = train_dataset[train_dataset["topic"].str.contains(pattern, case=False, na=False)]
train_with_topics = train_with_topics.head(20000)

test_with_topics = test_dataset[test_dataset["topic"].str.contains(pattern, case=False, na=False)]

y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], embeddings.shape[1]))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(embeddings, vocabulary, embedding)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], embeddings.shape[1]))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(embeddings, vocabulary, embedding)

print(X_train.shape)
print(y_train)

  if sys.path[0] == '':
  from ipykernel import kernelapp as app


['Дом', 'Россия', 'Ценности', 'Силовые структуры', 'Спорт', 'Культура', 'Мир', 'Наука и техника', 'Экономика', 'Бывший СССР', 'Из жизни', 'Интернет и СМИ']
(20000, 32)
[ 6  6  5 ... 11  8  2]


In [28]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

clf = MLPClassifier()
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.67      0.72      0.70      1182
           1       0.63      0.63      0.63      4324
           2       0.82      0.76      0.79      1177
           3       0.66      0.63      0.64      1663
           4       0.92      0.91      0.92      3429
           5       0.78      0.67      0.72      1995
           6       0.72      0.80      0.76      4291
           7       0.84      0.85      0.84      2119
           8       0.73      0.80      0.77      3185
           9       0.69      0.62      0.65      2156
          10       0.74      0.72      0.73      2191
          11       0.66      0.60      0.63      2447

    accuracy                           0.74     30159
   macro avg       0.74      0.73      0.73     30159
weighted avg       0.74      0.74      0.74     30159





## Задание 2: Negative Sampling

* 0) 1 - слова из контекста, 0 - случайные слова из словаря согласно unigram распределению в степени alpha, alpha=0.75
* 1) Linear -> Embedding
* 2) Second embedding layer apply to context word
* 3) Dot product emb1 and emb2 -> scalar (а раньше был * вектор размерности словая)
* 4) CrossEntropyLoss -> BCELoss
* 5) Triplet loss: (pivot, positive, negative): pivot * positive - pivot * negative


Реализуйте negative sampling вместо полного softmax'а

### Get distribution

In [29]:
words_list = [token for tokens in texts for token in tokens if token in vocabulary.word2index.keys()]

In [30]:
cntr = Counter(words_list)

In [31]:
words = cntr.most_common()

In [32]:
from tqdm import tqdm
unigram_dict = {}
for word, qnt in tqdm(words):
    unigram_dict[word] = float(qnt/len(words_list))

100%|██████████| 71185/71185 [00:00<00:00, 1079282.13it/s]


In [33]:
noise_dist = {key: val ** (3/4) for key, val in unigram_dict.items()}

In [34]:
Z = sum(noise_dist.values())

In [35]:
noise_dist_normalized = {key: val / Z for key, val in noise_dist.items()}

In [36]:
noise_dist_normalized['<unk>'] = 0

In [37]:
index2prob = {}
for word in vocabulary.word2index.keys():
    ind = vocabulary.word2index[word]
    index2prob[ind] = noise_dist_normalized[word]

### Batch generator

In [38]:
import random
import numpy as np
import torch

def get_next_batch(contexts, index2prob, window_size, batch_size, epochs_count):
    assert batch_size % (window_size * 2 * 2) == 0
    central_words, contexts = zip(*contexts)
    batch_size //= (window_size * 2 * 2)
    
    for epoch in range(epochs_count):
        # сколько всего контекстов
        indices = np.arange(len(contexts))
        # в каждой эпохе перемешиваем индексы
        np.random.shuffle(indices)
        batch_begin = 0
        while batch_begin < len(contexts):
            # индексы 
            batch_indices = indices[batch_begin: batch_begin + batch_size]
            batch_contexts, batch_centrals, batch_labels = [], [], []
            for data_ind in batch_indices:
                # получаем набор данных для каждого центрального слова в батче
                central_word, context = central_words[data_ind], contexts[data_ind]
                neg_semples = np.random.choice(list(index2prob.keys()), size=len(context), p=list(index2prob.values()))
                batch_contexts.extend(context)
                batch_contexts.extend(neg_semples)
                batch_centrals.extend([central_word] * len(context) * 2)
                batch_labels.extend([1] * len(context))
                batch_labels.extend([0] * len(context))

            batch_begin += batch_size
            out = {
                'centrals': torch.cuda.LongTensor(batch_centrals),
                'contexts': torch.cuda.LongTensor(batch_contexts),
                'labels': torch.cuda.FloatTensor(batch_labels)
            }
            yield out

### Model

In [39]:
import torch.nn as nn
import torch.optim as optim 
import time

class NegSempModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32):
        super().__init__()
        
        self.embeddings_word = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        projections_word = self.embeddings_word.forward(inputs['centrals'])
        projections_contexts = self.embeddings_context.forward(inputs['contexts'])
        output = torch.einsum('bs,bs->b', projections_word, projections_contexts)
        output = self.sigmoid(output)
        return torch.cuda.FloatTensor(output)

In [40]:
model = NegSempModel(vocabulary.size, 32)

In [41]:
device = torch.device("cuda")

In [42]:
model = model.to(device)

In [43]:
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [44]:
loss_function = nn.BCELoss().cuda()

In [45]:
from tqdm import tqdm
def fit_model(model, contexts, index2prob, loss_function, optimizer, loss_every_nsteps=1000):
    total_loss = 0

    start_time = time.time()

    for step, inputs in tqdm(enumerate(get_next_batch(contexts, index2prob, window_size=2, batch_size=256, epochs_count=1))):
        logits = model(inputs)
        loss = loss_function(logits, inputs['labels'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        if step != 0 and step % loss_every_nsteps == 0:
            print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / loss_every_nsteps, time.time() - start_time))
            total_loss = 0
            start_time = time.time()

In [46]:
fit_model(model, contexts, index2prob, loss_function, optimizer)

Step = 1000, Avg Loss = 1.8214, Time = 278.44s
Step = 2000, Avg Loss = 1.4198, Time = 270.30s
Step = 3000, Avg Loss = 1.2960, Time = 271.21s
Step = 4000, Avg Loss = 1.2203, Time = 269.81s
Step = 5000, Avg Loss = 1.1639, Time = 269.66s
Step = 6000, Avg Loss = 1.1323, Time = 269.92s
Step = 7000, Avg Loss = 1.1077, Time = 271.70s
Step = 8000, Avg Loss = 1.0701, Time = 269.16s
Step = 9000, Avg Loss = 1.0582, Time = 268.59s
Step = 10000, Avg Loss = 1.0332, Time = 268.27s
Step = 11000, Avg Loss = 1.0265, Time = 270.23s
Step = 12000, Avg Loss = 1.0116, Time = 277.07s
Step = 13000, Avg Loss = 0.9882, Time = 268.91s
Step = 14000, Avg Loss = 0.9896, Time = 269.09s
Step = 15000, Avg Loss = 0.9834, Time = 268.62s
Step = 16000, Avg Loss = 0.9660, Time = 268.65s
Step = 17000, Avg Loss = 0.9653, Time = 268.68s
Step = 18000, Avg Loss = 0.9589, Time = 269.16s
Step = 19000, Avg Loss = 0.9565, Time = 269.59s
Step = 20000, Avg Loss = 0.9497, Time = 269.00s
Step = 21000, Avg Loss = 0.9414, Time = 269.55s
S

In [47]:
embeddings = model.embeddings_word.weight.cpu().data.numpy()

In [48]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def most_similar(embeddings, vocabulary, word):
    word_emb = embeddings[vocabulary.get_index(word)]
    
    similarities = cosine_similarity([word_emb], embeddings)[0]
    top10 = np.argsort(similarities)[-10:]
    
    return [vocabulary.get_word(index) for index in reversed(top10)]

most_similar(embeddings, vocabulary, 'путин')

['путин',
 'пообещал',
 'нурсултан',
 'мединский',
 'премьер',
 'путину',
 'осин',
 'станислав',
 'онищенко',
 'призвал']

In [49]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    output_notebook()
    
    if isinstance(color, str): 
        color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: 
        pl.show(fig)
    return fig


def get_tsne_projection(word_vectors):
    tsne = TSNE(n_components=2)
    return scale(tsne.fit_transform(word_vectors))

def get_pca_projection(word_vectors):
    pca = PCA(n_components=2)
    return scale(pca.fit_transform(word_vectors))
    
    
def visualize_embeddings(embeddings, vocabulary, word_count, method="pca"):
    word_vectors = embeddings[1: word_count + 1]
    words = vocabulary.top(word_count)
    get_projections = get_pca_projection if method == "pca" else get_tsne_projection
    projections = get_projections(word_vectors)
    draw_vectors(projections[:, 0], projections[:, 1], color='green', token=words)

In [50]:
visualize_embeddings(embeddings, vocabulary, 500, method="tsne")

  "Numerical issues were encountered "


## Задача рубрикации

In [51]:
def get_text_embedding(embeddings, vocabulary, phrase):
    embeddings = np.array([embeddings[vocabulary.get_index(word.text.lower())] for word in tokenize(phrase)])
    return np.mean(embeddings, axis=0)

target_labels = set(train_dataset["topic"].dropna().tolist())
target_labels -= {"69-я параллель", "Крым", "Культпросвет ", "Оружие", "Бизнес", "Путешествия"}
target_labels = list(target_labels)
print(target_labels)

pattern = r'(\b{}\b)'.format('|'.join(target_labels))

train_with_topics = train_dataset[train_dataset["topic"].str.contains(pattern, case=False, na=False)]
train_with_topics = train_with_topics.head(20000)

test_with_topics = test_dataset[test_dataset["topic"].str.contains(pattern, case=False, na=False)]

y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], embeddings.shape[1]))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(embeddings, vocabulary, embedding)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], embeddings.shape[1]))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(embeddings, vocabulary, embedding)

print(X_train.shape)
print(y_train)

  if sys.path[0] == '':


['Дом', 'Россия', 'Ценности', 'Силовые структуры', 'Спорт', 'Культура', 'Мир', 'Наука и техника', 'Экономика', 'Бывший СССР', 'Из жизни', 'Интернет и СМИ']


  from ipykernel import kernelapp as app


(20000, 32)
[ 6  6  5 ... 11  8  2]


In [52]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

clf = MLPClassifier()
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.56      0.62      0.59      1182
           1       0.54      0.58      0.56      4324
           2       0.79      0.74      0.77      1177
           3       0.63      0.57      0.60      1663
           4       0.91      0.91      0.91      3429
           5       0.70      0.66      0.68      1995
           6       0.63      0.70      0.66      4291
           7       0.81      0.81      0.81      2119
           8       0.69      0.75      0.72      3185
           9       0.59      0.44      0.51      2156
          10       0.70      0.69      0.69      2191
          11       0.63      0.55      0.59      2447

    accuracy                           0.68     30159
   macro avg       0.68      0.67      0.67     30159
weighted avg       0.68      0.68      0.67     30159





### Triplet Loss

https://aegis4048.github.io/optimize_computational_efficiency_of_skip-gram_with_negative_sampling

In [None]:
%%capture
!pip install razdel
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!gzip -d lenta-ru-news.csv.gz
!head -n 2 lenta-ru-news.csv

In [None]:
import pandas as pd
import re
import datetime as dt
from razdel import tokenize, sentenize
from string import punctuation

def get_date(url):
    dates = re.findall(r"\d\d\d\d\/\d\d\/\d\d", url)
    return next(iter(dates), None)

dataset = pd.read_csv("lenta-ru-news.csv", sep=',', quotechar='\"', escapechar='\\', encoding='utf-8', header=0)
dataset["date"] = dataset["url"].apply(lambda x: dt.datetime.strptime(get_date(x), "%Y/%m/%d"))
dataset = dataset[dataset["date"] > "2017-01-01"]
dataset["text"] = dataset["text"].apply(lambda x: x.replace("\xa0", " "))
dataset["title"] = dataset["title"].apply(lambda x: x.replace("\xa0", " "))
train_dataset = dataset[dataset["date"] < "2018-04-01"]
test_dataset = dataset[dataset["date"] > "2018-04-01"]

In [None]:
def get_texts(dataset):
    texts = []
    for text in dataset["text"]:
        for sentence in sentenize(text):
            texts.append([token.text.lower() for token in tokenize(sentence.text) if token.text not in punctuation])
    
    for title in dataset["title"]:
        texts.append([token.text.lower() for token in tokenize(title) if token.text not in punctuation])
    return texts

In [None]:
texts = get_texts(train_dataset)
test_texts = get_texts(test_dataset)

In [None]:
assert len(texts) == 827217
assert len(texts[0]) > 0
assert texts[0][0].islower()

In [None]:
from collections import Counter


class Vocabulary:
    def __init__(self):
        self.word2index = {
            "<unk>": 0
        }
        self.index2word = ["<unk>"]

    def build(self, texts, min_count=10):
        words_counter = Counter(token for tokens in texts for token in tokens)
        for word, count in words_counter.most_common():
            if count >= min_count:
                self.word2index[word] = len(self.word2index)
        self.index2word = [word for word, _ in sorted(self.word2index.items(), key=lambda x: x[1])]
    
    @property
    def size(self):
        return len(self.index2word)
    
    def top(self, n=100):
        return self.index2word[1:n+1]
    
    def get_index(self, word):
        return self.word2index.get(word, 0)
    
    def get_word(self, index):
        return self.index2word[index]

In [None]:
vocabulary = Vocabulary()
vocabulary.build(texts)

In [None]:
assert vocabulary.word2index[vocabulary.index2word[10]] == 10

In [None]:
def build_contexts(tokenized_texts, vocabulary, window_size):
    contexts = []
    for tokens in tokenized_texts:
        for i in range(len(tokens)):
            central_word = vocabulary.get_index(tokens[i])
            context = [vocabulary.get_index(tokens[i + delta]) for delta in range(-window_size, window_size + 1) 
                       if delta != 0 and i + delta >= 0 and i + delta < len(tokens)]
            if len(context) != 2 * window_size:
                continue

            contexts.append((central_word, context))
            
    return contexts

In [None]:
contexts = build_contexts(texts, vocabulary, window_size=2)

### Get distribution

In [None]:
words_list = [token for tokens in texts for token in tokens if token in vocabulary.word2index.keys()]

In [None]:
cntr = Counter(words_list)

In [None]:
words = cntr.most_common()

In [None]:
from tqdm import tqdm
unigram_dict = {}
for word, qnt in tqdm(words):
    unigram_dict[word] = float(qnt/len(words_list))

100%|██████████| 71185/71185 [00:00<00:00, 943148.35it/s]


In [None]:
noise_dist = {key: val ** (3/4) for key, val in unigram_dict.items()}

In [None]:
Z = sum(noise_dist.values())

In [None]:
noise_dist_normalized = {key: val / Z for key, val in noise_dist.items()}

In [None]:
noise_dist_normalized['<unk>'] = 0

In [None]:
index2prob = {}
for word in vocabulary.word2index.keys():
    ind = vocabulary.word2index[word]
    index2prob[ind] = noise_dist_normalized[word]

### Batch generator

In [None]:
import random
import numpy as np
import torch

def get_next_batch(contexts, index2prob, window_size, batch_size, epochs_count):
    assert batch_size % (window_size * 2) == 0
    central_words, contexts = zip(*contexts)
    batch_size //= (window_size * 2)
    
    for epoch in range(epochs_count):
        # сколько всего контекстов
        indices = np.arange(len(contexts))
        # в каждой эпохе перемешиваем индексы
        np.random.shuffle(indices)
        batch_begin = 0
        while batch_begin < len(contexts):
            # индексы 
            batch_indices = indices[batch_begin: batch_begin + batch_size]
            batch_pivot, batch_positive, batch_negative = [], [], []
            for data_ind in batch_indices:
                # получаем набор данных для каждого центрального слова в батче
                central_word, context = central_words[data_ind], contexts[data_ind]
                neg_semples = np.random.choice(list(index2prob.keys()), size=len(context), p=list(index2prob.values()))
                batch_pivot.extend([central_word] * len(context))
                batch_positive.extend(context)
                batch_negative.extend(neg_semples)

            batch_begin += batch_size
            out = {
                'pivot': torch.cuda.LongTensor(batch_pivot),
                'positive': torch.cuda.LongTensor(batch_positive),
                'negative': torch.cuda.LongTensor(batch_negative)
            }
            yield out

### Model: Triplet loss: (pivot, positive, negative): pivot * positive - pivot * negative

In [None]:
import torch.nn as nn
import torch.optim as optim 
import time

class TripletLossModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32):
        super().__init__()
        
        self.pivot = nn.Embedding(vocab_size, embedding_dim)
        self.positive = nn.Embedding(vocab_size, embedding_dim)
        self.negative = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, inputs):
        pivot = self.pivot.forward(inputs['pivot'])
        positive = self.positive.forward(inputs['positive'])
        negative = self.negative.forward(inputs['negative'])
        return pivot, positive, negative

In [None]:
model = TripletLossModel(vocabulary.size, 32)

In [None]:
device = torch.device("cuda")

In [None]:
model = model.to(device)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
class TripletLoss(nn.Module):

    def __init__(self):
        super(TripletLoss, self).__init__()

    def forward(self, outputs):
        pivot = outputs[0]
        positive = outputs[1]
        negative = outputs[2]
        loss = torch.einsum('bs,bs->b', pivot, positive) - torch.einsum('bs,bs->b', pivot, negative)
        return loss.sum()

In [None]:
loss_function = TripletLoss().cuda()

In [None]:
from tqdm import tqdm
def fit_model(model, contexts, index2prob, optimizer, loss_function, loss_every_nsteps=1000):
    total_loss = 0

    start_time = time.time()

    for step, inputs in enumerate(get_next_batch(contexts, index2prob, window_size=2, batch_size=256, epochs_count=1)):
        outputs = model(inputs)
        loss = loss_function(outputs)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        if step != 0 and step % loss_every_nsteps == 0:
            print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / loss_every_nsteps, time.time() - start_time))
            total_loss = 0
            start_time = time.time()

In [None]:
fit_model(model, contexts, index2prob, optimizer, loss_function)

Step = 1000, Avg Loss = -96401.9371, Time = 555.42s
Step = 2000, Avg Loss = -525037.4133, Time = 547.47s
Step = 3000, Avg Loss = -1219996.7159, Time = 541.64s
Step = 4000, Avg Loss = -2156766.6298, Time = 541.07s
Step = 5000, Avg Loss = -3289714.3443, Time = 540.30s
Step = 6000, Avg Loss = -4578881.3175, Time = 543.31s
Step = 7000, Avg Loss = -6033470.2600, Time = 542.27s
Step = 8000, Avg Loss = -7841358.4650, Time = 542.86s
Step = 9000, Avg Loss = -9516923.7075, Time = 544.80s
Step = 10000, Avg Loss = -11780094.8565, Time = 543.50s
Step = 11000, Avg Loss = -13942309.7185, Time = 542.90s
Step = 12000, Avg Loss = -16474594.6960, Time = 542.84s
Step = 13000, Avg Loss = -18955930.3010, Time = 546.62s
Step = 14000, Avg Loss = -21407115.9290, Time = 546.03s
Step = 15000, Avg Loss = -24283883.1990, Time = 539.63s
Step = 16000, Avg Loss = -27342516.8030, Time = 540.23s
Step = 17000, Avg Loss = -30729220.1760, Time = 540.31s
Step = 18000, Avg Loss = -34013342.9180, Time = 540.90s
Step = 19000,

KeyboardInterrupt: ignored

In [None]:
embeddings = model.pivot.weight.cpu().data.numpy()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def most_similar(embeddings, vocabulary, word):
    word_emb = embeddings[vocabulary.get_index(word)]
    
    similarities = cosine_similarity([word_emb], embeddings)[0]
    top10 = np.argsort(similarities)[-10:]
    
    return [vocabulary.get_word(index) for index in reversed(top10)]

most_similar(embeddings, vocabulary, 'путин')

['путин', 'было', 'в', 'пятницу', 'он', 'и', 'россии', 'как', '—', 'еще']

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    output_notebook()
    
    if isinstance(color, str): 
        color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: 
        pl.show(fig)
    return fig


def get_tsne_projection(word_vectors):
    tsne = TSNE(n_components=2)
    return scale(tsne.fit_transform(word_vectors))

def get_pca_projection(word_vectors):
    pca = PCA(n_components=2)
    return scale(pca.fit_transform(word_vectors))
    
    
def visualize_embeddings(embeddings, vocabulary, word_count, method="pca"):
    word_vectors = embeddings[1: word_count + 1]
    words = vocabulary.top(word_count)
    get_projections = get_pca_projection if method == "pca" else get_tsne_projection
    projections = get_projections(word_vectors)
    draw_vectors(projections[:, 0], projections[:, 1], color='green', token=words)

In [None]:
visualize_embeddings(embeddings, vocabulary, 500, method="tsne")

  "Numerical issues were encountered "
  "Numerical issues were encountered "


## Задача рубрикации

In [None]:
def get_text_embedding(embeddings, vocabulary, phrase):
    embeddings = np.array([embeddings[vocabulary.get_index(word.text.lower())] for word in tokenize(phrase)])
    return np.mean(embeddings, axis=0)

target_labels = set(train_dataset["topic"].dropna().tolist())
target_labels -= {"69-я параллель", "Крым", "Культпросвет ", "Оружие", "Бизнес", "Путешествия"}
target_labels = list(target_labels)
print(target_labels)

pattern = r'(\b{}\b)'.format('|'.join(target_labels))

train_with_topics = train_dataset[train_dataset["topic"].str.contains(pattern, case=False, na=False)]
train_with_topics = train_with_topics.head(20000)

test_with_topics = test_dataset[test_dataset["topic"].str.contains(pattern, case=False, na=False)]

y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], embeddings.shape[1]))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(embeddings, vocabulary, embedding)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], embeddings.shape[1]))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(embeddings, vocabulary, embedding)

print(X_train.shape)
print(y_train)

  if sys.path[0] == '':
  from ipykernel import kernelapp as app


['Культура', 'Россия', 'Мир', 'Экономика', 'Ценности', 'Интернет и СМИ', 'Силовые структуры', 'Спорт', 'Дом', 'Из жизни', 'Наука и техника', 'Бывший СССР']
(20000, 32)
[2 2 0 ... 5 3 4]


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

clf = MLPClassifier()
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1995
           1       0.00      0.00      0.00      4324
           2       0.00      0.00      0.00      4291
           3       0.00      0.00      0.00      3185
           4       0.00      0.00      0.00      1177
           5       0.00      0.00      0.00      2447
           6       0.00      0.00      0.00      1663
           7       0.00      0.00      0.00      3429
           8       0.00      0.00      0.00      1182
           9       0.00      0.00      0.00      2191
          10       0.07      1.00      0.13      2119
          11       0.00      0.00      0.00      2156

    accuracy                           0.07     30159
   macro avg       0.01      0.08      0.01     30159
weighted avg       0.00      0.07      0.01     30159



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Задание 3: Рубрикация ELMO/etc.

In [None]:
%%capture
!pip install razdel
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!gzip -d lenta-ru-news.csv.gz
!head -n 2 lenta-ru-news.csv

In [None]:
%%capture
!pip install transformers sentencepiece

In [None]:
import pandas as pd
import re
import datetime as dt
from razdel import tokenize, sentenize
from string import punctuation

def get_date(url):
    dates = re.findall(r"\d\d\d\d\/\d\d\/\d\d", url)
    return next(iter(dates), None)

dataset = pd.read_csv("lenta-ru-news.csv", sep=',', quotechar='\"', escapechar='\\', encoding='utf-8', header=0)
dataset["date"] = dataset["url"].apply(lambda x: dt.datetime.strptime(get_date(x), "%Y/%m/%d"))
dataset = dataset[dataset["date"] > "2017-01-01"]
dataset["text"] = dataset["text"].apply(lambda x: x.replace("\xa0", " "))
dataset["title"] = dataset["title"].apply(lambda x: x.replace("\xa0", " "))
train_dataset = dataset[dataset["date"] < "2018-04-01"]
test_dataset = dataset[dataset["date"] > "2018-04-01"]

In [None]:
def get_texts(dataset):
    texts = []
    for text in dataset["text"]:
        for sentence in sentenize(text):
            texts.append([token.text.lower() for token in tokenize(sentence.text) if token.text not in punctuation])
    
    for title in dataset["title"]:
        texts.append([token.text.lower() for token in tokenize(title) if token.text not in punctuation])
    return texts

In [None]:
texts = get_texts(train_dataset)
test_texts = get_texts(test_dataset)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
model.to(device)

In [None]:
def get_text_embedding(tokenizer, model, texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt').to(model.device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings).cpu().detach().numpy()
    return embeddings

In [None]:
target_labels = set(train_dataset["topic"].dropna().tolist())
target_labels -= {"69-я параллель", "Крым", "Культпросвет ", "Оружие", "Бизнес", "Путешествия"}
target_labels = list(target_labels)
print(target_labels)

['Из жизни', 'Культура', 'Экономика', 'Интернет и СМИ', 'Мир', 'Дом', 'Ценности', 'Бывший СССР', 'Россия', 'Силовые структуры', 'Спорт', 'Наука и техника']


In [None]:
pattern = r'(\b{}\b)'.format('|'.join(target_labels))

In [None]:
train_with_topics = train_dataset[train_dataset["topic"].str.contains(pattern, case=False, na=False)]
train_with_topics = train_with_topics.head(20000)

test_with_topics = test_dataset[test_dataset["topic"].str.contains(pattern, case=False, na=False)]

  """Entry point for launching an IPython kernel.
  after removing the cwd from sys.path.


In [None]:
y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], 768))

In [None]:
for i, text in tqdm(enumerate(train_with_topics["text"])):
    X_train[i, :] = get_text_embedding(tokenizer, model, text)

20000it [07:23, 45.13it/s]


In [None]:
y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], 768))

In [None]:
for i, text in tqdm(enumerate(test_with_topics["text"])):
    X_test[i, :] = get_text_embedding(tokenizer, model, text)

print(X_train.shape)
print(y_train)

30159it [11:33, 43.50it/s]

(20000, 768)
[4 4 1 ... 3 2 6]





In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

clf = MLPClassifier()
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))



              precision    recall  f1-score   support

           0       0.83      0.73      0.77      2191
           1       0.85      0.85      0.85      1995
           2       0.82      0.82      0.82      3185
           3       0.74      0.70      0.72      2447
           4       0.77      0.85      0.81      4291
           5       0.78      0.82      0.80      1182
           6       0.85      0.82      0.84      1177
           7       0.77      0.74      0.75      2156
           8       0.70      0.73      0.72      4324
           9       0.72      0.64      0.68      1663
          10       0.93      0.95      0.94      3429
          11       0.86      0.87      0.87      2119

    accuracy                           0.80     30159
   macro avg       0.80      0.79      0.80     30159
weighted avg       0.80      0.80      0.80     30159

