In [4]:
import torch
from torch import nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import torch.functional as F
import torch.nn.functional as F
import pymorphy2
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import regex as re
from unicodedata import normalize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
morph = pymorphy2.MorphAnalyzer()
stopwords_vocabulary = stopwords.words("russian")

In [75]:
df = pd.read_excel('./sample_data/people_dataset.xlsx')[:100]
df.head()

Unnamed: 0,url,title,text,topic,tags,date
0,https://lenta.ru/news/2013/01/12/spears/,Бритни Спирс расторгла помолвку и ушла из жюри...,Певица Бритни Спирс объявила о расторжении пом...,Из жизни,Люди,2013/01/12
1,https://lenta.ru/news/2013/01/13/miss/,"Титул ""Мисс Америка"" выиграла уроженка Бруклина",Уроженка нью-йоркского района Бруклин завоевал...,Из жизни,Люди,2013/01/13
2,https://lenta.ru/news/2013/01/14/foster/,Джоди Фостер рассказала о своей нетрадиционной...,Актриса Джоди Фостер впервые публично затронул...,Из жизни,Люди,2013/01/14
3,https://lenta.ru/news/2013/01/14/shark/,Киевляне вступились за акулу из торгового центра,В Киеве прошла акция протеста против администр...,Из жизни,Люди,2013/01/14
4,https://lenta.ru/news/2013/01/14/engaged/,Оливия Уайлд собралась замуж,Актриса Оливия Уайлд обручилась со своим возлю...,Из жизни,Люди,2013/01/14


In [76]:
df['title'] = df['title'].apply(lambda x: normalize('NFKD', x)).apply(lambda x: re.sub(r"\p{P}+", "", x)).apply(lambda x: re.sub(r'[^\w\s]+|[\d]+', r'',x).strip())
corpus = list(df.title)

In [77]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    tokens = [[morph.parse(token)[0].normal_form for token in sentence if morph.parse(token)[0].normal_form not in stopwords_vocabulary] for sentence in tokens]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['бритни', 'спирс', 'расторгнуть', 'помолвка', 'уйти', 'жюри', 'шоу', 'x', 'factor'], ['титул', 'мисс', 'америка', 'выиграть', 'уроженка', 'бруклин'], ['джодить', 'фостер', 'рассказать', 'своей', 'нетрадиционной', 'сексуальной', 'ориентация'], ['киевлянин', 'вступиться', 'акула', 'торговый', 'центр'], ['оливия', 'уайлд', 'собраться', 'замуж'], ['принц', 'уильям', 'кейт', 'миддлтон', 'стать', 'родитель', 'июль'], ['китайский', 'детский', 'сад', 'поженить', 'детей'], ['библиотека', 'айов', 'запретить', 'спать'], ['чарли', 'шина', 'стать', 'дед'], ['легендарный', 'австралийский', 'преступник', 'похоронить', 'спустя', 'год', 'казнь'], ['элтон', 'джон', 'подтвердить', 'рождение', 'второй', 'сын'], ['gq', 'включить', 'анна', 'чапман', 'список', 'самый', 'сексуальный', 'женщина', 'век'], ['составить', 'рейтинг', 'самый', 'завидный', 'холостяк', 'мир'], ['полуголый', 'продавщица', 'платье', 'стать', 'звездой', 'сеть'], ['пятилетний', 'американка', 'наказать', 'терроризм'], ['летн

In [78]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [79]:
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [80]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [81]:
class MyModule(nn.Module):
    def __init__(self, embedding_dims, vocabulary_size):
        super().__init__()
        self.f_lin = nn.Linear(vocabulary_size, embedding_dims)
        self.s_lin = nn.Linear(embedding_dims, vocabulary_size)
        
    def forward(self, X):
        X = self.s_lin(self.f_lin(X))
        return F.log_softmax(X, dim=0)

In [82]:
embedding_dims = 5
model = MyModule(embedding_dims=embedding_dims, vocabulary_size=vocabulary_size)
loss_fn = nn.NLLLoss()
optim = torch.optim.Adam(model.parameters(), 0.001)

In [83]:
 # row
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 200
# learning_rate = 0.001
# learning_rate = 0.001
loss_values = []
for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        # x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        y_pred = model(x)
        loss = loss_fn(y_pred.view(1, -1), y_true)
        loss_val += loss.data.item()

        optim.zero_grad()
        loss.backward()
        optim.step()

        # # forward
        # y_pred = model(X)

        # # loss
        # loss = loss_fn(y_pred, y)
        # losses.append(loss.item())


        # z1 = torch.matmul(W1, x)
        # z2 = torch.matmul(W2, z1)
    
        # log_softmax = F.log_softmax(z2, dim=0)
        # # print(log_softmax.view(1,-1))
        # loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        # loss_val += loss.data.item()
        # loss.backward()
        # W1.data -= learning_rate * W1.grad.data
        # W2.data -= learning_rate * W2.grad.data

        # W1.grad.data.zero_()
        # W2.grad.data.zero_()
    if epo % 1 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')
        loss_values.append(loss_val/len(idx_pairs))

Loss at epo 0: 6.421638150688683
Loss at epo 1: 6.19831567438315
Loss at epo 2: 6.120722588160297
Loss at epo 3: 6.025962739012808
Loss at epo 4: 5.939451291434484
Loss at epo 5: 5.852790402072131
Loss at epo 6: 5.7593476735141484
Loss at epo 7: 5.657850109402082
Loss at epo 8: 5.551004824544761
Loss at epo 9: 5.440859413725124
Loss at epo 10: 5.328965988775875
Loss at epo 11: 5.218053213167962
Loss at epo 12: 5.110161939201399
Loss at epo 13: 5.0058574046053455
Loss at epo 14: 4.905633791489634
Loss at epo 15: 4.809701596754528
Loss at epo 16: 4.717679357900355
Loss at epo 17: 4.629429951069262
Loss at epo 18: 4.544770481198269
Loss at epo 19: 4.463594709202544
Loss at epo 20: 4.38575958854607
Loss at epo 21: 4.311151708865551
Loss at epo 22: 4.239625776543628
Loss at epo 23: 4.171035947136185
Loss at epo 24: 4.105265798662331
Loss at epo 25: 4.042183432994759
Loss at epo 26: 3.981738445252249
Loss at epo 27: 3.9238252105668843
Loss at epo 28: 3.86839567713319
Loss at epo 29: 3.815340

In [61]:
import plotly.graph_objects as go

In [84]:
x = np.arange(0, len(loss_values))
fig = go.Figure(data=go.Scatter(x=x, y=loss_values))
fig.show()

In [97]:
def get_top_grams(word, n=5):
  word_normal = morph.parse(word)[0].normal_form
  x = Variable(get_input_layer(word2idx[word_normal])).float()
  z2 = model(x)
  df = pd.DataFrame({'input_word': word}, index=np.arange(0, n))
  df['target_word'] = [idx2word[x.data.item()] for x in torch.topk(z2, n)[1]]
  df['target_values'] = [x.data.item() for x in torch.topk(z2, n)[0]]
  num = df['target_values'].min() * (-1)
  df['target_values'] = df['target_values'].apply(lambda x: (num - (x * -1)) * 10 + 1)
  return df

In [99]:
import seaborn as sns
cm = sns.light_palette("blue", as_cmap=True)

In [105]:
word = 'Америка'
predicted_df = get_top_grams(word, 6)
predicted_df.style.background_gradient(cm, axis=0)

Unnamed: 0,input_word,target_word,target_values
0,Америка,мисс,15.82707
1,Америка,выиграть,15.818716
2,Америка,америка,14.186476
3,Америка,уроженка,8.20515
4,Америка,титул,4.83872
5,Америка,бруклин,1.0
