In [1]:
from data4ml import Data4TextGeneration
from time import time
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time, sleep

%load_ext autoreload
%autoreload 2


import numpy as np
from collections import Counter
import os
from argparse import Namespace


flags = Namespace(
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)


In [2]:
data4gen = Data4TextGeneration(path_to_config='./../data_params.json')
data4gen.home_folder = './../../messages'
res = data4gen.make_data(limit=2)

100%|██████████| 420/420 [08:37<00:00,  1.23s/it]


In [6]:
import re

def make_first_char_upper(sent: list):
    sent[0] = sent[0][0].upper() + sent[0][1:]
    
def add_dot_in_the_end(sent: list):
    sent[-1] = sent[-1] + '.' if sent[-1][-1].isalpha() else sent[-1]
    
def tokenize(corpus: list) -> list:
    tokenize_messages = []
    for dialog in corpus:
        for message in dialog:
            sentence = [re.sub("[.,:@-]", "", i) for i in message.lower().split() if len(re.sub("[.,:@-]", "", i)) > 0]
            if sentence:
                make_first_char_upper(sentence)
                add_dot_in_the_end(sentence)
                tokenize_messages.extend(sentence)
    return tokenize_messages

In [2]:
def get_data_from_file(text, batch_size, seq_size):

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [4]:
def save_text(text: list):
    with open('text.txt', 'w') as f:
        f.write(' '.join(text))

def read_text(file='text.txt') -> list:
    with open('text.txt', 'r') as f:
        text = f.read().split()
        return text

In [5]:
text = read_text()
print(text[30:50])

['будешь', 'дома.', 'Чтобы', 'я', 'тебе', 'всё', 'передал.', 'Окей.', 'Ты', 'завтра', 'работать', 'не', 'планируешь', 'что', 'ли?', 'Мы', 'же', 'должны', 'были', 'сегодня']


In [12]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]


class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state

    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))


def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer


def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()
    words = ['Привет', 'как']

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])

    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])
    
    print('сейчас что-то будет...')
    print(' '.join(words))
    print('было...')


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
    text, flags.batch_size, flags.seq_size)

net = RNNModule(n_vocab, flags.seq_size,
                flags.embedding_size, flags.lstm_size)
net = net.to(device)

criterion, optimizer = get_loss_and_train_op(net, 0.01)

iteration = 0
start = time()

for e in range(200):
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1
        net.train()

        optimizer.zero_grad()

        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)

        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        loss_value = loss.item()

        loss.backward()

        state_h = state_h.detach()
        state_c = state_c.detach()

        _ = torch.nn.utils.clip_grad_norm_(
            net.parameters(), flags.gradients_norm)

        optimizer.step()

        if iteration % 10 == 0:
            print(f'iteration = {iteration}')
            print("Elapsed time: {:.3f} sec".format(time() - start))
            start = time()
            
        if iteration % 100 == 0:
            print('Epoch: {}/{}'.format(e, 200),
                  'Iteration: {}'.format(iteration),
                  'Loss: {}'.format(loss_value))
            torch.save(net.state_dict(),
                       'checkpoint_pt/model-{}.pth'.format(iteration))

        if iteration % 1000 == 0:
            predict(device, net, flags.initial_words, n_vocab,
                    vocab_to_int, int_to_vocab, top_k=5)
            


Vocabulary size 324259
iteration = 10
Elapsed time: 831.464 sec
iteration = 20
Elapsed time: 695.964 sec
iteration = 30
Elapsed time: 738.748 sec
iteration = 40
Elapsed time: 816.733 sec
iteration = 50
Elapsed time: 1357.573 sec
iteration = 60
Elapsed time: 1185.080 sec
iteration = 70
Elapsed time: 1380.037 sec
iteration = 80
Elapsed time: 1618.401 sec
iteration = 90
Elapsed time: 1379.346 sec
iteration = 100
Elapsed time: 1526.700 sec
Epoch: 0/200 Iteration: 100 Loss: 8.92923641204834


FileNotFoundError: [Errno 2] No such file or directory: 'checkpoint_pt/model-100.pth'

In [8]:
iteration

100

In [11]:
predict(device, net, flags.initial_words, n_vocab,
                    vocab_to_int, int_to_vocab, top_k=5)

сейчас что-то будет...
b'\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82 \xd0\xba\xd0\xb0\xd0\xba \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb5 \xd0\xaf \xd0\xb8 \xd0\xb2 \xd0\xaf \xd1\x87\xd1\x82\xd0\xbe \xd1\x8f \xd0\xb8 \xd1\x8f \xd0\xb8 \xd1\x8f \xd0\xbd\xd0\xb5 \xd0\xb2 \xd0\xaf \xd0\xaf \xd0\xb8 \xd0\xbd\xd0\xb5 \xd0\xb2 \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb0 \xd1\x81 \xd0\x9d\xd1\x83 \xd1\x8f \xd0\xb8 \xd1\x8f \xd0\xb2 \xd0\xaf \xd0\xb2 \xd0\xb2 \xd0\xaf \xd1\x87\xd1\x82\xd0\xbe \xd0\xbd\xd0\xb5 \xd0\xb2 \xd0\xb2 \xd0\xaf \xd1\x87\xd1\x82\xd0\xbe \xd1\x8f \xd1\x8f \xd1\x8f \xd1\x8f \xd0\xaf \xd0\xaf \xd0\xb8 \xd0\xb8 \xd0\xb2 \xd0\xb2 \xd0\xaf \xd0\xb2 \xd0\xb8 \xd0\xb2 \xd0\xb8 \xd0\xbd\xd0\xb5 \xd0\xb2 \xd0\xb8 \xd0\xaf \xd1\x87\xd1\x82\xd0\xbe \xd0\xb8 \xd0\xb8 \xd0\xb8 \xd0\xb8 \xd0\xb8 \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb0 \xd1\x81 \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb0 \xd1\x8f \xd1\x8f \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb5 \xd0\xbd\xd0\xb5 \xd0\xb8 \xd0\xb8 \xd0\xaf 

In [83]:
RES = []

for dialogs in res:
    ss = []
    for message in dialogs:
        ss.extend([x.lower().replace('[,.?]', '') for x in message.split()])
    RES.append(ss)

In [81]:
len(RES)

96

In [84]:
print(RES[1][:10])

['серега,', 'привету', 'тебя', 'есть', 'телевизор', 'на', 'даче?', 'просто', 'если', 'что,']


In [65]:
sentences = bigram[sent]

NameError: name 'bigram' is not defined

In [60]:
from gensim.models.phrases import Phrases, Phraser

In [58]:
phrases = Phrases(RES, min_count=30, progress_per=10000)

In [66]:
bigram = Phraser(phrases)

In [69]:
sentences = bigram[RES]

In [72]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

321400

In [73]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['не', 'Я', 'я', 'и', 'в', 'А', 'Ну', 'И', 'ты', 'на']

In [74]:
w2v_mode = kek(sentences)

Time to build vocab: 0.17 mins
Time to train the model: 3.78 mins


In [12]:
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

In [12]:
import multiprocessing

cores = multiprocessing.cpu_count()
cores

4

In [34]:
def kek(res):
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         size=300,
                         sample=6e-5, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         negative=20,
                         workers=cores-1)

    t = time()

    w2v_model.build_vocab(res, progress_per=10000)

    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

    w2v_model.corpus_count

    t = time()

    w2v_model.train(res, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

    w2v_model.init_sims(replace=True)

    return w2v_model

In [35]:
w2v_model_1 = kek(RES)

Time to build vocab: 0.05 mins
Time to train the model: 0.44 mins


In [78]:
w2v_mode.wv.most_similar(positive=["Юля"])

[('Написала', 0.8950090408325195),
 ('Ване', 0.8853252530097961),
 ('говорит?', 0.8794123530387878),
 ('ВСЕ', 0.8751375079154968),
 ('звонила?', 0.8749449253082275),
 ('Богу', 0.8741962909698486),
 ('покажи', 0.8736038208007812),
 ('надоел', 0.8734161853790283),
 ('Ахахахаах', 0.8711090087890625),
 ('вк?', 0.8618061542510986)]

In [79]:
w2v_model_1.wv.most_similar(positive=["Юля"])

[('память', 0.7860834002494812),
 ('инсту', 0.7789162397384644),
 ('Ии', 0.7788558006286621),
 ('недоступна', 0.7754437923431396),
 ('покажи', 0.7736333608627319),
 ('берешь', 0.7618379592895508),
 ('Ане', 0.7555817365646362),
 ('Эту', 0.7530922293663025),
 ('фотки)', 0.7501952648162842),
 ('Ване', 0.7494977116584778)]