In [None]:
!wget https://github.com/sagespl/nlp-masterclass/blob/main/modu%C5%82-07/word2vec_data.zip?raw=true
!mv word2vec_data.zip?raw=true word2vec_data.zip
!unzip word2vec_data.zip

#### Załadowanie tekstów

In [None]:
import os
import re


def preprocess(doc, divider=None):
    """Lower-case a document and split it into lists of single characters.

    Args:
        doc: Text to process.
        divider: Maximum number of characters per segment. When ``None`` the
            whole document is returned as one segment.

    Returns:
        A list of segments, each a list of one-character strings. With a
        divider, the last segment may be shorter than the others and an empty
        document yields an empty list.
    """
    symbols = list(doc.lower())
    if divider is None:
        return [symbols]

    # Ceiling division: a trailing partial segment still counts as a segment.
    full_segments, leftover = divmod(len(symbols), divider)
    segment_count = full_segments + (1 if leftover > 0 else 0)
    return [
        symbols[k * divider:(k + 1) * divider]
        for k in range(segment_count)
    ]

# Read every file of the unpacked corpus and cut each non-empty line into
# character segments of at most 80 characters.
train_docs = []
folder = "word2vec_data"
# Sort the listing so the document order does not depend on the file system.
for file in sorted(os.listdir(folder)):
    fullpath = os.path.join(folder, file)
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 — confirm against the data.
    with open(fullpath, encoding="utf-8") as f:
        txt = f.read()
    docs = []
    for doc in txt.split("\n"):
        if doc != "":
            docs.extend(preprocess(doc, divider=80))
    train_docs.extend(docs)


In [None]:
# Quick look at the corpus: number of segments and the first segment.
print(len(train_docs))
print(train_docs[0])

# Vocabulary index reserved for the padding symbol.
PAD = 0

# Collect every distinct character that occurs in the training segments.
charset = set()
for doc in train_docs:
    charset |= set(doc)

# Sorted vocabulary with the padding marker placed at index PAD.
charlist = sorted(charset)
charlist.insert(PAD, "PAD")
print(charlist)
num_chars = len(charlist)
print(num_chars)

In [None]:
import torch
import numpy
import random

# Use the first CUDA GPU when one is present; otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without a GPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def examples_to_batch(examples, maxlen):
    """Turn character sequences into padded, time-major tensors.

    Each example is truncated to ``maxlen`` characters, mapped to indices in
    the module-level ``charlist``, ordered longest first and right-padded with
    ``PAD`` to the longest length ``T`` in the batch. Inputs are the one-hot
    encoded characters at positions ``0 .. T-2``; targets are the indices of
    the characters at positions ``1 .. T-1`` (the next character).

    Args:
        examples: Non-empty list of character sequences whose characters all
            occur in ``charlist``.
        maxlen: Maximum number of characters kept per example.

    Returns:
        Tuple ``(X, Y, lens, Y_mask)``:
            X: float tensor ``(T-1, batch, num_chars)`` of one-hot inputs.
            Y: long tensor ``(T-1, batch)`` of target indices.
            lens: long tensor ``(batch,)`` holding each sequence length minus one.
            Y_mask: bool tensor ``(T-1, batch)``, ``True`` where the target is
                not ``PAD``.

    Raises:
        KeyError: If an example contains a character missing from ``charlist``.
        ValueError: If ``examples`` is empty.
    """
    # One dictionary lookup per character instead of a linear charlist.index scan.
    char_to_index = {c: i for i, c in enumerate(charlist)}
    encoded = [[char_to_index[c] for c in e[:maxlen]] for e in examples]

    # Longest first; the sort is stable, so equal lengths keep their input order.
    encoded.sort(key=len, reverse=True)
    lengths = [len(seq) for seq in encoded]
    longest = max(lengths)

    # Right-pad into one rectangular index tensor of shape (batch, T).
    padded = torch.LongTensor(
        [seq + [PAD] * (longest - len(seq)) for seq in encoded])
    lens = torch.LongTensor([n - 1 for n in lengths])

    # Build the one-hot inputs in a single vectorised call and go time-major.
    X = torch.nn.functional.one_hot(
        padded[:, :-1], num_classes=num_chars).float().transpose(1, 0)
    Y = padded[:, 1:].transpose(1, 0)
    Y_mask = Y != PAD
    return X, Y, lens, Y_mask

    
def train_on_batch(X, lens, Y, mask, model, optimizer, teacher_forcing_ratio):
    """Run one optimisation step on a batch and return the summed step loss.

    The model is unrolled one time step at a time. The first input is ``X[0]``;
    every later input is the one-hot encoding of either the ground-truth
    character (teacher forcing) or the model's own top prediction. The choice
    is drawn once per batch with probability ``teacher_forcing_ratio``.

    Args:
        X: One-hot inputs as produced by ``examples_to_batch``; only ``X[0]``
            and ``X.shape[1]`` (the batch size) are read.
        lens: Per-sequence target lengths; the largest sets the step count.
        Y: Target indices, indexed as ``Y[t]`` per time step.
        mask: Boolean mask aligned with ``Y``, indexed as ``mask[t]``.
        model: Generator exposing ``init_hidden(batch_size)`` and
            ``model(input, hidden) -> (output, hidden)``.
        optimizer: Optimizer holding the model's parameters.
        teacher_forcing_ratio: Probability of using teacher forcing.

    Returns:
        The sum over time steps of the masked loss, as a Python float.
    """
    model.train()
    batch_size = X.shape[1]
    optimizer.zero_grad()
    Y = Y.to(DEVICE)
    mask = mask.to(DEVICE)
    loss = 0
    generator_input = X[0].unsqueeze(0).to(DEVICE)
    generator_hidden = model.init_hidden(batch_size)
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    max_target_len = int(max(lens))

    for t in range(max_target_len):
        generator_output, generator_hidden = model(
            generator_input, generator_hidden)
        if use_teacher_forcing:
            next_tokens = Y[t]
        else:
            # One predicted index per batch row. Iterating over the raw topk
            # result would walk the length-1 time axis, not the batch.
            next_tokens = generator_output.topk(1).indices.reshape(-1)
        # One-hot encode on the device the indices already live on.
        generator_input = torch.nn.functional.one_hot(
            next_tokens, num_classes=num_chars).float().unsqueeze(0)
        mask_loss, _, _ = maskNLLLoss(generator_output, Y[t], mask[t])
        loss += mask_loss

    loss.backward()
    # Clip the gradient norm to 1 before the update.
    _ = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    return loss.item()




def test_on_batch(X, lens, Y, mask, model, teacher_forcing):
    """Evaluate the model on one batch without updating its parameters.

    The model is switched to evaluation mode so dropout is disabled. This
    function does not disable gradient tracking itself; wrap the call in
    ``torch.no_grad()`` to avoid building the autograd graph.

    Args:
        X: One-hot inputs as produced by ``examples_to_batch``; only ``X[0]``
            and ``X.shape[1]`` (the batch size) are read.
        lens: Per-sequence target lengths; the largest sets the step count.
        Y: Target indices, indexed as ``Y[t]`` per time step.
        mask: Boolean mask aligned with ``Y``, indexed as ``mask[t]``.
        model: Generator exposing ``init_hidden(batch_size)`` and
            ``model(input, hidden) -> (output, hidden)``.
        teacher_forcing: If true, feed the ground-truth character at every
            step; otherwise feed the model's own top prediction.

    Returns:
        Tuple ``(loss, corr, total, decisions)``: the summed step loss as a
        float, the accumulated ``corr`` and ``total`` counts returned by
        ``maskNLLLoss``, and the per-step top predictions as CPU tensors
        (collected only when ``teacher_forcing`` is false).
    """
    # Evaluation mode: dropout must not perturb the measured loss and accuracy.
    model.eval()
    batch_size = X.shape[1]
    Y = Y.to(DEVICE)
    mask = mask.to(DEVICE)
    loss = 0
    generator_input = X[0].unsqueeze(0).to(DEVICE)
    generator_hidden = model.init_hidden(batch_size)
    decisions = []
    max_target_len = int(max(lens))
    corr = 0
    total = 0

    for t in range(max_target_len):
        generator_output, generator_hidden = model(
            generator_input, generator_hidden)
        if teacher_forcing:
            next_tokens = Y[t]
        else:
            topi = generator_output.topk(1).indices
            decisions.append(topi.cpu())
            # One predicted index per batch row (not per time-axis entry).
            next_tokens = topi.reshape(-1)
        generator_input = torch.nn.functional.one_hot(
            next_tokens, num_classes=num_chars).float().unsqueeze(0)
        mask_loss, ncorr, ntotal = maskNLLLoss(generator_output, Y[t], mask[t])
        total += ntotal
        corr += ncorr
        loss += mask_loss
    return loss.item(), corr, total, decisions


In [None]:
class GRUGenerator(torch.nn.Module):
    """Character-level generator: stacked GRU followed by a softmax classifier.

    Args:
        char_num: Vocabulary size; both the input feature size (one-hot
            vectors) and the number of output classes.
        hidden_size: Size of the GRU hidden state.
        dropout: Dropout probability applied to the GRU output before the
            classifier.
        num_layers: Number of stacked GRU layers (defaults to 2).
    """

    def __init__(self, char_num, hidden_size, dropout, num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = torch.nn.Dropout(dropout)
        self.gru = torch.nn.GRU(
            char_num, hidden_size, num_layers=num_layers, batch_first=False)
        self.classifier = torch.nn.Linear(hidden_size, char_num)
        self.softmax = torch.nn.Softmax(dim=2)

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state.

        The state has shape ``(num_layers, batch_size, hidden_size)`` and is
        created on the same device and with the same dtype as the model's
        parameters, so it follows the model wherever it is moved.
        """
        return self.classifier.weight.new_zeros(
            self.num_layers, batch_size, self.hidden_size)

    def forward(self, input_step, last_hidden):
        """Advance the generator over ``input_step``.

        Args:
            input_step: Input of shape ``(seq_len, batch, char_num)``
                (time-major, because the GRU uses ``batch_first=False``).
            last_hidden: Hidden state of shape
                ``(num_layers, batch, hidden_size)``.

        Returns:
            Tuple ``(output, hidden)``: ``output`` holds probabilities over
            characters (softmax along the last dimension) with shape
            ``(seq_len, batch, char_num)``; ``hidden`` is the updated state.
        """
        rnn_output, hidden = self.gru(input_step, last_hidden)
        output = self.softmax(self.classifier(self.dropout(rnn_output)))
        return output, hidden


In [None]:
def maskNLLLoss(inp, target, mask):
    """Masked negative log-likelihood for one time step.

    Args:
        inp: Probabilities over characters for a single step, shape
            ``(1, batch, num_classes)`` (any shape that flattens to
            ``(batch, num_classes)`` works).
        target: Long tensor with ``batch`` target indices.
        mask: Bool tensor with ``batch`` entries; ``True`` marks positions
            that contribute to the loss and to the counts.

    Returns:
        Tuple ``(loss, corr, total)``: the mean of ``-log p(target)`` over the
        masked positions, the number of masked positions whose top prediction
        equals the target, and the number of masked positions.
    """
    # Flatten explicitly: a bare squeeze() would also drop a batch dimension
    # of size one and break the gather below.
    probs = inp.reshape(-1, inp.size(-1))
    target = target.reshape(-1)
    mask = mask.reshape(-1)

    total = mask.sum()
    target_probs = torch.gather(probs, 1, target.unsqueeze(1)).squeeze(1)
    crossEntropy = -torch.log(target_probs)
    decision = probs.topk(1).indices.squeeze(1)
    # Count hits only where the mask is set, so corr can never exceed total.
    corr = ((decision == target) & mask).sum()
    loss = crossEntropy.masked_select(mask).mean()
    return loss, corr, total


In [None]:
from tqdm.notebook import tqdm
import random

# Hyper-parameters.
maxlen = 80
batch_size = 128
hidden_size = 600
dropout = 0.5
learning_rate = 0.001
epochs = 20

# Seed the RNGs used for shuffling, teacher-forcing draws and weight
# initialisation so that a fresh run is repeatable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

model = GRUGenerator(num_chars, hidden_size, dropout).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Shuffle, drop very short segments, and hold out the last 200 for validation.
random.shuffle(train_docs)
train_docs = [t for t in train_docs if len(t)>3]
train_data = train_docs[:-200]
dev_data = train_docs[-200:]

# Number of batches per split, counting a trailing partial batch.
train_iters = len(train_data)//batch_size + int(len(train_data)%batch_size>0)
# The remainder check must use dev_data: with train_data here, the last dev
# batch could be dropped or an empty batch could be requested.
dev_iters = len(dev_data)//batch_size + int(len(dev_data)%batch_size>0)

for epoch in range(epochs):
    print("epoch no: ", epoch+1)
    # Teacher forcing decays geometrically: 1.0 in the first epoch, then *0.95.
    teacher_forcing_ratio = 0.95**epoch
    random.shuffle(train_data)
    total_loss = 0
    for i in tqdm(range(train_iters)):
        examples = train_data[i*batch_size:(i+1)*batch_size]
        X, Y, lens, mask = examples_to_batch(examples, maxlen)
        # train_on_batch resets the gradients itself before the backward pass.
        loss = train_on_batch(X, lens, Y, mask, model, optimizer, teacher_forcing_ratio)
        total_loss += loss
    print("train loss: ", total_loss)
    with torch.no_grad():
        total = 0
        correct = 0
        dev_loss = 0
        for i in range(dev_iters):
            examples = dev_data[i*batch_size:(i+1)*batch_size]
            X, Y, lens, mask = examples_to_batch(examples, maxlen)
            # Validation always uses teacher forcing.
            loss, corr, tot, decisions = test_on_batch(X, lens, Y, mask, model, True)
            dev_loss += loss
            correct += corr.item()
            total += tot.item()
        print("dev loss:", dev_loss)
        acc = (correct/total) * 100
        accuracy = "{:4.2f}%".format((acc))
        print(accuracy)

In [None]:
def oscillate(model, seed, maxlen):
    """Greedily generate ``maxlen`` characters that continue ``seed``.

    The lower-cased seed is fed through the model one character at a time to
    build up the hidden state; then, starting from the seed's last character,
    the most probable next character is chosen at every step and fed back in.
    Generation runs in evaluation mode (no dropout) and without gradient
    tracking; the model's previous train/eval mode is restored afterwards.

    Args:
        model: Generator exposing ``init_hidden(batch_size)`` and
            ``model(input, hidden) -> (output, hidden)``.
        seed: Non-empty text whose lower-cased characters all occur in the
            module-level ``charlist``.
        maxlen: Number of characters to generate.

    Returns:
        List of ``maxlen`` indices into ``charlist``.

    Raises:
        ValueError: If ``seed`` is empty or contains a character that is not
            in ``charlist``.
    """
    indices = [charlist.index(c) for c in seed.lower()]
    if not indices:
        raise ValueError("seed must contain at least one character")

    # One-hot encode the whole seed at once: shape (len(seed), num_chars).
    X = torch.nn.functional.one_hot(
        torch.tensor(indices), num_classes=num_chars).float().to(DEVICE)

    was_training = model.training
    model.eval()
    decisions = []
    try:
        with torch.no_grad():
            generator_hidden = model.init_hidden(1)

            # Warm up the hidden state on every seed character but the last.
            for t in range(len(X) - 1):
                _, generator_hidden = model(
                    X[t].view(1, 1, -1), generator_hidden)

            # The last seed character is the first input of the generation loop.
            generator_input = X[-1].view(1, 1, -1)
            for _ in range(maxlen):
                generator_output, generator_hidden = model(
                    generator_input, generator_hidden)
                decision = generator_output.topk(1).indices.item()
                decisions.append(decision)
                generator_input = torch.nn.functional.one_hot(
                    torch.tensor([decision]), num_classes=num_chars
                ).float().view(1, 1, -1).to(DEVICE)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return decisions

# Generate 200 characters continuing a Polish prompt and print the result.
seed = "Przyszedł wtedy pan "
decisions = oscillate(model, seed, 200)
generated = "".join(charlist[idx] for idx in decisions)
print(seed + generated)