# Przygotowanie słownika

In [29]:
import collections
import os

# Special vocabulary tokens. createVocabulary reserves the first three
# vocabulary ids for them in this order; UNK (id 0) is the fallback for
# out-of-vocabulary words. SOS/EOS are presumably sentence start/end markers
# (the visible code only reserves ids for them) -- TODO confirm with consumers.
UNK = "<unk>"
SOS = "<s>"
EOS = "</s>"

DATA_DIR = "./corpora" # directory holding the data files
PL_SENTENCES = "sentences_pl.txt" # file with the Polish sentences
EN_SENTENCES = "sentences_en.txt" # file with the English sentences
PL_VOCAB = "vocab_pl.txt" # file with the Polish vocabulary
EN_VOCAB = "vocab_en.txt" # file with the English vocabulary

# Full paths of the corpus and vocabulary files inside DATA_DIR.
pl_sentences_path = os.path.join(DATA_DIR, PL_SENTENCES)
en_sentences_path = os.path.join(DATA_DIR, EN_SENTENCES)
pl_vocab_path = os.path.join(DATA_DIR, PL_VOCAB)
en_vocab_path = os.path.join(DATA_DIR, EN_VOCAB)

def createVocabulary(sentences_path, vocab_path, vocabulary_size=50000, encoding="utf-8"):
    """Build a word vocabulary from a corpus file and write it to disk.

    The corpus is split on whitespace. The special tokens UNK, SOS and EOS
    always occupy ids 0, 1 and 2; the ``vocabulary_size`` most frequent
    remaining words follow in descending frequency order. Every word that
    does not make it into the vocabulary is encoded as id 0 (UNK).

    Args:
        sentences_path: Path of the text file with the corpus.
        vocab_path: Path of the vocabulary file to (over)write, one word per
            line, in id order.
        vocabulary_size: Maximum number of corpus words kept in addition to
            the three special tokens.
        encoding: Text encoding used for both files.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the corpus as a list of word ids; ``count`` is a list of
        ``[word, frequency]`` entries in id order (the UNK entry holds the
        number of words encoded as UNK, SOS/EOS keep the placeholder -1);
        ``dictionary`` maps word -> id and ``reverse_dictionary`` id -> word.
    """
    # Text mode translates every line ending to "\n", so a plain whitespace
    # split yields the same tokens as stripping "\r"/"\n" by hand.
    with open(sentences_path, "r", encoding=encoding) as sentences_file:
        sentences = sentences_file.read().split()

    special_tokens = (UNK, SOS, EOS)
    word_counts = collections.Counter(sentences)
    # A corpus that literally contains a special token must not claim a second
    # vocabulary slot: re-assigning its id would make two words share one id
    # and leave a hole in reverse_dictionary.
    for token in special_tokens:
        word_counts.pop(token, None)

    count = [[token, -1] for token in special_tokens]
    count.extend(word_counts.most_common(vocabulary_size))
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Id 0 is UNK, which doubles as the fallback for out-of-vocabulary words.
    data = [dictionary.get(word, 0) for word in sentences]
    count[0][1] = data.count(0)

    with open(vocab_path, "w", encoding=encoding) as vocab_file:
        vocab_file.writelines(word + "\n" for word in dictionary)

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary
    
# createVocabulary(pl_sentences_path, pl_vocab_path)
# createVocabulary(en_sentences_path, en_vocab_path)

## Przygotowanie embeddingów

### Generowanie batchów

In [37]:
def generate_batch(batch_size, num_skips, skip_window, data, data_index):
    """Produce one skip-gram training batch from an encoded corpus.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``.
    For each window position the centre id is emitted ``num_skips`` times,
    each time paired with a different randomly chosen context id from the
    same window.

    Args:
        batch_size: Number of (input, label) pairs; must be a multiple of
            ``num_skips``.
        num_skips: Pairs generated per centre word; at most
            ``2 * skip_window``.
        skip_window: Number of context words on each side of the centre.
        data: Sequence of word ids.
        data_index: Position in ``data`` where the first window starts.

    Returns:
        ``(batch, labels, data_index)`` where ``batch`` is an int32 array of
        shape ``(batch_size,)`` with centre ids, ``labels`` is an int32 array
        of shape ``(batch_size, 1)`` with context ids, and ``data_index`` is
        the start position to pass to the next call.
    """
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    total = len(data)
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Restart from the beginning when a full window no longer fits.
    if data_index + span > total:
        data_index = 0
    window = collections.deque(data[data_index:data_index + span], maxlen=span)
    data_index += span

    # Every window offset except the centre is a candidate context position.
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for group in range(batch_size // num_skips):
        row = group * num_skips
        for context_position in random.sample(context_positions, num_skips):
            batch[row] = window[skip_window]
            labels[row, 0] = window[context_position]
            row += 1

        # Slide the window by one id, wrapping to the corpus start at the end.
        if data_index == total:
            window.extend(data[0:span])
            data_index = span
        else:
            window.append(data[data_index])
            data_index += 1

    # Step back by one window so the next batch does not skip the words that
    # were loaded into the window but not yet used as centres.
    data_index = (data_index + total - span) % total
    return batch, labels, data_index


## Skip-gram model

In [38]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import math
import random
from six.moves import xrange

def createEmbeddings(sentences_path, vocab_path, log_dir = DATA_DIR):
    """Train skip-gram word embeddings with NCE loss and save them.

    Builds the vocabulary with createVocabulary (which also writes
    ``vocab_path``), prints a few sample (centre -> context) pairs, trains a
    TensorFlow 1.x skip-gram model, and periodically prints the average loss
    and the nearest neighbours of a random set of validation words.

    Args:
        sentences_path: Path of the corpus text file.
        vocab_path: Path of the vocabulary file written by createVocabulary.
        log_dir: Directory receiving the summary event files,
            ``metadata.tsv`` (one vocabulary word per line, in id order) and
            the ``model.ckpt`` checkpoint.

    Returns:
        None. Results are written to ``log_dir`` and printed to stdout.
    """
    data_index = 0
    data, count, dictionary, reverse_dictionary = createVocabulary(sentences_path, vocab_path)

    # Sanity check: print a tiny batch as ids and as words.
    batch, labels, data_index = generate_batch(8, 2, 1, data, data_index)
    print(batch)
    for i in range(8):
        print(batch[i])
        print(reverse_dictionary[batch[i]])
        print(labels[i, 0])
        print(reverse_dictionary[labels[i, 0]])
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])
    vocabulary_size = len(dictionary)
    
    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    num_sampled = 64  # Number of negative examples to sample.
    learning_rate = 1.0  # Step size of the gradient descent optimizer.
    num_steps = 100001  # Number of training steps.


    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():

        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],stddev=1.0 / math.sqrt(embedding_size))
                )
                
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        with tf.name_scope('loss'):
            loss = tf.reduce_mean(tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                labels=train_labels,
                inputs=embed,
                num_sampled=num_sampled,
                num_classes=vocabulary_size
            ))

        tf.summary.scalar('loss', loss)
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Scale every embedding row to unit L2 norm so that the matmul below
        # yields the cosine similarity between validation words and all words.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as session:
        writer = tf.summary.FileWriter(log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels, data_index = generate_batch(batch_size, num_skips, skip_window, data, data_index)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            run_metadata = tf.RunMetadata()

            _, summary, loss_val = session.run(
                [optimizer, merged, loss],
                feed_dict=feed_dict,
                run_metadata=run_metadata)
            average_loss += loss_val
            writer.add_summary(summary, step)

            # Attach the run metadata of the final step to the summary log.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            # Report the loss averaged over the last 2000 steps (at step 0
            # this is just the first batch's loss).
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Every 10000 steps print the nearest neighbours of each
            # validation word.
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    # Rank 0 is skipped: it is normally the word itself
                    # (cosine similarity 1 with its own embedding).
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in xrange(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings. The encoding is
        # explicit because the vocabulary holds non-ASCII words and the
        # platform default encoding may not be able to represent them.
        with open(os.path.join(log_dir, 'metadata.tsv'), 'w', encoding='utf-8') as f:
            for i in xrange(vocabulary_size):
                f.write(reverse_dictionary[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

    writer.close()
    
# Train and save embeddings for the Polish corpus; this also (re)writes the
# Polish vocabulary file through createVocabulary.
createEmbeddings(pl_sentences_path, pl_vocab_path)

[ 874  874    0    0    5    5 2790 2790]
874
ustne
0
<unk>
874 ustne -> 0 <unk>
874
ustne
3202
zapytanie
874 ustne -> 3202 zapytanie
0
<unk>
5
na
0 <unk> -> 5 na
0
<unk>
874
ustne
0 <unk> -> 874 ustne
5
na
0
<unk>
5 na -> 0 <unk>
5
na
2790
sesję
5 na -> 2790 sesję
2790
sesję
779
pytań
2790 sesję -> 779 pytań
2790
sesję
5
na
2790 sesję -> 5 na
Initialized
Average loss at step  0 :  287.7437744140625
Nearest to r: obermayr, nieodłącznie, cornillet, neutralne, mówić, leży, schematu, konstytucją,
Nearest to de: rolandas, ciążą, nowością, ogólnodostępnej, marí, eliminować, obsługą, charles,
Nearest to 2: przetwórcy, państwu, korygowania, imię, martina, zbywalne, religijnych, mikroprzedsiębiorstwo,
Nearest to rozporządzenia: ursula, 8e, sole, suplementów, dystrybucją, wymagań, pe423778v0200, zwiększaniem,
Nearest to decyzji: pe456997v0100, arktyki, rentowne, estońskim, interpretacja, zapadać, poniosły, trójstronny,
Nearest to zakresie: odtwarzania, przyczyniającymi, oznaczenia, fonogramu, o

Average loss at step  42000 :  3.6827076913565397
Average loss at step  44000 :  3.995631938233972
Average loss at step  46000 :  3.9387468843758104
Average loss at step  48000 :  3.814999472692609
Average loss at step  50000 :  3.87213316000998
Nearest to r: roku, cornillet, grudnia, schematu, mówić, konstytucją, zależało, <unk>,
Nearest to de: van, charles, silviu, ekonomicznospołeczny, władzom, pse, xml, 2000527we,
Nearest to 2: 3, 4, 9, państwu, 1, 5, 6, martina,
Nearest to rozporządzenia: dyrektywy, decyzji, nr, kosmicznej, karną, traktatu, tajwanem, negocjacyjnej,
Nearest to decyzji: rozporządzenia, dyrektywy, obowiązku, wręcz, władzami, poniosły, życzenia, deklaracja,
Nearest to zakresie: sprawie, dziedzinie, poignant, zbliżające, imieniu, celu, tej, potężne,
Nearest to przypadku: imieniu, bankami, ogłoszona, sprawie, pilnych, terminie, przybrzeżnej, sytuacjach,
Nearest to których: wypłacanych, 1999468, guantanamo, dochodzeniem, 2225, tsunami, kompromisowe, wnioskami,
Nearest to

Average loss at step  92000 :  3.0563873945772646
Average loss at step  94000 :  2.8562956560626627
Average loss at step  96000 :  3.1114870708733795
Average loss at step  98000 :  2.828949210897088
Average loss at step  100000 :  3.0767025324180723
Nearest to r: roku, grudnia, czerwca, maja, lipca, socjalistyczna, września, budżetowy,
Nearest to de: van, silviu, charles, impress, kolarskabobińska, bélier, skontaktowania, 2000527we,
Nearest to 2: 3, 4, 1, 5, 9, 7, 6, 8,
Nearest to rozporządzenia: dyrektywy, decyzji, nr, 10731999, traktatu, kosmicznej, karną, negocjacyjnej,
Nearest to decyzji: dyrektywy, rozporządzenia, obowiązku, deklaracja, centralni, wręcz, rezolucji, przedmiotowo,
Nearest to zakresie: dziedzinie, sprawie, imieniu, fonogramu, ramach, celu, tej, odniesieniu,
Nearest to przypadku: imieniu, terminie, sprawie, sytuacjach, języku, sehnalová, liberalizacją, bankami,
Nearest to których: wypłacanych, dochodzeniem, której, 1999468, guantanamo, które, hugo, saharze,
Nearest to