In [2]:
import sys
import ast
import numpy as np


def parse_array(s):
    """Evaluate the Python literal in string `s` and wrap the result in a numpy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from stdin and parse it with `parse_array`."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def generate_w2v_sgns_samples(text, window_size, vocab_size, ns_rate):
    """
    Build skip-gram-with-negative-sampling training samples for a token sequence.

    text - list (or 1-D numpy array) of integer numbers - ids of tokens in text
    window_size - odd integer - width of window
    vocab_size - positive integer - number of tokens in vocabulary
    ns_rate - positive integer - number of negative tokens to sample per one positive sample

    Every position of the text is used as a center word; near the edges of the
    text the window is clipped, so those positions simply get fewer context words.
    Negative tokens are drawn uniformly from [0, vocab_size) with
    np.random.randint, so a negative may coincide with a real context word.

    returns list of training samples [CenterWord, CtxWord, Label], where each
    positive sample (Label 1) is immediately followed by its ns_rate negative
    samples (Label 0)
    """
    # Work on a plain Python list: with a numpy array, `left + right` would be
    # element-wise addition rather than concatenation.
    tokens = np.asarray(text).tolist()
    w_side = (window_size - 1) // 2
    train = []
    for i, center in enumerate(tokens):
        left = tokens[max(0, i - w_side):i]
        # Slicing past the end of a list is safe, so no explicit upper clip is needed.
        right = tokens[i + 1:i + w_side + 1]
        for ctx_word in left + right:
            train.append([center, ctx_word, 1])
            for _ in range(ns_rate):
                train.append([center, int(np.random.randint(vocab_size)), 0])
    return train


# Input protocol (one value per stdin line, in this order):
# token ids of the text, window size, vocabulary size, negative-sampling rate.
text = read_array()
window_size = int(sys.stdin.readline().strip())
vocab_size = int(sys.stdin.readline().strip())
ns_rate = int(sys.stdin.readline().strip())

result = generate_w2v_sgns_samples(text, window_size, vocab_size, ns_rate)

# Emit the samples as a nested Python list on stdout.
write_array(np.array(result))

SyntaxError: unexpected EOF while parsing (<unknown>, line 0)

In [3]:
import sys
import ast
import numpy as np


def parse_array(s):
    """Evaluate the Python literal in string `s` and wrap the result in a numpy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from stdin and parse it with `parse_array`."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def update_w2v_weights(center_embeddings, context_embeddings, center_word, context_word, label, learning_rate):
    """
    Apply one in-place gradient step to a (center word, context word) pair.

    center_embeddings - VocabSize x EmbSize
    context_embeddings - VocabSize x EmbSize
    center_word - int - identifier of center word
    context_word - int - identifier of context word
    label - 1 if context_word is real, 0 if it is negative
    learning_rate - float > 0 - size of gradient step

    The prediction is sigmoid(center_vector . context_vector); each vector is
    moved by learning_rate * (prediction - label) * (the other vector).

    returns the same two (mutated) arrays as a tuple
    """
    center_vec = center_embeddings[center_word]
    context_vec = context_embeddings[context_word]

    # Prediction error: logistic sigmoid of the dot product minus the target label.
    score = center_vec @ context_vec
    error = 1 / (1 + np.exp(-score)) - label

    # Both steps are derived from the pre-update vectors before anything is written back.
    center_step = learning_rate * (error * context_vec)
    context_step = learning_rate * (error * center_vec)

    center_embeddings[center_word] -= center_step
    context_embeddings[context_word] -= context_step
    return center_embeddings, context_embeddings
    

# Input protocol (one value per stdin line, in this order):
# center embeddings, context embeddings, center word id, context word id,
# label, learning rate.
center_embeddings = read_array()
context_embeddings = read_array()
center_word = int(sys.stdin.readline().strip())
context_word = int(sys.stdin.readline().strip())
label = int(sys.stdin.readline().strip())
learning_rate = float(sys.stdin.readline().strip())

# The return value is ignored: the two arrays passed in are the ones printed below.
update_w2v_weights(center_embeddings, context_embeddings,
                   center_word, context_word, label, learning_rate)

write_array(center_embeddings)
write_array(context_embeddings)

SyntaxError: unexpected EOF while parsing (<unknown>, line 0)

In [None]:
import sys
import ast
import numpy as np


def read_list():
    """Read one line from stdin and return the Python literal it contains (no numpy conversion)."""
    line = sys.stdin.readline()
    return ast.literal_eval(line)

def parse_array(s):
    """Evaluate the Python literal in string `s` and wrap the result in a numpy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from stdin and parse it with `parse_array`."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def generate_ft_sgns_samples(text, window_size, vocab_size, ns_rate, token2subwords):
    """
    Build FastText-style skip-gram-with-negative-sampling training samples.

    text - numpy array of integer numbers - ids of tokens in text (converted with .tolist())
    window_size - odd integer - width of window
    vocab_size - positive integer - number of tokens in vocabulary
    ns_rate - positive integer - number of negative tokens to sample per one positive sample
    token2subwords - list of lists of int - i-th sublist contains list of identifiers of n-grams for token #i (list of subword units)

    The center word is represented as [center id] + its subword ids. The window
    is clipped at both ends of the text. Negatives are drawn with
    np.random.randint(vocab_size).

    returns list of training samples (CenterSubwords, CtxWord, Label); every
    positive sample is directly followed by its ns_rate negative samples
    """
    tokens = text.tolist()
    half = (window_size - 1) // 2
    total = len(tokens)
    samples = []
    for pos, center in enumerate(tokens):
        lo = max(0, pos - half)
        hi = min(total, pos + half + 1)
        neighbours = tokens[lo:pos] + tokens[pos + 1:hi]
        for neighbour in neighbours:
            # Center representation: the token itself followed by its subword units.
            units = [center] + token2subwords[center]
            samples.append((units, neighbour, 1))
            samples.extend(
                (units, np.random.randint(vocab_size), 0) for _ in range(ns_rate)
            )
    return samples


# Input protocol (one value per stdin line, in this order):
# token ids of the text, window size, vocabulary size, negative-sampling rate,
# and the per-token subword lists (kept as a plain nested list, not a numpy array).
text = read_array()
window_size = int(sys.stdin.readline().strip())
vocab_size = int(sys.stdin.readline().strip())
ns_rate = int(sys.stdin.readline().strip())
token2subwords = read_list()

result = generate_ft_sgns_samples(text, window_size, vocab_size, ns_rate, token2subwords)

# The samples are printed with repr() directly rather than through write_array.
print(repr(result))

In [None]:
import sys
import ast
import numpy as np


def parse_array(s):
    """Evaluate the Python literal in string `s` and wrap the result in a numpy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from stdin and parse it with `parse_array`."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def update_ft_weights(center_embeddings, context_embeddings, center_subwords, context_word, label, learning_rate):
    """
    Apply one in-place gradient step for a FastText-style (subwords, context word) pair.

    center_embeddings - VocabSize x EmbSize
    context_embeddings - VocabSize x EmbSize
    center_subwords - list of ints - list of identifiers of n-grams contained in center word
    context_word - int - identifier of context word
    label - 1 if context_word is real, 0 if it is negative
    learning_rate - float > 0 - size of gradient step

    The center vector is the mean of the rows listed in center_subwords. The
    prediction is sigmoid(center_vector . context_vector); every listed subword
    row gets an equal 1/len(center_subwords) share of the center step.

    returns the same two (mutated) arrays as a tuple
    """
    n_units = len(center_subwords)
    emb_size = center_embeddings.shape[1]

    # Mean of the subword rows, accumulated row by row into a (1, EmbSize) buffer.
    center_vec = np.zeros((1, emb_size))
    for unit in center_subwords:
        center_vec += center_embeddings[unit]
    center_vec /= n_units

    context_vec = context_embeddings[context_word]
    error = 1 / (1 + np.exp(-(center_vec @ context_vec))) - label

    # Both steps are fixed before any row is modified.
    unit_grad = error * context_vec / n_units
    context_grad = error * center_vec

    for unit in center_subwords:
        center_embeddings[unit] -= learning_rate * unit_grad

    context_embeddings[context_word] -= learning_rate * context_grad.squeeze()
    return center_embeddings, context_embeddings

# Input protocol (one value per stdin line, in this order):
# center embeddings, context embeddings, subword ids of the center word,
# context word id, label, learning rate.
center_embeddings = read_array()
context_embeddings = read_array()
center_subwords = read_array()
context_word = int(sys.stdin.readline().strip())
label = int(sys.stdin.readline().strip())
learning_rate = float(sys.stdin.readline().strip())

# The return value is ignored: the two arrays passed in are the ones printed below.
update_ft_weights(center_embeddings, context_embeddings,
                  center_subwords, context_word, label, learning_rate)

write_array(center_embeddings)
write_array(context_embeddings)

In [11]:
import sys
import ast
import numpy as np
import scipy.sparse


def read_array():
    """Read one line from stdin and return the Python literal it contains (no numpy conversion)."""
    line = sys.stdin.readline()
    return ast.literal_eval(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def generate_coocurrence_matrix(texts, vocab_size):
    """
    Count, for every pair of distinct tokens, the number of documents containing both.

    texts - list of lists of ints - i-th sublist contains identifiers of tokens in i-th document
    vocab_size - int - size of vocabulary

    Entry [i, j] (i != j) is the number of documents in which both token i and
    token j occur; repeated occurrences inside one document count once. The
    matrix is symmetric and its diagonal stays zero. Token ids outside
    [0, vocab_size) are ignored.

    returns scipy.sparse.dok_matrix of shape (vocab_size, vocab_size)
    """
    dok = scipy.sparse.dok_matrix((vocab_size, vocab_size))
    for text in texts:
        # Work per document on its unique in-vocabulary tokens instead of
        # testing every vocabulary pair against every document.
        present = sorted({token for token in text if 0 <= token < vocab_size})
        for pos, first in enumerate(present):
            for second in present[pos + 1:]:
                dok[first, second] += 1
                dok[second, first] += 1
    return dok

# Input protocol (one value per stdin line, in this order):
# the documents as a nested list of token ids, then the vocabulary size.
text = read_array()
vocab_size = int(sys.stdin.readline().strip())

result = generate_coocurrence_matrix(text, vocab_size)

# Densify the sparse result before printing it as a nested list.
write_array(result.toarray())

SyntaxError: unexpected EOF while parsing (<unknown>, line 0)

In [None]:
import sys
import ast
import numpy as np


def parse_array(s):
    """Evaluate the Python literal in string `s` and wrap the result in a numpy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from stdin and parse it with `parse_array`."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def update_glove_weights(x, w, d, alpha, max_x, learning_rate):
    """
    Apply one full-batch gradient step to both GloVe embedding matrices, in place.

    x - square integer matrix VocabSize x VocabSize - coocurrence matrix
    w - VocabSize x EmbSize - first word vectors
    d - VocabSize x EmbSize - second word vectors
    alpha - float - power in weight smoothing function f
    max_x - int - maximum coocurrence count in weight smoothing function f
    learning_rate - positive float - size of gradient step

    The weight of a cell is (x / max_x) ** alpha while x <= max_x and 1.0 above
    it; the regression target is log(1 + x). Both gradients are computed from
    the pre-update w and d. Nothing is returned: w and d are overwritten.
    """
    # Smoothed per-cell weights, capped at 1.0 for counts above max_x.
    weights = np.where(x <= max_x, (x / max_x) ** alpha, 1.0)

    # Weighted residual between the log1p counts and the current dot products.
    residual = weights * (np.log1p(x) - w @ d.T)
    scaled = -2 * residual

    grad_w = scaled @ d
    grad_d = scaled.T @ w

    # Slice assignment keeps the caller's array objects.
    w[:] = w - learning_rate * grad_w
    d[:] = d - learning_rate * grad_d

# Input protocol (one value per stdin line, in this order):
# coocurrence matrix, first word vectors, second word vectors, alpha, max_x,
# learning rate.
x = read_array()
w = read_array()
d = read_array()
alpha = float(sys.stdin.readline().strip())
max_x = int(sys.stdin.readline().strip())
learning_rate = float(sys.stdin.readline().strip())

# update_glove_weights returns nothing; it overwrites w and d, which are printed below.
update_glove_weights(x, w, d, alpha, max_x, learning_rate)

write_array(w)
write_array(d)

In [None]:
import sys
import ast
import numpy as np


def parse_array(s):
    """Evaluate the Python literal in string `s` and wrap the result in a numpy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from stdin and parse it with `parse_array`."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print `arr` (anything with a `.tolist()` method) as the repr of a nested Python list."""
    as_list = arr.tolist()
    print(repr(as_list))


def get_nearest(embeddings, query_word_id, get_n):
    """
    Find the words whose embeddings have the highest cosine similarity to a query word.

    embeddings - VocabSize x EmbSize - word embeddings
    query_word_id - integer - id of query word to find most similar to
    get_n - integer - number of most similar words to retrieve

    Each row is scaled to unit L2 norm on a float copy, so the caller's array is
    left untouched and integer input is accepted. All-zero rows are kept as zero
    vectors, which gives them similarity 0 instead of NaN. The query word itself
    is part of the ranking.

    returns list of `get_n` tuples (word_id, similarity) sorted by descending order of similarity value
    """
    vectors = np.asarray(embeddings, dtype=float)
    # Per-word (row-wise) norms; keepdims makes the division broadcast over rows.
    norms = np.linalg.norm(vectors, ord=2, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid 0/0 for all-zero embeddings
    unit = vectors / norms

    similarities = unit @ unit[query_word_id]
    top_idx = np.argsort(similarities)[::-1][:get_n]
    return [(int(idx), float(similarities[idx])) for idx in top_idx]


# Input protocol (one value per stdin line, in this order):
# embedding matrix, id of the query word, number of neighbours to return.
embeddings = read_array()
query_word_id = int(sys.stdin.readline().strip())
get_n = int(sys.stdin.readline().strip())

result = get_nearest(embeddings, query_word_id, get_n)

# The (word_id, similarity) pairs are packed into one array before printing.
write_array(np.array(result))

In [14]:
# NOTE(review): looks like a parameter count (e.g. 5 * 64 * 128 weights plus 128 biases) — TODO confirm
5 * 64 * 128 + 128

41088