In [0]:
import collections
import os
import pickle
import random
import urllib
from io import open
import numpy as np

In [0]:
!wget http://mattmahoney.net/dc/text8.zip

--2019-12-24 13:57:50--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.75
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip’


2019-12-24 13:58:34 (691 KB/s) - ‘text8.zip’ saved [31344016/31344016]



In [0]:
!unzip text8.zip

Archive:  text8.zip
  inflating: text8                   


In [0]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Читаем данные

In [0]:
import nltk
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

def read_own_data(filename):
    """
    read your own data.
    :param filename:
    :return:
    """
    print('reading data...')
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read().split()
    print('corpus size', len(data))
    return data

In [0]:
def build_dataset(words, n_words):
    """
    build dataset
    :param words: corpus
    :param n_words: learn most common n_words
    :return:
        - data: [word_index]
        - count: [ [word_index, word_count], ]
        - dictionary: {word_str: word_index}
        - reversed_dictionary: {word_index: word_str}
    """
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # UNK index is 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


In [0]:
def noise(vocabs, word_count):
    """
    generate noise distribution
    :param vocabs:
    :param word_count:
    :return:
    """
    Z = 0.001
    unigram_table = []
    num_total_words = sum([c for w, c in word_count])
    for vo in vocabs:
        unigram_table.extend([vo] * int(((word_count[vo][1]/num_total_words)**0.75)/Z))

    print("vocabulary size", len(vocabs))
    print("unigram_table size:", len(unigram_table))
    return unigram_table

In [0]:
class DataPipeline:
    def __init__(self, data, vocabs, word_count, data_index=0, use_noise_neg=True):
        self.data = data
        self.data_index = data_index
        if use_noise_neg:
            self.unigram_table = noise(vocabs, word_count)
        else:
            self.unigram_table = vocabs

    def get_neg_data(self, batch_size, num, target_inputs):
        """
        sample the negative data. Don't use np.random.choice(), it is very slow.
        :param batch_size: int
        :param num: int
        :param target_inputs: []
        :return:
        """
        neg = np.zeros((num))
        for i in range(batch_size):
            delta = random.sample(self.unigram_table, num)
            while target_inputs[i] in delta:
                delta = random.sample(self.unigram_table, num)
            neg = np.vstack([neg, delta])
        return neg[1: batch_size + 1]

    def generate_batch(self, batch_size, num_skips, skip_window):
        """
        get the data batch
        :param batch_size:
        :param num_skips:
        :param skip_window:
        :return: target batch and context batch
        """
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size), dtype=np.int32)
        span = 2 * skip_window + 1  # [ skip_window, target, skip_window ]
        buffer = collections.deque(maxlen=span)
        for _ in range(span):
            buffer.append(self.data[self.data_index])
            self.data_index = (self.data_index + 1) % len(self.data)
        for i in range(batch_size // num_skips):
            target = skip_window
            targets_to_avoid = [skip_window]
            for j in range(num_skips):
                while target in targets_to_avoid:
                    target = random.randint(0, span - 1)
                targets_to_avoid.append(target)
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j] = buffer[target]
            buffer.append(self.data[self.data_index])
            self.data_index = (self.data_index + 1) % len(self.data)
        self.data_index = (self.data_index + len(self.data) - span) % len(self.data)
        return batch, labels

In [0]:
import torch
from torch import nn


class SkipGramNeg(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super(SkipGramNeg, self).__init__()
        self.input_emb = nn.Embedding(vocab_size, emb_dim)
        self.output_emb = nn.Embedding(vocab_size, emb_dim)
        self.log_sigmoid = nn.LogSigmoid()

        initrange = (2.0 / (vocab_size + emb_dim)) ** 0.5  # Xavier init
        self.input_emb.weight.data.uniform_(-initrange, initrange)
        self.output_emb.weight.data.uniform_(-0, 0)


    def forward(self, target_input, context, neg):
        """
        :param target_input: [batch_size]
        :param context: [batch_size]
        :param neg: [batch_size, neg_size]
        :return:
        """
        # u,v: [batch_size, emb_dim]
        v = self.input_emb(target_input)
        u = self.output_emb(context)
        # positive_val: [batch_size]
        positive_val = self.log_sigmoid(torch.sum(u * v, dim=1)).squeeze()

        # u_hat: [batch_size, neg_size, emb_dim]
        u_hat = self.output_emb(neg)
        # [batch_size, neg_size, emb_dim] x [batch_size, emb_dim, 1] = [batch_size, neg_size, 1]
        # neg_vals: [batch_size, neg_size]
        neg_vals = torch.bmm(u_hat, v.unsqueeze(2)).squeeze(2)
        # neg_val: [batch_size]
        neg_val = self.log_sigmoid(-torch.sum(neg_vals, dim=1)).squeeze()

        loss = positive_val + neg_val
        return -loss.mean()

    def predict(self, inputs):
        return self.input_emb(inputs)

In [0]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [0]:
import os
import random
import torch
from torch.optim import SGD
from tqdm import tqdm



class Word2Vec:
    def __init__(self, data_path, vocabulary_size, embedding_size, learning_rate=1.0):

        self.corpus = read_own_data(data_path)

        self.data, self.word_count, self.word2index, self.index2word = build_dataset(self.corpus,
                                                                    vocabulary_size)
        self.vocabs = list(set(self.data))

        self.model: SkipGramNeg = SkipGramNeg(vocabulary_size, embedding_size).to(device)
        self.model_optim = SGD(self.model.parameters(), lr=learning_rate)


    def train(self, train_steps=4000, skip_window=2, num_skips=2, num_neg=7, batch_size=256, data_offest=0, vali_size=3, output_dir='out'):
        self.outputdir = os.mkdir(output_dir)

        avg_loss = 0
        pipeline = DataPipeline(self.data, self.vocabs ,self.word_count, data_offest)
        vali_examples = random.sample(self.vocabs, vali_size)
        losses = []
        progress_bar = tqdm(total=train_steps, desc='How many steps were done')

        for step in range(train_steps):
            batch_inputs, batch_labels = pipeline.generate_batch(batch_size, num_skips, skip_window)
            batch_neg = pipeline.get_neg_data(batch_size, num_neg, batch_inputs)

            batch_inputs = torch.tensor(batch_inputs, dtype=torch.long).to(device)
            batch_labels = torch.tensor(batch_labels, dtype=torch.long).to(device)
            batch_neg = torch.tensor(batch_neg, dtype=torch.long).to(device)

            loss = self.model(batch_inputs, batch_labels, batch_neg)
            self.model_optim.zero_grad()
            loss.backward()
            self.model_optim.step()

            avg_loss += loss.item()
            losses.append(loss.item())
            progress_bar.set_postfix(train_loss = np.mean(losses[-500:]))

            progress_bar.update(1)

        # save model at last
        torch.save(self.model.state_dict(), self.outputdir + '/model_step%d.pt' % train_steps)

    def save_model(self, out_path):
        torch.save(self.model.state_dict(), out_path + '/model.pt')

    def get_list_vector(self):
        sd = self.model.state_dict()
        return sd['input_emb.weight'].tolist()

    def load_model(self, model_path):
        self.model.load_state_dict(torch.load(model_path))

    def vector(self, index):
        self.model.predict(index)

In [0]:
word2vec = Word2Vec(data_path='text8',
                    vocabulary_size=200000,
                    embedding_size=300)
word2vec.train()

reading data...
corpus size 17005207


How many steps were done:   0%|          | 0/4000 [00:00<?, ?it/s]

vocabulary size 200000
unigram_table size: 2821


How many steps were done: 100%|██████████| 4000/4000 [31:17<00:00,  2.13it/s, train_loss=0.937]

TypeError: ignored

In [0]:
for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)

In [0]:
!rmdir out

In [0]:
vectors = word2vec.get_list_vector()

In [0]:
!wget https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt

--2019-12-24 14:42:06--  https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 603955 (590K) [text/plain]
Saving to: ‘questions-words.txt’


2019-12-24 14:42:06 (27.8 MB/s) - ‘questions-words.txt’ saved [603955/603955]



In [0]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Training times in seconds

def print_accuracy(model, questions_file):
    print('Evaluating...\n')
    acc = model.accuracy(questions_file)

    sem_correct = sum((len(acc[i]['correct']) for i in range(5)))
    sem_total = sum((len(acc[i]['correct']) + len(acc[i]['incorrect'])) for i in range(5))
    sem_acc = 100*float(sem_correct)/sem_total
    print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct, sem_total, sem_acc))
    
    syn_correct = sum((len(acc[i]['correct']) for i in range(5, len(acc)-1)))
    syn_total = sum((len(acc[i]['correct']) + len(acc[i]['incorrect'])) for i in range(5,len(acc)-1))
    syn_acc = 100*float(syn_correct)/syn_total
    print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct, syn_total, syn_acc))
    return (sem_acc, syn_acc)

word_analogies_file = 'questions-words.txt'
accuracies = []

In [0]:
print('Accuracy for Word2Vec:')
accuracies.append(print_accuracy(model, word_analogies_file))

  
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-12-24 14:46:02,883 : INFO : precomputing L2-norms of word weight vectors


Accuracy for Word2Vec:
Evaluating...



  if np.issubdtype(vec.dtype, np.int):
2019-12-24 14:46:06,319 : INFO : capital-common-countries: 0.0% (0/506)
2019-12-24 14:46:15,286 : INFO : capital-world: 0.0% (0/1452)
2019-12-24 14:46:16,959 : INFO : currency: 0.0% (0/268)
2019-12-24 14:46:26,837 : INFO : city-in-state: 0.0% (0/1571)
2019-12-24 14:46:28,785 : INFO : family: 0.0% (0/306)
2019-12-24 14:46:33,522 : INFO : gram1-adjective-to-adverb: 0.0% (0/756)
2019-12-24 14:46:35,476 : INFO : gram2-opposite: 0.0% (0/306)
2019-12-24 14:46:43,370 : INFO : gram3-comparative: 0.0% (0/1260)
2019-12-24 14:46:46,592 : INFO : gram4-superlative: 0.0% (0/506)
2019-12-24 14:46:52,898 : INFO : gram5-present-participle: 0.0% (0/992)
2019-12-24 14:47:01,525 : INFO : gram6-nationality-adjective: 0.1% (1/1371)
2019-12-24 14:47:09,911 : INFO : gram7-past-tense: 0.0% (0/1332)
2019-12-24 14:47:16,146 : INFO : gram8-plural: 0.0% (0/992)
2019-12-24 14:47:20,201 : INFO : gram9-plural-verbs: 0.0% (0/650)
2019-12-24 14:47:20,206 : INFO : total: 0.0% (1/12


Semantic: 0/4103, Accuracy: 0.00%
Syntactic: 1/8165, Accuracy: 0.01%



In [0]:
import pandas as pd
semeval = pd.read_csv('semeval.csv', sep=',')
semeval.head()

Unnamed: 0.1,Unnamed: 0,type,word1,word2,word3,target
0,0,2c,water,drop,hour,seconds
1,1,2c,mile,yard,hour,seconds
2,2,2c,time,moment,hour,seconds
3,3,2c,water,drop,feet,inches
4,4,2c,mile,yard,feet,inches


In [0]:
word2vec.index2word

300000

In [0]:
vec = open('vectors.txt', 'w')
vec.write(str(len(vectors))+' 300\n')
for indx, vector in enumerate(vectors):
  line = word2vec.index2word[indx]+' '
  for i in vector:
    line+=str(i)+' '
  if indx == (len(vectors)-1):
    print('last')
    vec.write(line[:-2])
  else:
    vec.write(line[:-2]+'\n')

last


In [0]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
model.most_similar(positive=['king', 'woman'], negative=['man'], topn=15)

  if np.issubdtype(vec.dtype, np.int):


[('for', 0.9849818348884583),
 ('on', 0.9849700927734375),
 ('that', 0.9849443435668945),
 ('to', 0.9848905801773071),
 ('is', 0.9848145246505737),
 ('and', 0.9847832322120667),
 ('by', 0.9847445487976074),
 ('of', 0.9847289323806763),
 ('wa', 0.9846924543380737),
 ('with', 0.9846853613853455),
 ('country', 0.9846317768096924),
 ('ha', 0.9846177101135254),
 ('which', 0.9845396876335144),
 ('from', 0.9845099449157715),
 ('in', 0.9844862222671509)]

In [0]:
for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)

In [0]:
s = 0
j = 0
for i in tqdm(range(semeval.shape[0])):
  a, b, c = semeval['word1'][i], semeval['word3'][i], semeval['word2'][i]
  try:
    res = model.most_similar(positive=[a, b], negative=[ c ], topn=10)
    words = [i for i,j in res]
    if semeval['target'][i] in words:
      s+=1
  except KeyError:
    j+=1
    continue

print(s/(semeval.shape[0]-j))

  if np.issubdtype(vec.dtype, np.int):
100%|██████████| 10014/10014 [05:56<00:00, 23.87it/s]

0.0007883770694898074





In [0]:
s

7

In [0]:
j

1185