In [17]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint as print

import sys
sys.path.append('../data_loader')

import MeCab
import gensim

import novel


In [18]:
class TokenizedDataLoader:
    """Iterable adapter that turns every item of a wrapped loader into tokens.

    Each item produced by the wrapped loader is run through a MeCab tagger
    configured for wakati output, and the tagger's result is split on
    whitespace into a list of tokens.
    """

    def __init__(self, data_loader) -> None:
        """Remember ``data_loader`` and construct the MeCab tagger.

        Args:
            data_loader: Iterable whose items are passed to ``tokenize``
                (presumably lines of text -- confirm against the loader).
        """
        # The dictionary path is environment specific (conda install of
        # mecab-ipadic-neologd); -Owakati selects the wakati output format.
        self._tagger = MeCab.Tagger('-d /opt/conda/lib/mecab/dic/mecab-ipadic-neologd -Owakati')
        self._data_loader = data_loader

    def __iter__(self):
        """Lazily yield one token list per item of the wrapped loader.

        A new generator is created on every call, so the object can be
        iterated repeatedly as long as the wrapped loader allows it.
        """
        yield from map(self.tokenize, self._data_loader)

    def tokenize(self, str):
        """Return the whitespace-separated tokens the tagger emits for ``str``.

        Note: the parameter name shadows the built-in ``str`` inside this
        method; it is kept unchanged for compatibility with keyword callers.
        """
        wakati_text = self._tagger.parse(str)
        return wakati_text.split()

In [19]:
# Wrap the raw novel loader so that iterating yields one token list per item
# (presumably one list per line of text under ../data/narou -- confirm in novel.DataLoader).
data_loader = TokenizedDataLoader(novel.DataLoader('../data/narou'))

# FastText created with library-default hyperparameters; the vocabulary is
# built here in its own pass over the corpus, separately from training.
model = gensim.models.FastText()
model.build_vocab(corpus_iterable=data_loader)


# Disabled sanity check: prints the first 10 tokenized items from the loader.
# counter = 0
# for sentences in data_loader:
#     counter += 1
#     if counter > 10:
#         break
#     print(sentences)

2022-01-02 02:58:04,558 : INFO : FastText lifecycle event {'params': 'FastText(vocab=0, vector_size=100, alpha=0.025)', 'datetime': '2022-01-02T02:58:04.558454', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:59:51) \n[GCC 9.4.0]', 'platform': 'Linux-5.10.60.1-microsoft-standard-WSL2-x86_64-with-glibc2.10', 'event': 'created'}
2022-01-02 02:58:04,560 : INFO : collecting all words and their counts
2022-01-02 02:58:04,599 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-01-02 02:58:05,488 : INFO : PROGRESS: at sentence #10000, processed 108386 words, keeping 7135 word types
2022-01-02 02:58:06,339 : INFO : PROGRESS: at sentence #20000, processed 226389 words, keeping 10600 word types
2022-01-02 02:58:07,112 : INFO : PROGRESS: at sentence #30000, processed 337070 words, keeping 13407 word types
2022-01-02 02:58:07,912 : INFO : PROGRESS: at sentence #40000, processed 449202 words, keeping 16013 word types
2022-01-02

In [22]:
# Inspect the values training depends on. `print` is pprint here (aliased in the import cell).
# corpus_count is expected to have been populated by the earlier build_vocab pass -- verify.
print(model.corpus_count)
print(model.epochs)
# Train on the same tokenized corpus; the epoch count is passed explicitly
# rather than taken from model.epochs.
model.train(corpus_iterable=data_loader, total_examples=model.corpus_count, epochs=5)

2022-01-02 03:57:55,160 : INFO : FastText lifecycle event {'msg': 'training model with 3 workers on 210258 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-01-02T03:57:55.160459', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:59:51) \n[GCC 9.4.0]', 'platform': 'Linux-5.10.60.1-microsoft-standard-WSL2-x86_64-with-glibc2.10', 'event': 'train'}


38211069
None


2022-01-02 03:57:56,172 : INFO : EPOCH 1 - PROGRESS: at 0.02% examples, 67297 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:57:57,206 : INFO : EPOCH 1 - PROGRESS: at 0.05% examples, 73370 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:57:58,259 : INFO : EPOCH 1 - PROGRESS: at 0.08% examples, 76335 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:57:59,357 : INFO : EPOCH 1 - PROGRESS: at 0.11% examples, 78079 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:58:00,410 : INFO : EPOCH 1 - PROGRESS: at 0.14% examples, 77237 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:58:01,493 : INFO : EPOCH 1 - PROGRESS: at 0.17% examples, 77315 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:58:02,542 : INFO : EPOCH 1 - PROGRESS: at 0.19% examples, 78519 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:58:03,568 : INFO : EPOCH 1 - PROGRESS: at 0.23% examples, 85256 words/s, in_qsize 0, out_qsize 0
2022-01-02 03:58:04,573 : INFO : EPOCH 1 - PROGRESS: at 0.26% examples, 90626 words/s, in_qsize 0, out_qsize 0
2

(1651461304, 2567534030)

In [30]:
from gensim.test.utils import get_tmpfile
from pathlib import Path

# Resolve to an absolute path so the printed and saved location is unambiguous
# regardless of the kernel's working directory.
model_dir = Path('../models').resolve()
model_file_path = model_dir / 'fasttext/jpnovels.model'
# The model file and its companion .npy arrays are written next to each other
# in this directory, so make sure it exists first; otherwise the save would
# fail on a fresh checkout only after the (very long) training has finished.
model_file_path.parent.mkdir(parents=True, exist_ok=True)
print(model_file_path)

# Alternative kept for reference: save to a gensim temp file instead.
# fname = get_tmpfile('../models/fasttext/jpnovels.model')
model.save(str(model_file_path))
# Reload counterpart of the alternative above (uses `fname` from that line).
# model = gensim.models.FastText.load(fname)

2022-01-02 13:38:35,229 : INFO : FastText lifecycle event {'fname_or_handle': '/workspaces/pytorch-practice/models/fasttext/jpnovels.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-01-02T13:38:35.229344', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:59:51) \n[GCC 9.4.0]', 'platform': 'Linux-5.10.60.1-microsoft-standard-WSL2-x86_64-with-glibc2.10', 'event': 'saving'}
2022-01-02 13:38:35,230 : INFO : not storing attribute vectors
2022-01-02 13:38:35,231 : INFO : storing np array 'vectors_vocab' to /workspaces/pytorch-practice/models/fasttext/jpnovels.model.wv.vectors_vocab.npy


PosixPath('/workspaces/pytorch-practice/models/fasttext/jpnovels.model')


2022-01-02 13:38:35,844 : INFO : storing np array 'vectors_ngrams' to /workspaces/pytorch-practice/models/fasttext/jpnovels.model.wv.vectors_ngrams.npy
2022-01-02 13:38:42,123 : INFO : not storing attribute buckets_word
2022-01-02 13:38:42,324 : INFO : saved /workspaces/pytorch-practice/models/fasttext/jpnovels.model


In [37]:
# Out-of-vocabulary probe: the literal below means "a word not listed in the dictionary".
word = '辞書に載ってない単語'
# Check whether the exact string is a key of the trained vocabulary.
print(word in model.wv.key_to_index)
# Look the string up anyway; in the recorded run this still printed a vector
# although the membership test printed False (presumably built from
# FastText character n-grams -- confirm in the gensim docs).
print(model.wv[word])

# Disabled check: is 'demon lord' part of the vocabulary?
# '魔王' in model.wv.key_to_index

False
array([ 0.20234865,  0.19015083, -0.03907   , -0.10548971, -0.15955906,
       -0.04225236, -0.00135004, -0.04805985, -0.16450767,  0.0649535 ,
        0.05733949,  0.11774732, -0.08138777,  0.02105304,  0.08612979,
       -0.03201374, -0.12238283, -0.05169674,  0.09256034, -0.13538739,
        0.01642215,  0.00365097, -0.2436944 , -0.05707944,  0.18832046,
        0.1202508 , -0.09876697,  0.13335638, -0.04679824, -0.14218159,
        0.00359463, -0.02055242,  0.05410699,  0.19722319,  0.15952708,
        0.01882049, -0.0141467 , -0.1440396 ,  0.07095248,  0.04640691,
        0.21260825,  0.0525463 ,  0.11840938,  0.16349253,  0.15365279,
        0.02043385,  0.01482765, -0.08747672, -0.03242161, -0.00974826,
        0.02575356, -0.02766098,  0.18800311,  0.08577812, -0.20503153,
        0.02908729,  0.06556907,  0.110286  , -0.03516688, -0.14466716,
        0.17670138, -0.01383874,  0.17246062,  0.16430734, -0.2235949 ,
       -0.04561975, -0.13003004,  0.15513408,  0.0575178 ,

In [60]:
# Analogy-style query: words most similar to "goblin" + "human" - "monster".
print(model.wv.most_similar(positive=['ゴブリン', '人間'], negative=['モンスター']))

[('鳥人間', 0.7889867424964905),
 ('棒人間', 0.7432736754417419),
 ('獣人', 0.7274637818336487),
 ('亜人', 0.7189075350761414),
 ('魔族', 0.6990464925765991),
 ('男', 0.6971792578697205),
 ('馬と人間', 0.6965796947479248),
 ('野武士', 0.6895461082458496),
 ('クソエルフ', 0.6858006119728088),
 ('武士', 0.6845240592956543)]


In [118]:
# Analogy-style query: words most similar to "elf" + "demon lord" - "hero".
print(model.wv.most_similar(positive=['エルフ', '魔王'], negative=['勇者']))

[('ダークエルフ', 0.7977114319801331),
 ('エルフモドキ', 0.7838773727416992),
 ('エルフゾンビ', 0.7753736972808838),
 ('エルフ組', 0.7593133449554443),
 ('レズエルフ', 0.7592384815216064),
 ('エルダーエルフ', 0.7549557089805603),
 ('スキュラ', 0.7500725984573364),
 ('フォレ・エルフ', 0.7478818893432617),
 ('ブラックエルフ', 0.7415013313293457),
 ('竜人', 0.7412238121032715)]


In [119]:
# Mirror of the previous query: words most similar to "elf" + "hero" - "demon lord".
print(model.wv.most_similar(positive=['エルフ', '勇者'], negative=['魔王']))

[('モブエルフ', 0.8534035086631775),
 ('エルフ組', 0.8086045384407043),
 ('ロリエルフ', 0.8055376410484314),
 ('ダークエルフ', 0.8004938960075378),
 ('ショタエルフ', 0.7986521124839783),
 ('レズエルフ', 0.7849956154823303),
 ('ハーフドワーフ', 0.7798111438751221),
 ('エリートエルフ', 0.7646079659461975),
 ('ハーフエルフ', 0.762889564037323),
 ('エルフメイド', 0.7497095465660095)]
