In [115]:
import os
import re
import logging
from gensim.models import word2vec

import jieba

# Configure root logging at INFO level with timestamped messages.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

home = os.path.expanduser('~')

# Data locations, all resolved under the current user's home directory.
raw_text_path = f'{home}/Documents/novel_data/raw/盘龙.txt'
process_text_path = f'{home}/Documents/novel_data/盘龙_cut.txt'
stop_words_path = f'{home}/Documents/novel_data/stop_words.txt'


def load_stop_words():
    """Read the stop-word file and return its non-blank lines.

    The file at ``stop_words_path`` is read as UTF-8, one entry per line.

    Returns:
        list[str]: every line with surrounding whitespace stripped, in file
        order; lines that are empty after stripping are skipped.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    with open(stop_words_path, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [entry for entry in stripped_lines if entry]


# This assignment must run after load_stop_words is defined; evaluating it
# earlier raises NameError on a fresh kernel.
# Only entries whose first character matches \w are kept.
stop_words = [w for w in load_stop_words() if re.match(r'\w', w)]


def remove_stop_words(words):
    """Filter a sequence of tokens.

    Args:
        words: iterable of string tokens.

    Returns:
        list: the tokens, in their original order, that are not present in
        the module-level ``stop_words`` and are not empty/whitespace-only.
    """
    kept = []
    for token in words:
        if token in stop_words:
            continue
        if token.strip():
            kept.append(token)
    return kept


def remove_title(text):
    """Strip chapter-title lines from ``text``.

    A title line is one that starts with '第', contains '章' followed by a
    space, and ends with a newline. The whole line, including its newline,
    is removed.

    Args:
        text: the text to clean; may contain many lines.

    Returns:
        str: ``text`` with every matching title line removed.
    """
    # re.MULTILINE makes '^' anchor at the start of every line. Without it
    # only a title on the very first line of the string would be removed.
    return re.sub('^第.*?章 .*\n', '', text, flags=re.MULTILINE)

    
def remove_special_sym(text):
    """Replace unwanted characters with spaces.

    Every character that is not a newline, an ASCII letter (A-Z, a-z) or a
    character in the range U+4E00-U+9FA5 is replaced by a single space.
    The replacement is one-for-one, so the length of the text is unchanged.

    Args:
        text: the text to clean.

    Returns:
        str: the cleaned text.
    """
    # Inside a character class only the leading '^' negates; any further '^'
    # is a literal caret. Repeating it before each range therefore kept '^'
    # in the output, so the extra carets are dropped here.
    return re.sub('[^\nA-Za-z\u4e00-\u9fa5]', ' ', text)


def remove_redundancy_space(text):
    """Collapse repeated spaces and repeated newlines.

    Runs of one or more space characters become a single space, then runs of
    one or more newline characters become a single newline.

    Args:
        text: the text to normalise.

    Returns:
        str: the normalised text.
    """
    single_spaced = re.compile(' +').sub(' ', text)
    return re.compile('\n+').sub('\n', single_spaced)


def pre_process(text):
    """Clean raw text in two steps.

    Applies ``remove_title`` first and then ``remove_special_sym`` to its
    result.

    Args:
        text: the raw text.

    Returns:
        str: the cleaned text.
    """
    return remove_special_sym(remove_title(text))


def word2vec_from_file(path=process_text_path):
    """Train a Word2Vec model on a line-per-sentence corpus file.

    The file is streamed through ``word2vec.LineSentence``. The model is
    trained with hierarchical softmax (``hs=1``), ``min_count=1``, a context
    window of 6 and 100-dimensional vectors.

    Args:
        path: corpus file to train on; defaults to ``process_text_path``.

    Returns:
        The trained ``word2vec.Word2Vec`` model.
    """
    import inspect

    sentences = word2vec.LineSentence(path)

    # gensim 4.0 renamed the dimensionality keyword from ``size`` to
    # ``vector_size``. Use whichever name the installed version accepts so
    # the call works on both 3.x and 4.x.
    accepted = inspect.signature(word2vec.Word2Vec.__init__).parameters
    dim_keyword = 'vector_size' if 'vector_size' in accepted else 'size'

    train_kwargs = {
        'hs': 1,
        'min_count': 1,
        'window': 6,
        dim_keyword: 100,
    }
    return word2vec.Word2Vec(sentences, **train_kwargs)

In [116]:
# Train a Word2Vec model using the default corpus path (process_text_path).
model = word2vec_from_file()

2020-07-07 01:54:48,770 : INFO : collecting all words and their counts
2020-07-07 01:54:48,772 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-07 01:54:48,923 : INFO : PROGRESS: at sentence #10000, processed 249233 words, keeping 25488 word types
2020-07-07 01:54:49,059 : INFO : PROGRESS: at sentence #20000, processed 470202 words, keeping 37556 word types
2020-07-07 01:54:49,210 : INFO : PROGRESS: at sentence #30000, processed 706284 words, keeping 48276 word types
2020-07-07 01:54:49,357 : INFO : PROGRESS: at sentence #40000, processed 939725 words, keeping 56533 word types
2020-07-07 01:54:49,504 : INFO : PROGRESS: at sentence #50000, processed 1163244 words, keeping 64080 word types
2020-07-07 01:54:49,648 : INFO : PROGRESS: at sentence #60000, processed 1389487 words, keeping 71271 word types
2020-07-07 01:54:49,799 : INFO : PROGRESS: at sentence #70000, processed 1635124 words, keeping 78462 word types
2020-07-07 01:54:49,857 : INFO : collected 

In [117]:
# Query the trained vectors for the words most similar to '柯沃特';
# the cell output lists (word, similarity score) pairs.
model.wv.most_similar('柯沃特')

2020-07-07 01:56:45,076 : INFO : precomputing L2-norms of word weight vectors


[('柯沃', 0.8462332487106323),
 ('爷爷', 0.824772298336029),
 ('克沃', 0.7402173280715942),
 ('用于', 0.7301793098449707),
 ('杖上', 0.7089070081710815),
 ('绿纹', 0.7068555951118469),
 ('愈早', 0.675171434879303),
 ('师了', 0.6642724871635437),
 ('林雷不行', 0.6627496480941772),
 ('装配', 0.659206748008728)]