## 下载wiki语料
https://dumps.wikimedia.org/zhwiki/20190720/

## 使用解压wiki语料包的工具
https://github.com/attardi/wikiextractor

## 繁体转化简体

In [None]:
!tail dataset/wiki_01

In [18]:
!curl -o langconv.py https://raw.githubusercontent.com/skydark/nstools/master/zhtools/langconv.py
!curl -o zh_wiki.py https://raw.githubusercontent.com/skydark/nstools/master/zhtools/zh_wiki.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7961  100  7961    0     0   2915      0  0:00:02  0:00:02 --:--:--  2913
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  139k  100  139k    0     0   4100      0  0:00:34  0:00:34 --:--:--  4236      0  0:00:36  0:00:34  0:00:02  5374


In [12]:
from langconv import Converter
import jieba

IN_FILE_PATHS = ['dataset/wiki_00', 'dataset/wiki_01']
OUT_SUFFIX = '_out'

def _count_file_lines(file_path):
    lines = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for index, line in enumerate(file):
            lines += 1

def convert_to_simplified_style_and_cut(in_file_path, out_file_path):
    out_file = open(out_file_path, 'a', encoding='utf-8')
     
    lines = _count_file_lines(in_file_path)
    print('lines=',lines)
    
    with tqdm(total=lines) as pbar:
        with open(in_file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line = Converter('zh-hans').convert(line)
                    token_list = jieba.cut(line)
                    new_line = ' '.join(token_list)
                    out_file.write(new_line)
                    out_file.write('\n')
                    pbar.update(1)

for ipath in IN_FILE_PATHS:
    opath = ipath + OUT_SUFFIX
    convert_to_simplified_style_and_cut(ipath, opath)


  0%|          | 139/11913599 [00:00<2:24:32, 1373.77it/s]

lines= 11913599


100%|██████████| 11913599/11913599 [1:20:18<00:00, 2472.52it/s]


In [14]:
!tail dataset/wiki_01_out



< / doc >
< doc   id = " 6666866 "   url = " https : / / zh . wikipedia . org / wiki ? curid = 6666866 "   title = " 麦 可 · 柏 辛格 " >
麦 可 · 柏 辛格

麦 可 · 柏 辛格 （ Mike   Bolsinger ， ）   ， 为 美国 的 棒球 选手 之一 ， 于于 2010 年 美国 职棒 选秀 为 亚利桑那 响尾蛇 选进 ， 曾于 2015 - 2016 年 二个 球季 于 洛杉矶 道奇 上过 大 联盟 。 2018 年 球季 效力 于 日本 职棒 千叶 罗德 海洋 队 ， 守备 位置 为 投手 。


< / doc >


## 训练词向量模型

In [35]:
import os
import re
import gensim
from tqdm import tqdm

COUNTS=5728930

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
        self.line_filter_pattern = re.compile('<.*>')
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):

            if 'out' not in fname:
                continue
            print('train:',fname)
            
            with tqdm(total=COUNTS) as pbar:
                for line in open(os.path.join(self.dirname, fname)):
                    line = line.strip()
                    if line=="" or self.line_filter_pattern.match(line):
                        continue
                    yield line.split()
                    pbar.update(1)


sentences = MySentences('dataset') # a memory-friendly iterator

# counts = 0
# for i,s in enumerate(sentences):
#     counts+=1
# print('counts=',counts)


In [36]:
model = gensim.models.Word2Vec(sentences)

  0%|          | 5355/5728930 [00:00<03:32, 26901.10it/s]

train: wiki_00_out


 76%|███████▌  | 4338861/5728930 [01:31<00:29, 47345.81it/s] 
  0%|          | 9774/5728930 [00:00<01:57, 48512.44it/s]

train: wiki_01_out


 24%|██▍       | 1390069/5728930 [00:28<01:29, 48475.42it/s]
  0%|          | 1286/5728930 [00:00<07:25, 12859.61it/s]

train: wiki_00_out


 76%|███████▌  | 4338861/5728930 [03:45<01:12, 19203.38it/s]
  0%|          | 2583/5728930 [00:00<04:02, 23608.74it/s]

train: wiki_01_out


 24%|██▍       | 1390069/5728930 [00:58<03:03, 23655.05it/s]
  0%|          | 1826/5728930 [00:00<05:15, 18124.80it/s]

train: wiki_00_out


 76%|███████▌  | 4338861/5728930 [03:37<01:09, 19907.37it/s]
  0%|          | 2367/5728930 [00:00<04:35, 20791.80it/s]

train: wiki_01_out


 24%|██▍       | 1390069/5728930 [01:05<03:25, 21128.19it/s]
  0%|          | 1930/5728930 [00:00<05:20, 17883.47it/s]

train: wiki_00_out


 76%|███████▌  | 4338861/5728930 [03:45<01:12, 19262.19it/s]
  0%|          | 2367/5728930 [00:00<04:16, 22313.54it/s]

train: wiki_01_out


 24%|██▍       | 1390069/5728930 [01:07<03:32, 20451.71it/s]
  0%|          | 1664/5728930 [00:00<05:46, 16524.70it/s]

train: wiki_00_out


 76%|███████▌  | 4338861/5728930 [03:44<01:11, 19347.43it/s]
  0%|          | 2367/5728930 [00:00<04:11, 22742.75it/s]

train: wiki_01_out


 24%|██▍       | 1390069/5728930 [01:07<03:32, 20448.63it/s]
  0%|          | 1670/5728930 [00:00<05:47, 16461.78it/s]

train: wiki_00_out


 76%|███████▌  | 4338861/5728930 [03:41<01:10, 19588.91it/s]
  0%|          | 2168/5728930 [00:00<04:28, 21300.21it/s]

train: wiki_01_out


 24%|██▍       | 1390069/5728930 [01:09<03:37, 19984.48it/s]


In [37]:
model.save('task-4-model')

## 加载模型测试同义词

In [1]:
import gensim
load_model = gensim.models.Word2Vec.load('task-4-model')

In [2]:
load_model.most_similar('记者')

  """Entry point for launching an IPython kernel.


[('新闻记者', 0.8838781118392944),
 ('摄影记者', 0.8573684096336365),
 ('美联社', 0.7827507257461548),
 ('法新社', 0.7473775148391724),
 ('路透社', 0.7413724660873413),
 ('美国之音', 0.7159520387649536),
 ('CNN', 0.7129102945327759),
 ('评论员', 0.7102958559989929),
 ('专栏作家', 0.7060286998748779),
 ('采访记者', 0.6976608037948608)]

## 词向量可视化

In [5]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    print('step1')
    with tqdm(total=len(load_model.wv.vocab)) as pbar:
        for word in model.wv.vocab:
            tokens.append(model[word])
            labels.append(word)
            pbar.update(1)
        
    #此处非常慢
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=250, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    print('step2')
    with tqdm(total=len(new_values)) as pbar:
        for value in new_values:
            x.append(value[0])
            y.append(value[1])
            pbar.update(1)
        
    plt.figure(figsize=(16, 16)) 
    print('step3')
    with tqdm(total=len(x)) as pbar:
        for i in range(len(x)):
            plt.scatter(x[i],y[i])
            plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
            pbar.update(1)
    plt.show()
    

In [None]:
tsne_plot(load_model)

  from ipykernel import kernelapp as app
  3%|▎         | 17780/695808 [00:00<00:03, 177796.35it/s]

step1


100%|██████████| 695808/695808 [00:03<00:00, 180999.99it/s]
