### 使用gensim训练word2vec

In [1]:
import logging
import random
import numpy as np
import pandas as pd
import torch

# 日志？?
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# set seed
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x1e4001f9910>

#### 将训练集数据进行10折划分，保持类别的分布一致（代码拆开）

In [2]:
n_fold = 10
n_example = 10000
data_file = './train_set.csv.zip'
train = pd.read_csv(data_file, sep='\t')[:n_example]

In [3]:
# 数据直接全局变量
train_texts = train['text'].tolist()
train_labels = train['label'].tolist()
n_total = len(train_labels)

In [4]:
def all_data2index(fold_num):
    """读取df，将series转换为list进行处理"""
        
    # 1.所有数据打乱，通过打乱列表的索引来实现 / 通过sklearn的shuffle模块？
    index = list(range(n_total))
    np.random.shuffle(index)
    all_texts = []
    all_labels = []
    for i in index:
        all_texts.append(train_texts[i])
        all_labels.append(train_labels[i])
    
    # 2.将所有数据按照类别进行划分，通过索引实现：字典检查某个键是否存在，不存在，就创建列表，存在则往列表里添加
    label2id = {}
    for i in range(n_total):
        label = str(all_labels[i])
        # 字典检查某个键是否存在？？ 不加.keys?
        if label not in label2id:
            label2id[label] = [i]
        else:
            label2id[label].append(i)
    
    # 3.
    all_index = [[] for _ in range(fold_num)]
    for label, data in label2id.items():
        # print(label, len(data))
        batch_size = int(len(data) / fold_num)
        other = len(data) - batch_size * fold_num
        
        # 对每个类别都进行10折划分
        for i in range(fold_num):
            # if判断用于赋值
            
            # ？
            cur_batch_size = batch_size + 1 if i < other else batch_size
            # print(cur_batch_size)
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            
            # 总共包含10个列表，每个列表都包含所有类别的数据
            all_index[i].extend(batch_data)
            # 等价于 all_index = [], all_index.append(batch_data)
            
            
    return all_texts, all_labels, all_index

In [5]:
all_texts, all_labels, all_index = all_data2index(n_fold)

In [6]:
def index2fold_data(all_texts, all_labels, all_index, fold_num):
    """这里的 texts、labels 是 fold_texts、fold_labels"""
    
    all_fold_data = []   
    
    # 4.根据每折的索引 划分出每折的数据，然后打乱
    # 平均每折的数据量    
    batch_size = int(n_total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        # 每折的数据量
        num = len(all_index[fold])
        # 从所有数据索引中 索引出 每折数据 对应的text和label的索引
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]
        
        # 如果每折的数据量 > 平均每折的数据量，对该折的数据进行缩减，只取到平均每折的数据量
        if num > batch_size:
            fold_texts = texts[:batch_size]
            fold_labels = labels[:batch_size]           
            other_texts.extend(texts[batch_size:])
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
            
        # 如果每折的数据量 < 平均每折的数据量，则将上折剩余的数据补充到该折数据（列表的加法），直到取到平均每折的数据量
        elif num < batch_size:
            end = start + batch_size - num
            # 如果上折剩余的数据量不足以补充该折数据呢，索引就会报错啊？？？？？？？？？？？？？？？
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            # 前面被补充过的数据不再使用
            start = end
        
        # 如果每折的数据量 = 平均每折的数据量，该折的数据进行缩减，只取到平均
        else:
            fold_texts = texts
            fold_labels = labels
        
        # 确保每折的数据量都等同于 平均每折的数据量
        assert batch_size == len(fold_labels)
    
    # 那多出来的数据呢？？？？？？？？？？？？？？？？？？？？？
    
        # 对该折的数据进行打乱，通过列表的索引
        fold_index = list(range(batch_size))
        np.random.shuffle(fold_index)
        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in fold_index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])
        
        # 将每折数据添加到 总划分数据里
        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        all_fold_data.append(data)
    
    # 记录输出 十折划分后 每折的数据量？？？
    logging.info("Fold lens %s", str([len(fold_data['label']) for fold_data in all_fold_data]))

    return all_fold_data

In [7]:
all_fold_datas = index2fold_data(all_texts, all_labels, all_index, n_fold)

2020-07-24 09:51:09,683 INFO: Fold lens [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]


In [8]:
print(len(all_fold_datas), '\n')

all_folds = {}
for i, fold_data in enumerate(all_fold_datas):
    print(len(fold_data['label']), len(fold_data['text']))
    all_folds['fold_' + str(i)] = pd.Series(fold_data['label']).value_counts() / len(fold_data['label'])

10 

1000 1000
1000 1000
1000 1000
1000 1000
1000 1000
1000 1000
1000 1000
1000 1000
1000 1000
1000 1000


查看每折的数据的数量和类别分布是否一致，**第一折的数据中没有类别13**  

为什么分布结果和上个（没拆开，不是随机打乱数据？）一致？？？

In [9]:
pd.DataFrame(all_folds)

Unnamed: 0,fold_0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9
0,0.189,0.189,0.189,0.189,0.189,0.188,0.188,0.188,0.188,0.188
1,0.187,0.187,0.187,0.187,0.187,0.187,0.186,0.186,0.186,0.186
2,0.157,0.156,0.156,0.156,0.156,0.156,0.156,0.156,0.156,0.156
3,0.108,0.108,0.108,0.108,0.108,0.108,0.108,0.108,0.107,0.107
4,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.079,0.078,0.078
5,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.061
6,0.051,0.051,0.051,0.051,0.051,0.05,0.05,0.05,0.05,0.05
7,0.043,0.043,0.043,0.043,0.043,0.043,0.042,0.042,0.042,0.042
8,0.042,0.042,0.042,0.042,0.042,0.041,0.041,0.041,0.041,0.041
9,0.031,0.031,0.031,0.031,0.031,0.031,0.031,0.031,0.031,0.031


#### build train data for word2vec

In [10]:
fold_id = 9

train_texts = []
for i in range(fold_id):
    data = all_fold_datas[i]
    train_texts.extend(data['text'])
    
logging.info('Total %d docs.' % len(train_texts))

2020-07-24 09:52:54,378 INFO: Total 9000 docs.


In [11]:
type(train_texts)

list

In [12]:
map(lambda x: list(x.split()), train_texts[:2])

<map at 0x1e402a820f0>

In [17]:
# list可视化
a = list(map(lambda x: list(x.split()), train_texts[:2]))
len(a), a

(2,
 [['2109',
   '932',
   '3335',
   '7261',
   '3659',
   '3370',
   '4464',
   '4464',
   '1519',
   '2716',
   '1970',
   '1363',
   '5519',
   '3266',
   '6862',
   '4933',
   '1080',
   '6122',
   '6050',
   '299',
   '2786',
   '7495',
   '2435',
   '4568',
   '5915',
   '134',
   '2465',
   '4464',
   '4464',
   '2073',
   '3659',
   '6065',
   '4853',
   '2087',
   '6286',
   '3750',
   '932',
   '2848',
   '2444',
   '3155',
   '3772',
   '3335',
   '7261',
   '3659',
   '3370',
   '4464',
   '4464',
   '1519',
   '2716',
   '1970',
   '1363',
   '5519',
   '3266',
   '6862',
   '4933',
   '2490',
   '1080',
   '6122',
   '6050',
   '299',
   '2786',
   '4648',
   '6122',
   '1906',
   '7160',
   '4480',
   '299',
   '6630',
   '3500',
   '3523',
   '6093',
   '5330',
   '299',
   '2786',
   '4499',
   '7010',
   '900',
   '2490',
   '2716',
   '1970',
   '3659',
   '3370',
   '4464',
   '4464',
   '1519',
   '1363',
   '5519',
   '3266',
   '6862',
   '4933',
   '3335',
   

In [24]:
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_features = 100     # Word vector dimensionality
num_workers = 3      # Number of threads to run in parallel

# 对train_texts中的每个元素都运用该自定义函数，list使得map对象 可被展示
train_texts = list(map(lambda x: list(x.split()), train_texts))
model = Word2Vec(train_texts, workers=num_workers, size=num_features)
model.init_sims(replace=True)

# save model
model.save("./word2vec.bin")

2020-07-24 10:05:37,016 INFO: Start training...
2020-07-24 10:05:39,427 INFO: collecting all words and their counts
2020-07-24 10:05:39,427 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-24 10:05:40,449 INFO: collected 5295 word types from a corpus of 8191447 raw words and 9000 sentences
2020-07-24 10:05:40,449 INFO: Loading a fresh vocabulary
2020-07-24 10:05:40,521 INFO: effective_min_count=5 retains 4335 unique words (81% of original 5295, drops 960)
2020-07-24 10:05:40,522 INFO: effective_min_count=5 leaves 8189498 word corpus (99% of original 8191447, drops 1949)
2020-07-24 10:05:40,533 INFO: deleting the raw counts dictionary of 5295 items
2020-07-24 10:05:40,534 INFO: sample=0.001 downsamples 61 most-common words
2020-07-24 10:05:40,534 INFO: downsampling leaves estimated 7070438 word corpus (86.3% of prior 8189498)
2020-07-24 10:05:40,544 INFO: estimated required memory for 4335 words and 100 dimensions: 5635500 bytes
2020-07-24 10:05:40,545 INF

#### 加载word2vec模型

In [25]:
# load model
model = Word2Vec.load("./word2vec.bin")

# convert format
model.wv.save_word2vec_format('./word2vec.txt', binary=False)

2020-07-24 10:07:52,255 INFO: loading Word2Vec object from ./word2vec.bin
2020-07-24 10:07:52,299 INFO: loading wv recursively from ./word2vec.bin.wv.* with mmap=None
2020-07-24 10:07:52,299 INFO: setting ignored attribute vectors_norm to None
2020-07-24 10:07:52,300 INFO: loading vocabulary recursively from ./word2vec.bin.vocabulary.* with mmap=None
2020-07-24 10:07:52,300 INFO: loading trainables recursively from ./word2vec.bin.trainables.* with mmap=None
2020-07-24 10:07:52,301 INFO: setting ignored attribute cum_table to None
2020-07-24 10:07:52,301 INFO: loaded ./word2vec.bin
2020-07-24 10:07:52,312 INFO: storing 4335x100 projection weights into ./word2vec.txt
