In [1]:
from gensim.models import Word2Vec
# Load the Word2Vec model stored under this file name.
# NOTE(review): the name suggests 300 features / min count 40 / window 10 — confirm against the training run.
model = Word2Vec.load("300features_40minwords_10context")

In [2]:
# Check which container type holds the model's word vectors
type(model.wv.vectors)

numpy.ndarray

In [3]:
# Inspect the shape of the matrix made up of all word vectors
model.wv.vectors.shape

(16490, 300)

In [4]:
# Look up the vector stored for the word "flower"
model.wv["flower"]

array([ 1.70418285e-02,  4.17075828e-02,  4.68027405e-02,  5.77561073e-02,
       -4.90124077e-02, -9.11914781e-02,  1.25736268e-02,  2.53656775e-01,
        2.55882926e-02, -9.93438065e-02, -3.88848074e-02,  5.98192168e-03,
        5.11968732e-02,  8.12970102e-02, -4.02569361e-02, -9.01165977e-03,
       -4.89391647e-02, -1.06224619e-01, -1.51742483e-03,  4.80879843e-02,
       -5.73582249e-03,  3.22901085e-02, -3.04702036e-02,  1.21747874e-01,
        1.15162851e-02, -1.89318955e-02,  3.02785505e-02, -1.88510306e-02,
       -1.93555709e-02,  1.73930470e-02,  8.91037136e-02, -3.36124338e-02,
        1.24498142e-03,  2.34751496e-03, -7.73967197e-03,  3.07026040e-02,
        3.07262540e-02, -1.02033511e-01, -1.09931326e-03,  9.13682282e-02,
       -4.12567668e-02,  3.09050386e-03, -1.20065305e-02, -2.85921004e-02,
       -2.22854987e-02,  5.53633608e-02,  6.29955754e-02,  4.77065556e-02,
        8.30905791e-03,  4.66902144e-02,  1.02785520e-01,  7.69525170e-02,
        1.48280161e-02, -

In [5]:
import numpy as np  

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of every in-vocabulary word in ``words``.

    Args:
        words: iterable of word strings (one review / paragraph).
        model: object exposing ``model.wv`` with a ``key_to_index`` mapping
            and ``model.wv[word]`` vector lookup (gensim 4 ``Word2Vec``).
        num_features: length of the returned vector.

    Returns:
        A 1-D array of length ``num_features``. It is the mean of the vectors
        of the words found in the vocabulary, or all zeros (float32) when no
        word is found.
    """
    keyed_vectors = model.wv
    # Use the model's own word -> index mapping for membership tests instead of
    # copying the whole vocabulary into a new set on every call (this function
    # runs once per review, so that copy dominated the runtime).
    vocabulary = keyed_vectors.key_to_index

    # Running sum of the vectors, started as a zero vector
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.

    # Add the vector of each word that the model knows about
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, keyed_vectors[word])

    # Divide by the number of contributing words to get the average;
    # skipped when nothing matched so we never divide by zero.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Build one averaged feature vector per review.

    Args:
        reviews: sequence of reviews, each a list of words.
        model: word-vector model passed through to ``makeFeatureVec``.
        num_features: length of each feature vector.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose row
        ``i`` is ``makeFeatureVec(reviews[i], model, num_features)``.
    """
    total = len(reviews)
    # Allocate the whole result matrix up front and fill it row by row
    averaged = np.zeros((total, num_features), dtype="float32")
    for position, tokens in enumerate(reviews):
        # Emit a progress line every 5000 reviews (including the first one)
        if position % 5000 == 0:
            print ("正在处理第 %d 条评论，共 %d 条" % (position, total))
        averaged[position] = makeFeatureVec(tokens, model, num_features)
    return averaged
# Convert the text of a movie review into a list of words
def review_to_wordlist(review, remove_stopwords=False):
    """Strip markup from ``review`` and split it into lowercase words.

    The text is parsed with BeautifulSoup (lxml parser), every character that
    is not an ASCII letter is replaced by a space, and the result is
    lowercased and split on whitespace. When ``remove_stopwords`` is true,
    words listed in ``stopwords.words("english")`` are dropped.

    NOTE(review): ``re`` and ``stopwords`` are not imported in any visible
    cell — presumably ``import re`` and ``from nltk.corpus import stopwords``
    ran in a cell that is no longer shown; confirm before Restart & Run All.
    """
    plain_text = BeautifulSoup(review, "lxml").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

# Split a review into a list of sentences, each sentence being a list of words
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Tokenize ``review`` into sentences and turn each into a word list.

    ``tokenizer.tokenize`` is called on the whitespace-stripped review; every
    non-empty piece it returns is passed to ``review_to_wordlist`` together
    with ``remove_stopwords``. Returns the list of resulting word lists.
    """
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in tokenizer.tokenize(review.strip())
        if len(piece) > 0
    ]

In [11]:
# ****************************************************************
# Using the functions defined above, compute the average feature vectors
# for the training and test sets. Note that stop words are removed this time.
import pandas as pd
from bs4 import BeautifulSoup
# Tab-separated input files with a header row; quoting=3 is csv.QUOTE_NONE.
train = pd.read_csv("../../tutorialData/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("../../tutorialData/testData.tsv", header=0, delimiter="\t", quoting=3)
# NOTE(review): only num_features is used below; min_word_count, num_workers,
# context and downsampling are not referenced by any visible cell.
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words
# Clean every training review into a stop-word-free word list
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

# One averaged word vector per training review
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print ("正在为测试评论创建平均特征向量")
# Same cleaning and averaging for the test reviews
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

正在处理第 0 条评论，共 25000 条
正在处理第 5000 条评论，共 25000 条
正在处理第 10000 条评论，共 25000 条
正在处理第 15000 条评论，共 25000 条
正在处理第 20000 条评论，共 25000 条
正在为测试评论创建平均特征向量
正在处理第 0 条评论，共 25000 条
正在处理第 5000 条评论，共 25000 条
正在处理第 10000 条评论，共 25000 条
正在处理第 15000 条评论，共 25000 条
正在处理第 20000 条评论，共 25000 条


In [12]:
# Fit a random forest with 100 decision trees on the averaged word vectors
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the fitted forest, and therefore the submission
# file written below, reproducible across Restart & Run All.
forest = RandomForestClassifier( n_estimators = 100, random_state = 0 )

print ("正在将随机森林模型拟合到带标签的训练数据上...")
forest = forest.fit( trainDataVecs, train["sentiment"] )

# Predict the sentiment of the test reviews
result = forest.predict( testDataVecs )

# Write the test results (quoting=3 is csv.QUOTE_NONE)
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )
print ("写入完成...")

正在将随机森林模型拟合到带标签的训练数据上...
写入完成...


In [14]:
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
import time

# Set "k" (the number of clusters) to 1/5 of the vocabulary size,
# i.e. an average of 5 words per cluster.
model = Word2Vec.load("300features_40minwords_10context")
word_vectors = model.wv.vectors
num_clusters = word_vectors.shape[0] // 5

# Start the clock only now, so the reported time covers the clustering itself
# and not the model load above.
start = time.time()

# Initialise a KMeans object and use it to assign each word vector to a cluster.
# n_init='auto' is set explicitly to avoid a FutureWarning in later versions;
# random_state is fixed so the cluster assignments are reproducible.
kmeans_clustering = KMeans( n_clusters = num_clusters, n_init='auto', random_state = 0 )
idx = kmeans_clustering.fit_predict( word_vectors )

# Take the end time and print how long the clustering took
end = time.time()
elapsed = end - start
print ("KMeans 聚类耗时: ", elapsed, "秒。")

KMeans 聚类耗时:  105.08570528030396 秒。


In [15]:
# Build a dictionary mapping each vocabulary word to the number of the cluster
# it was assigned to (idx is paired with model.wv.index_to_key position by position)
word_centroid_map = dict(zip( model.wv.index_to_key, idx ))

In [20]:
# Group the vocabulary by cluster in a single pass, instead of rebuilding the
# key/value lists and rescanning the whole map once per printed cluster.
# Words keep the order in which they appear in word_centroid_map.
words_by_cluster = {}
for word, cluster_id in word_centroid_map.items():
    words_by_cluster.setdefault(int(cluster_id), []).append(word)

# Walk through the first 10 clusters
for cluster in range(0,10):
    #
    # Print the cluster number
    print ("\nCluster %d" % cluster)
    #
    # Print all words that belong to this cluster (empty list if none)
    words = words_by_cluster.get(cluster, [])
    print (words)


Cluster 0
['loyal', 'helpful', 'lifelong']

Cluster 1
['dangers', 'mythic', 'perils', 'feats', 'superficially', 'secretive']

Cluster 2
['elliott', 'nicolas', 'gibbs', 'denholm', 'baumbach']

Cluster 3
['inferno', 'ouch', 'heavenly', 'mercury', 'towering', 'fountain', 'stuntman', 'goliath']

Cluster 4
['mundane']

Cluster 5
['lingering', 'zoom', 'microphone']

Cluster 6
['got', 'saw', 'watched', 'bought', 'rented', 'purchased']

Cluster 7
['deadly', 'battling', 'havoc', 'deformed', 'backwoods', 'revolt', 'bloodthirsty', 'cannibalistic', 'lurks', 'hordes', 'terrorize', 'horde']

Cluster 8
['convince', 'teach', 'warn', 'haunt', 'inform', 'embarrass', 'announce', 'informing']

Cluster 9
['ringo', 'caligula', 'moog']


In [None]:
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of ``wordlist`` fall into each cluster.

    Args:
        wordlist: iterable of words (one review).
        word_centroid_map: dict mapping a word to its integer cluster index.
        num_centroids: length of the returned vector. When omitted it is
            derived as the largest cluster index in ``word_centroid_map``
            plus one (zero for an empty map). Passing it explicitly avoids
            rescanning the whole map on every call and guarantees the width
            callers expect.

    Returns:
        A 1-D float32 array of length ``num_centroids`` where element ``c`` is
        the number of words in ``wordlist`` mapped to cluster ``c``. Words
        absent from ``word_centroid_map`` are ignored.
    """
    if num_centroids is None:
        # default=-1 makes an empty map yield zero clusters instead of raising
        num_centroids = max( word_centroid_map.values(), default=-1 ) + 1
    #
    # Pre-allocate the bag-of-centroids vector
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # For every word that is in the vocabulary, increment the count of the
    # cluster it belongs to. Compare against None because cluster 0 is valid.
    for word in wordlist:
        index = word_centroid_map.get( word )
        if index is not None:
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [None]:
# Pre-allocate one bag-of-centroids row per training review
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Turn each cleaned training review into its bag of centroids
for counter, review in enumerate(clean_train_reviews):
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)

# Do exactly the same for the test reviews
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")

for counter, review in enumerate(clean_test_reviews):
    test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)

In [24]:
# Fit a random forest on the bag-of-centroids features and extract predictions.
# A fixed random_state makes the fitted forest, and therefore the submission
# file written below, reproducible across Restart & Run All.
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Fitting the forest may take a few minutes
print ("正在将随机森林模型拟合到带标签的训练数据上...")
forest = forest.fit(train_centroids,train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results (quoting=3 is csv.QUOTE_NONE)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )
print ("写入完成")

正在将随机森林模型拟合到带标签的训练数据上...
写入完成
