In [2]:
import pandas as pd

# Load the three review files. Each one is tab-separated with a header row;
# quoting=3 (csv.QUOTE_NONE) makes pandas treat double quotes as plain text.
train = pd.read_csv(
    "../tutorialData/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
test = pd.read_csv(
    "../tutorialData/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
unlabeled_train = pd.read_csv(
    "../tutorialData/unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Verify the number of reviews that were read (100,000 in total)
print(
    "Read %d labeled train reviews, %d labeled test reviews, "
    "and %d unlabeled reviews\n"
    % (len(train["review"]), len(test["review"]), len(unlabeled_train["review"]))
)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [3]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    """Convert one raw review into a list of lowercase words.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, drop the words listed in NLTK's
            English stop-word corpus. Defaults to False.

    Returns:
        list of str: the lowercase, letters-only tokens in their original
        order.
    """
    # 1. Strip HTML tags. The parser is named explicitly so the result does
    #    not depend on which optional parsers happen to be installed, and so
    #    bs4 does not emit GuessedAtParserWarning.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #
    # 2. Replace every character that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Lowercase the text and split it on whitespace
    words = letters_only.lower().split()
    #
    # 4. Optionally remove stop words (off by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return the list of words
    return words

In [4]:
# Download the punkt data used for sentence splitting. Only the 'punkt'
# resource is requested: a bare nltk.download() opens the interactive
# downloader, which stalls an unattended "Restart & Run All".
import nltk.data
nltk.download('punkt')

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function that splits a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences of ``text``.
        remove_stopwords: Passed through to ``review_to_wordlist``.

    Returns:
        list of list of str: one word list per non-empty sentence.
    """
    # 1. Let the tokenizer cut the stripped review into raw sentences
    stripped_review = review.strip()
    #
    # 2. Turn every non-empty raw sentence into its word list; the result
    #    is therefore a list of lists
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(stripped_review)
        if len(sentence_text) > 0
    ]

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [8]:
import warnings
from bs4 import MarkupResemblesLocatorWarning
# Silence bs4's MarkupResemblesLocatorWarning for the parsing loops below
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Start from an empty list so that re-running this cell does not append
# the same sentences a second time
sentences = []

print ("正在从训练集中解析句子")
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

print ("正在从无标签数据集中解析句子")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

正在从训练集中解析句子
正在从训练集中解析句子
正在从无标签数据集中解析句子


In [14]:
# Total number of parsed sentences collected above
print(len(sentences))

796172


In [15]:
# Inspect the first parsed sentence (a list of lowercase words)
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [16]:
# Inspect the second parsed sentence
print(sentences[1])

['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [19]:
# Import the built-in logging module and configure it so that Word2Vec
# prints readable progress messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Training parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word frequency
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsampling setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print ("正在训练模型...")
model = word2vec.Word2Vec(sentences, workers=num_workers,
            vector_size=num_features, min_count=min_word_count,
            window=context, sample=downsampling)

# Gensim 4 deprecated init_sims(), and init_sims(replace=True) no longer
# saves memory. Its remaining effect was to scale every vector to unit
# length in place, so call the supported equivalent directly to keep the
# saved vectors identical for anything that loads this model later.
# The model should not be trained further after this step.
model.wv.unit_normalize_all()

# Give the model a meaningful name and save it for later use.
# It can be reloaded with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)

2025-10-13 20:47:22,558 : INFO : collecting all words and their counts
2025-10-13 20:47:22,560 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-10-13 20:47:22,603 : INFO : PROGRESS: at sentence #10000, processed 225664 words, keeping 17775 word types
2025-10-13 20:47:22,646 : INFO : PROGRESS: at sentence #20000, processed 451738 words, keeping 24945 word types
2025-10-13 20:47:22,688 : INFO : PROGRESS: at sentence #30000, processed 670858 words, keeping 30027 word types
2025-10-13 20:47:22,738 : INFO : PROGRESS: at sentence #40000, processed 896840 words, keeping 34335 word types


正在训练模型...


2025-10-13 20:47:22,781 : INFO : PROGRESS: at sentence #50000, processed 1116081 words, keeping 37751 word types
2025-10-13 20:47:22,824 : INFO : PROGRESS: at sentence #60000, processed 1337543 words, keeping 40711 word types
2025-10-13 20:47:22,865 : INFO : PROGRESS: at sentence #70000, processed 1560306 words, keeping 43311 word types
2025-10-13 20:47:22,905 : INFO : PROGRESS: at sentence #80000, processed 1779515 words, keeping 45707 word types
2025-10-13 20:47:22,942 : INFO : PROGRESS: at sentence #90000, processed 2003713 words, keeping 48121 word types
2025-10-13 20:47:22,983 : INFO : PROGRESS: at sentence #100000, processed 2225464 words, keeping 50190 word types
2025-10-13 20:47:23,022 : INFO : PROGRESS: at sentence #110000, processed 2444322 words, keeping 52058 word types
2025-10-13 20:47:23,062 : INFO : PROGRESS: at sentence #120000, processed 2666487 words, keeping 54098 word types
2025-10-13 20:47:23,103 : INFO : PROGRESS: at sentence #130000, processed 2892314 words, keep

In [21]:
# Ask the model which word fits least well with the others
model.wv.doesnt_match(["man", "woman", "child", "kitchen"])

'kitchen'

In [22]:
# Three countries and one city: which word is the odd one out?
model.wv.doesnt_match(["france", "england", "germany", "berlin"])

'berlin'

In [23]:
# Three cities and one country: which word is the odd one out?
model.wv.doesnt_match(["paris", "berlin", "london", "austria"])

'austria'

In [24]:
# List the words the trained model ranks as most similar to "man"
model.wv.most_similar("man")

[('woman', 0.6098932027816772),
 ('lady', 0.5962560176849365),
 ('lad', 0.5753629207611084),
 ('monk', 0.5302724838256836),
 ('chap', 0.5291829109191895),
 ('men', 0.5222655534744263),
 ('millionaire', 0.5186971426010132),
 ('soldier', 0.5165213942527771),
 ('guy', 0.5154843330383301),
 ('farmer', 0.5146099925041199)]

In [25]:
# List the words the trained model ranks as most similar to "queen"
model.wv.most_similar("queen")

[('princess', 0.673733115196228),
 ('bride', 0.6021547317504883),
 ('maid', 0.6014478206634521),
 ('victoria', 0.5923500061035156),
 ('prince', 0.5918601751327515),
 ('mistress', 0.5882285833358765),
 ('stepmother', 0.5788155198097229),
 ('showgirl', 0.57710200548172),
 ('goddess', 0.5735018253326416),
 ('belle', 0.5659166574478149)]

In [26]:
# List the words the trained model ranks as most similar to "awful"
model.wv.most_similar("awful")

[('terrible', 0.7612918615341187),
 ('horrible', 0.717065691947937),
 ('atrocious', 0.7116694450378418),
 ('abysmal', 0.6962562203407288),
 ('dreadful', 0.6944637298583984),
 ('appalling', 0.6726535558700562),
 ('horrendous', 0.6554230451583862),
 ('lousy', 0.6256535053253174),
 ('horrid', 0.6230466365814209),
 ('laughable', 0.6056016683578491)]