In [1]:
import pandas as pd
import sklearn
from bs4 import BeautifulSoup
import re
import nltk.data
from nltk.corpus import stopwords

In [2]:
train = pd.read_csv( "./data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
test = pd.read_csv( "./data/testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "./data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

In [3]:
# 定义与处理函数
def review_to_wordlist(review, remove_stopwords=False):
    # 去掉 html
    review_text = BeautifulSoup(review, "html5lib").get_text()
    # 去掉 none letter
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 转换大小写并分割
    words = letters_only.lower().split()
    # stop_words
    # 删除 stop_words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    return words

In [4]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [5]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    
    return sentences

In [6]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [7]:
sentences[0]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again']

In [8]:
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

num_features = 300
context = 10 # window_size
downsampling = 1e-3
min_word_count = 40
num_workers = 4

In [9]:
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

model.init_sims(replace=True)
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-08-30 14:37:36,861: INFO: 'pattern' package not found; tag filters are not available for English
2017-08-30 14:37:36,868: INFO: collecting all words and their counts
2017-08-30 14:37:36,869: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-30 14:37:36,937: INFO: PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-08-30 14:37:37,010: INFO: PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types


Training model...


2017-08-30 14:37:37,077: INFO: PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2017-08-30 14:37:37,148: INFO: PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2017-08-30 14:37:37,216: INFO: PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2017-08-30 14:37:37,286: INFO: PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2017-08-30 14:37:37,361: INFO: PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2017-08-30 14:37:37,436: INFO: PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2017-08-30 14:37:37,526: INFO: PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2017-08-30 14:37:37,609: INFO: PROGRESS: at sentence #100000, processed 2226967 words, keeping 50207 word types
2017-08-30 14:37:37,673: INFO: PROGRESS: at sentence #110000, processed 2446581 words, keeping 52081 word types
2

2017-08-30 14:37:41,815: INFO: PROGRESS: at sentence #760000, processed 16990810 words, keeping 120930 word types
2017-08-30 14:37:41,878: INFO: PROGRESS: at sentence #770000, processed 17217947 words, keeping 121703 word types
2017-08-30 14:37:41,942: INFO: PROGRESS: at sentence #780000, processed 17448093 words, keeping 122402 word types
2017-08-30 14:37:42,004: INFO: PROGRESS: at sentence #790000, processed 17675169 words, keeping 123066 word types
2017-08-30 14:37:42,041: INFO: collected 123504 word types from a corpus of 17798270 raw words and 795538 sentences
2017-08-30 14:37:42,042: INFO: Loading a fresh vocabulary
2017-08-30 14:37:42,115: INFO: min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2017-08-30 14:37:42,115: INFO: min_count=40 leaves 17239125 word corpus (96% of original 17798270, drops 559145)
2017-08-30 14:37:42,164: INFO: deleting the raw counts dictionary of 123504 items
2017-08-30 14:37:42,168: INFO: sample=0.001 downsamples 48 most-c

2017-08-30 14:38:53,202: INFO: PROGRESS: at 29.01% examples, 261236 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:54,220: INFO: PROGRESS: at 29.39% examples, 260909 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:55,287: INFO: PROGRESS: at 29.75% examples, 260326 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:56,327: INFO: PROGRESS: at 30.16% examples, 260146 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:57,384: INFO: PROGRESS: at 30.61% examples, 260283 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:58,439: INFO: PROGRESS: at 30.97% examples, 259774 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:59,473: INFO: PROGRESS: at 31.37% examples, 259530 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:00,481: INFO: PROGRESS: at 31.76% examples, 259479 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:01,495: INFO: PROGRESS: at 32.15% examples, 259314 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:02,523: INFO: PROGRESS: at 32.51% examples, 258842 words/s, in_qsize 7, ou

2017-08-30 14:40:17,720: INFO: PROGRESS: at 61.43% examples, 252269 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:18,728: INFO: PROGRESS: at 61.78% examples, 252061 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:19,737: INFO: PROGRESS: at 62.20% examples, 252127 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:20,746: INFO: PROGRESS: at 62.53% examples, 251832 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:21,788: INFO: PROGRESS: at 62.93% examples, 251803 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:22,840: INFO: PROGRESS: at 63.26% examples, 251446 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:23,893: INFO: PROGRESS: at 63.66% examples, 251353 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:24,908: INFO: PROGRESS: at 64.08% examples, 251411 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:25,910: INFO: PROGRESS: at 64.53% examples, 251621 words/s, in_qsize 6, out_qsize 1
2017-08-30 14:40:26,915: INFO: PROGRESS: at 64.99% examples, 251866 words/s, in_qsize 7, ou

2017-08-30 14:41:41,514: INFO: PROGRESS: at 95.95% examples, 255887 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:42,516: INFO: PROGRESS: at 96.24% examples, 255594 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:43,536: INFO: PROGRESS: at 96.62% examples, 255520 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:44,542: INFO: PROGRESS: at 96.97% examples, 255376 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:45,547: INFO: PROGRESS: at 97.37% examples, 255378 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:46,548: INFO: PROGRESS: at 97.74% examples, 255297 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:47,549: INFO: PROGRESS: at 98.10% examples, 255189 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:48,553: INFO: PROGRESS: at 98.44% examples, 255048 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:49,560: INFO: PROGRESS: at 98.90% examples, 255165 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:50,584: INFO: PROGRESS: at 99.25% examples, 255007 words/s, in_qsize 7, ou

In [10]:
# 检查训练结果
print(model.doesnt_match("apple pepole banana orange".split()))
print(model.most_similar("sex"))
print(model.most_similar("bitch"))



orange
[('sexual', 0.6434853076934814), ('masturbation', 0.5880445241928101), ('lesbian', 0.5711398124694824), ('nudity', 0.5502188205718994), ('nude', 0.544196605682373), ('gratuitous', 0.5305460691452026), ('rape', 0.5145243406295776), ('incest', 0.5125550031661987), ('lovemaking', 0.5069805383682251), ('explicit', 0.5014914274215698)]
[('slut', 0.6980215907096863), ('whore', 0.6937853693962097), ('blonde', 0.6554198265075684), ('bimbo', 0.652977466583252), ('stripper', 0.6326565742492676), ('perky', 0.6208070516586304), ('housewife', 0.6149146556854248), ('maid', 0.6066554188728333), ('bitchy', 0.6030336618423462), ('mona', 0.6029400825500488)]


In [12]:
# 载入训练好的模型
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")

2017-08-30 14:43:35,341: INFO: loading Word2Vec object from 300features_40minwords_10context
2017-08-30 14:43:35,637: INFO: loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2017-08-30 14:43:35,638: INFO: setting ignored attribute syn0norm to None
2017-08-30 14:43:35,639: INFO: setting ignored attribute cum_table to None
2017-08-30 14:43:35,639: INFO: loaded 300features_40minwords_10context


In [14]:
model["flower"].shape

(300,)

In [19]:
# import numpy as np

# def makeFeatureVec(words, model, num_features):
#     featureVec = np.zeros((num_features,), dtype="float32")
#     nwords = 0
#     index2word_set = set(model.index2word)
    
#     for word in words:
#         if word in index2word_set:
#             nwords = nwords + 1.
#             featureVec = np.add(featureVec, model[word])
            
#     featureVec = np.divide(featureVec, nwords)
#     return featureVec

# def getAvgFeatureVecs(reviews, model, num_features):
#     counter = 0.
#     reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
#     for review in reviews:
#         if counter%5000. == 0.:
#             print("Review {} of {}".format(counter, len(reviews)))
        
#         reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        
#         counter += 1
        
#     return reviewFeatureVecs

In [20]:
# clean_train_reviews = []
# for review in train["review"]:
#     clean_train_reviews.append( review_to_wordlist( review, \
#         remove_stopwords=True ))

# trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

# print("Creating average feature vecs for test reviews")
# clean_test_reviews = []
# for review in test["review"]:
#     clean_test_reviews.append( review_to_wordlist( review, \
#         remove_stopwords=True ))

# testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

In [21]:
# # Fit a random forest to the training data, using 100 trees
# from sklearn.ensemble import RandomForestClassifier
# forest = RandomForestClassifier( n_estimators = 100 )

# print "Fitting a random forest to labeled training data..."
# forest = forest.fit( trainDataVecs, train["sentiment"] )

# # Test & extract results 
# result = forest.predict( testDataVecs )

# # Write the test results 
# output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
# output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )