In [1]:
import inspect
import re

import pandas as pd
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [3]:
def load_data(fname):
    """Read one of the known tab-separated review files into a DataFrame.

    Args:
        fname: Dataset key; one of 'labeledTrainData', 'unlabeledTrainData'
            or 'testData'. The matching ``<key>.tsv`` file is read from the
            current working directory.

    Returns:
        pandas.DataFrame: the parsed file (tab-delimited, backslash used as
        the escape character).

    Raises:
        ValueError: if ``fname`` is not one of the known keys.
    """
    known_files = {
        'labeledTrainData': 'labeledTrainData.tsv',
        'unlabeledTrainData': 'unlabeledTrainData.tsv',
        'testData': 'testData.tsv',
    }

    # Reject unknown keys up front so a typo never turns into a file lookup.
    tsv_name = known_files.get(fname)
    if tsv_name is None:
        raise ValueError(fname)

    reviews = pd.read_csv(tsv_name, delimiter='\t', escapechar='\\')
    row_count = len(reviews)
    print("number of {}'s reviews: {}\n".format(fname, row_count))
    return reviews

In [4]:
# Load the labeled set first, then the unlabeled one (same order as the printed counts).
train_df, unlabeled_train_df = map(load_data, ('labeledTrainData', 'unlabeledTrainData'))

number of labeledTrainData's reviews: 25000

number of unlabeledTrainData's reviews: 50000



In [5]:
def review2word_list(review, remove_stopwords=False):
    """Clean one piece of review text and split it into lowercase words.

    The text is stripped of HTML markup with BeautifulSoup, every character
    that is not an ASCII letter is replaced by a space, and the result is
    lowercased and split on whitespace.

    Args:
        review: Raw review text (may contain HTML tags).
        remove_stopwords: When True, words present in NLTK's English
            stopword list are dropped from the result.

    Returns:
        list of str: the cleaned words, in their original order.
    """
    # 1. remove html tags
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    # 2. remove non-letters, convert to lower case words
    word_list = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()
    # 3. remove stopwords
    if remove_stopwords:
        stopwords_set = set(stopwords.words('english'))
        # Filter `word_list` itself: iterating over the undefined name `words`
        # raised NameError whenever remove_stopwords was True.
        word_list = [w for w in word_list if w not in stopwords_set]
    # 4. return cleaned words list
    return word_list

In [8]:
def review2sent_list(review, remove_stopwords=False):
    """Split a review into sentences and clean each one into a word list.

    The stripped review is sentence-tokenized with NLTK's ``sent_tokenize``;
    every non-empty sentence is then handed to ``review2word_list``.

    Args:
        review: Raw review text.
        remove_stopwords: Forwarded unchanged to ``review2word_list``.

    Returns:
        list: one entry per non-empty sentence, each being the value
        ``review2word_list`` returns for that sentence.
    """
    return [
        review2word_list(sentence, remove_stopwords)
        for sentence in sent_tokenize(review.strip())
        if sentence  # skip empty strings
    ]

In [9]:
# Collect the sentences of every review (labeled first, then unlabeled) into one flat list.
review_list = []  # format: [[words of sentence 1], [words of sentence 2], ...]
parse_jobs = (
    ('parsing reviews from training data: \n', train_df),
    ('parsing reviews from unlabeled data: \n', unlabeled_train_df),
)
for message, frame in parse_jobs:
    print(message)
    for review in frame.review:
        review_list.extend(review2sent_list(review))

parsing reviews from training data: 



  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


parsing reviews from unlabeled data: 



  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
# review_list holds one word list per *sentence* (each review contributes several),
# so label the numbers as sentences rather than reviews.
print('number of sentences: ', len(review_list))
# zip stops at the shorter input, so a list with fewer than two entries does not raise.
for ordinal, sentence in zip(('first', 'second'), review_list):
    print('\nthe {} sentence: \n'.format(ordinal), sentence)

len of reviews:  808704

the first review: 
 ['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']

the second review: 
 ['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [11]:
## training a word2vec model

# Hyperparameters are named once so the saved file name below always matches them.
NUM_FEATURES = 300    # dimensionality of the word vectors
CONTEXT = 10          # context window size
MIN_WORD_COUNT = 40   # ignore words whose total frequency is lower than this
NUM_WORKERS = 4       # worker threads used for training

# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`;
# pick whichever keyword the installed version accepts.
size_kwarg = (
    'vector_size'
    if 'vector_size' in inspect.signature(Word2Vec.__init__).parameters
    else 'size'
)

print('training model')
model = Word2Vec(
    review_list,
    window=CONTEXT,
    min_count=MIN_WORD_COUNT,
    workers=NUM_WORKERS,
    **{size_kwarg: NUM_FEATURES}
)
model.save('{}features-{}context-{}minwords'.format(NUM_FEATURES, CONTEXT, MIN_WORD_COUNT))

training model


In [12]:
# Query through `model.wv`: most_similar on the model itself is deprecated and was removed in gensim 4.0.
model.wv.most_similar('man')

[('woman', 0.6067818403244019),
 ('lady', 0.5969487428665161),
 ('lad', 0.5700379610061646),
 ('guy', 0.5343689918518066),
 ('monk', 0.5282979607582092),
 ('men', 0.5258927345275879),
 ('farmer', 0.5221534371376038),
 ('soldier', 0.5164685845375061),
 ('priest', 0.515847384929657),
 ('person', 0.5158398151397705)]

In [13]:
# Query through `model.wv`: most_similar on the model itself is deprecated and was removed in gensim 4.0.
model.wv.most_similar('queen')

[('princess', 0.6676959991455078),
 ('bride', 0.6202530264854431),
 ('stepmother', 0.6112695932388306),
 ('goddess', 0.5895574688911438),
 ('seductress', 0.5838366150856018),
 ('eva', 0.5808178186416626),
 ('mistress', 0.57389235496521),
 ('maid', 0.5654277801513672),
 ('prince', 0.5641684532165527),
 ('hunchback', 0.5631351470947266)]

In [14]:
# Query through `model.wv`: most_similar on the model itself is deprecated and was removed in gensim 4.0.
model.wv.most_similar('awful')

[('terrible', 0.7620289325714111),
 ('atrocious', 0.744351327419281),
 ('dreadful', 0.7296530604362488),
 ('horrible', 0.7197510004043579),
 ('abysmal', 0.6975699067115784),
 ('horrendous', 0.6828667521476746),
 ('appalling', 0.677104115486145),
 ('horrid', 0.6722877621650696),
 ('lousy', 0.6218140125274658),
 ('amateurish', 0.6053661108016968)]

In [16]:
# Query through `model.wv`: most_similar on the model itself is deprecated and was removed in gensim 4.0.
model.wv.most_similar('computer')

[('cgi', 0.5957990884780884),
 ('software', 0.595744252204895),
 ('generated', 0.5826317071914673),
 ('technology', 0.5680620074272156),
 ('computers', 0.5613633990287781),
 ('graphics', 0.5320250988006592),
 ('monitor', 0.5284072160720825),
 ('cg', 0.5250223278999329),
 ('digital', 0.5223430395126343),
 ('laser', 0.5146235227584839)]