In [19]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import pickle
from collections import defaultdict
import gensim
import nltk
from nltk.corpus import stopwords

# 1. 载入数据集

In [13]:
# Both files are tab-separated with a header row. quoting=3 is the numeric
# value of csv.QUOTE_NONE, so quote characters inside reviews are kept as text.
train = pd.read_csv(
    r'E:\OpenSourceDatasetCode\Dataset\Bag of Words Meets Bags of Popcorn\labeledTrainData.tsv',
    header=0,
    delimiter='\t',
    quoting=3,
)
test = pd.read_csv(
    r'E:\OpenSourceDatasetCode\Dataset\Bag of Words Meets Bags of Popcorn\testData.tsv',
    header=0,
    delimiter='\t',
    quoting=3,
)

In [15]:
# Print a summary (columns, non-null counts, dtypes, memory usage) of each
# frame: the labelled training set first, then the unlabelled test set.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB


# 2. 数据预处理

In [30]:
def _review_to_words(raw_review, stops):
    '''
    Turn one raw review string into a list of lower-cased tokens with the
    words in `stops` removed.
    '''
    # Drop HTML tags and keep only the text content.
    text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Normalise characters / punctuation, then split on whitespace.
    tokens = clean_str(text.strip()).lower().split()
    return [w for w in tokens if w not in stops]


def bulid_train_test_data(train, test, train_ratio = 0.8, seed = None):
    '''
    Convert the IMDB reviews in `train` and `test` into word sequences.

    Parameters
    ----------
    train : DataFrame with a 'sentiment' column and a 'review' column.
    test : DataFrame with a 'review' column.
    train_ratio : probability that a training review gets 'split' == 1;
        the remaining training reviews get 'split' == 0.
    seed : optional seed for the random split. When None (the default) the
        global NumPy random state is used, exactly as before.

    Returns
    -------
    reviews : list of dicts, training reviews first and test reviews after.
        Each dict has the keys 'label', 'review_text' (list of tokens),
        'num_words' and 'split'. Test reviews carry label -1 and split -1.
    vocab : defaultdict(float) mapping each word to the number of reviews
        (train and test together) in which it appears at least once.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Build the stop-word set once instead of once per review.
    stops = set(stopwords.words('english'))

    reviews = []
    vocab = defaultdict(float)

    # Iterate over the underlying arrays so the rows are visited by position,
    # independent of whatever index the DataFrame happens to carry.
    for label, raw_review in zip(train['sentiment'].values, train['review'].values):
        review_text = _review_to_words(raw_review, stops)
        # Count each word at most once per review.
        for word in set(review_text):
            vocab[word] += 1
        reviews.append({'label': label,
                        'review_text': review_text,
                        'num_words': len(review_text),
                        'split': int(rng.rand() < train_ratio)})

    for raw_review in test['review'].values:
        review_text = _review_to_words(raw_review, stops)
        for word in set(review_text):
            vocab[word] += 1
        # Test reviews have no label and belong to neither split.
        reviews.append({'label': -1,
                        'review_text': review_text,
                        'num_words': len(review_text),
                        'split': -1})

    return reviews, vocab


def clean_str(string):
    """
    Tokenization/string cleaning for the dataset.

    Steps, in order:
      1. every character other than letters, digits, ( ) , ! ? ' ` becomes a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd, 'll are split off
         with a leading space ("don't" -> "do n't");
      3. , ! ( ) ? are surrounded by spaces so they become separate tokens;
      4. runs of whitespace collapse to a single space.

    Returns the stripped, lower-cased result.

    Note: the punctuation tokens are emitted as plain "(", ")" and "?".
    Earlier versions used replacement strings such as " \\( ", which re.sub
    keeps verbatim, so a literal backslash leaked into those tokens.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Plain literal replacements: str.replace avoids regex-template escaping.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [31]:
# Tokenise every review and collect the vocabulary in one pass.
reviews, vocab = bulid_train_test_data(train, test)
# Longest review, measured in tokens after cleaning and stop-word removal.
max_len = pd.DataFrame(reviews)['num_words'].max()
print('data loaded!')
print('number of sentences: {}'.format(len(reviews)))
print('vocab size: {}'.format(len(vocab)))
print('max sentence length: {}'.format(max_len))

data loaded!
number of sentences: 50000
vocab size: 114639
max sentence length: 1571


In [32]:
def build_embedding_matrix(fname, vocab, k=300):
    """
    Build an embedding matrix for `vocab` from a binary word2vec file.

    Parameters
    ----------
    fname : path of the word2vec file, loaded in binary format.
    vocab : iterable of words; each word is assigned a row in iteration order.
    k : number of columns of the matrix; must equal the dimension of the
        loaded vectors for the row assignment to succeed.

    Returns
    -------
    W : array of shape (len(vocab) + 1, k). Row 0 is all zeros. The row of a
        word found in the loaded model holds its vector; every other row
        keeps its random initialisation drawn uniformly from [-0.25, 0.25).
    word_idx_map : dict mapping each word to its row index (starting at 1).
    num_found : how many words of `vocab` were present in the loaded model.
    """
    # gensim >= 1.0 exposes the loader on KeyedVectors; the Word2Vec
    # classmethod used by older code was deprecated and later removed.
    loader = getattr(gensim.models, 'KeyedVectors', None)
    if loader is None:
        loader = gensim.models.Word2Vec
    model = loader.load_word2vec_format(fname, binary=True)
    print('word2vec loaded!')

    W = np.random.uniform(-0.25, 0.25, (len(vocab) + 1, k))
    W[0] = np.zeros(k, dtype=np.float32)

    word_idx_map = dict()
    num_found = 0
    for idx, word in enumerate(vocab, start=1):
        # `word in model` replaces `model.vocab`, which gensim 4 removed.
        if word in model:
            W[idx] = model[word]
            num_found += 1
        word_idx_map[word] = idx

    return W, word_idx_map, num_found

In [34]:
# Location of the pre-trained GoogleNews word2vec binary.
w2v_file = r'E:\ToolsData\Weights\GoogleNews-vectors-negative300.bin'
# Build the embedding matrix and the word -> row-index lookup for our vocabulary.
(W, word_idx_map, num_in_model) = build_embedding_matrix(w2v_file, vocab)
print('num words already in word2vec: {}'.format(num_in_model))
# Save the reviews, embeddings, index map and vocabulary together in one pickle.
with open('imdb_train_val_test.pkl', 'wb') as file:
    pickle.dump([reviews, W, word_idx_map, vocab], file)