#### IMDB影评数据，分析网站评论留言，判断每条留言的情感倾向。

In [2]:
import pandas as pd

In [3]:
# Load the labelled training reviews and the unlabelled test reviews; both are tab-separated.
train = pd.read_csv('./IMDB/labeledTrainData.tsv', sep='\t')
test = pd.read_csv('./IMDB/testData.tsv', sep='\t')

In [4]:
# Preview the first rows of the training frame (the output shows id, sentiment and review columns).
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# Preview the first rows of the test frame (the output shows id and review columns, no sentiment).
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [6]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [7]:
# Three clean-up steps for a raw review: strip HTML, keep letters only, optionally drop stop words.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loading the list is comparatively expensive; the notebook calls
        # review_to_text once per review, so build the set a single time.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def review_to_text(review,remove_stopwords):
    """Turn one raw review string into a list of lowercase word tokens.

    Args:
        review: raw review text, possibly containing HTML markup.
        remove_stopwords: when truthy, English stop words are filtered out.

    Returns:
        list of str: lowercase alphabetic tokens in their original order.
    """
    # 1. Strip HTML markup and keep only the text content.
    # NOTE(review): 'html' lets Beautiful Soup choose among installed HTML
    # parsers, so the parser actually used may vary by environment - confirm.
    raw_text = BeautifulSoup(review,'html').get_text()
    # 2. Replace every non-letter character with a space, then lowercase and split.
    letters = re.sub('[^a-zA-Z]',' ',raw_text)
    words = letters.lower().split()
    # 3. Optionally drop English stop words.
    if remove_stopwords:
        stop_words = _english_stop_words()
        words = [word for word in words if word not in stop_words]
    return words

In [8]:
# Clean every review with review_to_text (stop words removed) and re-join the
# tokens into one space-separated string per review, as the vectorizers expect text.
X_train = [' '.join(review_to_text(text, True)) for text in train['review']]
X_test = [' '.join(review_to_text(text, True)) for text in test['review']]
# Training labels come from the 'sentiment' column.
Y_train = train['sentiment']



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [9]:
# Import the text feature extractors, Pipeline, GridSearchCV and the naive Bayes model.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# GridSearchCV is provided by sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB



In [10]:
# Build two pipelines that both end in a multinomial naive Bayes classifier:
# one extracts features with CountVectorizer, the other with TfidfVectorizer.
pip_count_vect = Pipeline(steps=[
    ('count_vec', CountVectorizer(analyzer='word')),
    ('mnb', MultinomialNB()),
])
pip_tfidf_vect = Pipeline(steps=[
    ('tfidf_vec', TfidfVectorizer(analyzer='word')),
    ('mnb', MultinomialNB()),
])

In [11]:
# Hyperparameter grids for the two pipelines; keys use the '<step name>__<parameter>' form.
params_count = {
    'count_vec__binary': [True, False],
    'count_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10],
}
params_tfidf = {
    'tfidf_vec__binary': [True, False],
    'tfidf_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10],
}

In [12]:
# Parallel grid search over the CountVectorizer + naive Bayes pipeline.
# The original plan was 5-fold cross-validation; 2 folds are used here to keep it fast.
gs_count = GridSearchCV(
    estimator=pip_count_vect,
    param_grid=params_count,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
gs_count.fit(X_train, Y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.7min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...nizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'count_vec__binary': [True, False], 'count_vec__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [0.1, 1.0, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [13]:
# Show the hyperparameter combination selected by the CountVectorizer grid search.
print(gs_count.best_params_)

{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}


In [14]:
# Show the best cross-validated score reached by the CountVectorizer grid search.
print(gs_count.best_score_)

0.87788


In [15]:
# Predict test-set sentiment with the grid search's best parameters.
gs_count_predict = gs_count.predict(X_test)

In [16]:
# Build the result frame (review id + predicted sentiment) and write it to disk.
gs_count_df = pd.DataFrame({"id":test['id'],'sentiment':gs_count_predict})
# index=False keeps the file to the id and sentiment columns only; without it
# pandas also writes the row index as an extra unnamed first column.
gs_count_df.to_csv('./IMDB/gs_count.csv', index=False)

In [17]:
# Same parallel 2-fold grid search, this time over the TfidfVectorizer + naive Bayes pipeline.
gs_tfidf = GridSearchCV(
    estimator=pip_tfidf_vect,
    param_grid=params_tfidf,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
gs_tfidf.fit(X_train, Y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.0min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...rue,
        vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'tfidf_vec__binary': [True, False], 'tfidf_vec__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [0.1, 1.0, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [18]:
# Report the best hyperparameters and best cross-validated score of the TF-IDF
# grid search, then predict test-set sentiment with it.
print(gs_tfidf.best_params_)
print(gs_tfidf.best_score_)
gs_tfidf_predict = gs_tfidf.predict(X_test)

{'mnb__alpha': 1.0, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}
0.88164


In [19]:
# Build the result frame (review id + predicted sentiment) and write it to disk.
gs_tfidf_df = pd.DataFrame({'id':test['id'],'sentiment':gs_tfidf_predict})
# index=False keeps the file to the id and sentiment columns only; without it
# pandas also writes the row index as an extra unnamed first column.
gs_tfidf_df.to_csv('./IMDB/gs_tfidf.csv', index=False)

### 导入未标记的文本数据

In [20]:
# quoting=3 is csv.QUOTE_NONE: quote characters inside the reviews are read as
# ordinary text instead of being interpreted as field quoting.
unlabeled_train = pd.read_csv('./IMDB/unlabeledTrainData.tsv',delimiter='\t',quoting=3)

In [21]:
import nltk.data

In [22]:
# Load NLTK's English punkt tokenizer, used below to split reviews into sentences.
# NOTE(review): requires the punkt resource to be available locally - confirm it is downloaded.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [23]:
# Split a review into sentences and tokenize each sentence into words.
def review_to_sentences(review,tokenizer):
    """Return the review as a list of sentences, each a list of cleaned word tokens.

    The review is stripped of surrounding whitespace, split into sentences by
    ``tokenizer.tokenize``, and every non-empty sentence is cleaned with
    ``review_to_text`` (stop words are kept).
    """
    return [
        review_to_text(piece, False)
        for piece in tokenizer.tokenize(review.strip())
        if piece
    ]


In [24]:
# Corpus for word-vector training: one token list per sentence, gathered from every unlabelled review.
corpora = []
for raw_review in unlabeled_train['review']:
    corpora.extend(review_to_sentences(raw_review, tokenizer))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [25]:
# Hyperparameters for training the word vectors.
# Word-vector dimensionality (passed to Word2Vec as `size`).
num_features = 300
# Passed to Word2Vec as `min_count`.
min_word_count = 20
# Passed to Word2Vec as `workers`.
num_workers = 4
# Passed to Word2Vec as `window`.
context = 10
# Passed to Word2Vec as `sample`.
downsampling = 1e-3

In [26]:
# Import Word2Vec from gensim.models and train it on the sentence corpus
# with the hyperparameters configured above.
# NOTE(review): `size=` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`) - confirm the installed gensim version.
from gensim.models import Word2Vec
model = Word2Vec(corpora,workers=num_workers,size=num_features,min_count=min_word_count,window=context,sample=downsampling)



In [27]:
# NOTE(review): presumably normalises the word vectors in place and discards the
# raw ones to save memory, after which training cannot continue - verify against gensim docs.
model.init_sims(replace=True)

In [28]:
# Persist the trained model to disk so it can be reloaded later.
model.save('./IMDB/word2vec_model')

In [29]:
# Reload the saved model and sanity-check it by listing the words most similar to 'man'.
load_model = Word2Vec.load('./IMDB/word2vec_model')
# Query through the KeyedVectors object (.wv): calling most_similar directly on
# the Word2Vec model is deprecated and was removed in later gensim releases.
load_model.wv.most_similar('man')

[('woman', 0.6181309223175049),
 ('lady', 0.6120635867118835),
 ('lad', 0.5911939144134521),
 ('chap', 0.5449566841125488),
 ('monk', 0.5360149145126343),
 ('person', 0.5311275720596313),
 ('guy', 0.5287982821464539),
 ('men', 0.5228261947631836),
 ('boy', 0.5050458908081055),
 ('soldier', 0.5033310651779175)]

In [30]:
import numpy as np
# Build a text feature vector from word vectors.
def makeFeatureVec(words,model,num_features):
    """Average the word vectors of every in-vocabulary word in ``words``.

    Args:
        words: iterable of word tokens for one text.
        model: trained word-vector model; ``model.wv`` must support
            ``word in model.wv`` and ``model.wv[word]``.
        num_features: dimensionality of the word vectors.

    Returns:
        numpy array of shape ``(num_features,)`` holding the mean vector of the
        in-vocabulary words. If no word is in the vocabulary (or ``words`` is
        empty) the all-zero vector is returned instead of NaNs.
    """
    featureVec = np.zeros((num_features,),dtype=np.float32)
    nwords = 0
    # Look words up through the KeyedVectors object: membership is a constant-time
    # check, so no vocabulary set has to be rebuilt on every call, and indexing
    # the model itself (model[word]) is deprecated.
    word_vectors = model.wv
    for word in words:
        if word in word_vectors:
            nwords += 1
            featureVec = np.add(featureVec,word_vectors[word])
    if nwords == 0:
        # Dividing by zero would yield a NaN vector, which downstream
        # estimators cannot consume; keep the zero vector.
        return featureVec
    return np.divide(featureVec,nwords)

In [31]:
# Convert reviews into word-vector-based feature vectors.
def getAvgFeatureVecs(reviews,model,num_features):
    """Return a float32 matrix with one ``makeFeatureVec`` row per review.

    The result has shape ``(len(reviews), num_features)``; row ``i`` is the
    feature vector computed for ``reviews[i]``.
    """
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype=np.float32)
    for row, tokens in enumerate(reviews):
        reviewFeatureVecs[row] = makeFeatureVec(tokens, model, num_features)
    return reviewFeatureVecs

In [32]:
# Prepare the token lists (stop words removed) that the word-vector features
# for the training set are computed from.
clean_train_reviews = [review_to_text(text, True) for text in train['review']]



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [33]:
# Averaged word-vector features for every cleaned training review.
# NOTE(review): `trianDataVecs` looks like a typo of `trainDataVecs`; the name is
# kept because the later grid-search cell fits on this exact name.
trianDataVecs = getAvgFeatureVecs(clean_train_reviews,model,num_features)

In [None]:
# Clean the test reviews the same way as the training reviews, then compute
# their averaged word-vector features.
clean_test_reviews = [review_to_text(text, True) for text in test['review']]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [None]:
# Sentiment analysis on the word-vector features with a GradientBoostingClassifier,
# tuned by a parallel 2-fold grid search.
from sklearn.ensemble import GradientBoostingClassifier
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# GridSearchCV is provided by sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
gbc = GradientBoostingClassifier()
params_gbc = {'n_estimators':[10,100,500],'learning_rate':[0.01,0.1,1.0],'max_depth':[2,3,4]}
gs = GridSearchCV(gbc,params_gbc,cv=2,n_jobs=-1,verbose=1)
gs.fit(trianDataVecs,Y_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


In [None]:
# Print the best score and the best hyperparameter combination found by the grid search.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Predict test-set sentiment with the tuned model and write the results to disk.
result = gs.predict(testDataVecs)
output = pd.DataFrame({'id':test['id'],'sentiment':result})
# index=False keeps the file to the id and sentiment columns only; without it
# pandas also writes the row index as an extra unnamed first column.
output.to_csv('./IMDB/GradientBoostingClassifier_sentiment_analysis.csv', index=False)