# Part 1

用labeled_train_data演示Doc2vec的训练

In [1]:
import sys
sys.path.insert(0, '..')
import os

import numpy as np
import pandas as pd

In [2]:
import zipfile
# Unpack the zipped labeled training set into ./data/.
with zipfile.ZipFile('./data/labeledTrainData.tsv.zip', mode='r') as archive:
    archive.extractall(path='./data/')

In [3]:
# Load the labeled reviews (tab-separated), then show the frame's shape and
# its first five rows.
labeled_train = pd.read_csv('./data/labeledTrainData.tsv', delimiter='\t')
print(labeled_train.shape)
labeled_train.head(n=5)

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
# Show the first raw review; HTML tags such as <br /> are still present here.
labeled_train['review'][0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [5]:
from bs4 import BeautifulSoup
import re

def clean_str(string):
    """
    Normalize a raw text string into space-separated, lower-cased tokens.

    Steps, in order:
      1. Every character other than ASCII letters, digits, ``( ) , ! ?``,
         the apostrophe and the backtick is replaced by a space.
      2. The English clitics 's, 've, n't, 're, 'd and 'll are split off from
         the word they follow (e.g. "don't" becomes "do n't").
      3. ``, ! ( ) ?`` are padded with spaces so each becomes its own token.
      4. Runs of whitespace collapse to a single space; the result is
         stripped and lower-cased.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement text must not contain a backslash: re.sub leaves an
    # unknown escape in the replacement as-is, which would turn "(" into the
    # two-character token backslash + "(" (same for ")" and "?").
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Strip the HTML markup from every labeled review, normalize it with
# clean_str, and collect the sentiment labels in the same row order.
labeled_texts = [
    clean_str(BeautifulSoup(labeled_train.review[idx], "lxml").get_text())
    for idx in range(labeled_train.review.shape[0])
]
labels = [
    labeled_train.sentiment[idx]
    for idx in range(labeled_train.review.shape[0])
]

In [6]:
# First review after HTML stripping and clean_str normalization.
labeled_texts[0]

"with all this stuff going down at the moment with mj i 've started listening to his music , watching the odd documentary here and there , watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography , part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj 's feeling towards the press and also the obvious message of drugs are bad m'kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for 20 m

### Doc2vec的训练             

In [10]:
import codecs
from datetime import datetime
# -------------------add row number to query----------------------
# Write one document per line as "_*<row index> <cleaned text>". The leading
# "_*<n>" token carries the document id that Doc_list parses back out.
# The `with` block closes the file even if a write raises.
with codecs.open('./data/' + 'labeled_train_id.txt', 'w', encoding='utf8') as doc_f:
    for i, reviews in enumerate(labeled_texts):
        if i % 10000 == 0:
            # progress marker every 10,000 documents
            print(datetime.now(),i)
        doc_f.write('_*{} {}\n'.format(i,reviews))

2018-10-01 15:42:00.135134 0
2018-10-01 15:42:00.177021 10000
2018-10-01 15:42:00.217945 20000


In [11]:
# Write only the first five cleaned reviews, one per line, as
# "_*<row index> <cleaned text>". The `with` block closes the file even if a
# write raises.
with codecs.open('./data/' + 'test.txt', 'w', encoding='utf8') as doc_f:
    for i, reviews in enumerate(labeled_texts[:5]):
        doc_f.write('_*{} {}\n'.format(i,reviews))

In [12]:
from collections import namedtuple

# One training document: its token list and its list of tags.
SentimentDocument = namedtuple('SentimentDocument', 'words tags')


class Doc_list(object):
    """
    Re-iterable view over a tagged corpus file.

    Every line of the file is expected to look like ``_*<int> tok tok ...``.
    The first whitespace-separated field is the tag: its first two characters
    are dropped and the remainder is parsed as an int. The remaining fields
    are the document's words.

    Each call to ``__iter__`` re-opens the file, so one instance can be
    iterated any number of times.

    Parameters
    ----------
    f : str
        Path of the UTF-8 encoded corpus file.
    """
    def __init__(self,f):
        self.f = f

    def __iter__(self):
        # The context manager releases the file handle when iteration ends
        # (or when the generator is closed early) instead of leaking it.
        with codecs.open(self.f,encoding='utf8') as corpus:
            for line in corpus:
                words = line.split()
                tags = [int(words[0][2:])]
                yield SentimentDocument(words[1:],tags)

In [14]:
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# DBOW Doc2Vec (dm=0) on the labeled reviews. The model is created with
# epochs=1 and train() is called once per pass, so the document vectors can be
# scored after every pass.
d2v = Doc2Vec(
    dm=0, size=300, window=30, min_count=3, sample=1e-5,
    negative=5, hs=0, workers=6,
    alpha=0.025, min_alpha=0.025, epochs=1,
)
doc_list = Doc_list('./data/' + 'labeled_train_id.txt')
d2v.build_vocab(doc_list)

# -------------------train dbow doc2vec---------------------------------------------
for pass_no in range(1, 51):
    print(datetime.now(),'pass:',pass_no)
    # a fresh corpus iterable for every pass
    doc_list = Doc_list('./data/' + 'labeled_train_id.txt')
    d2v.train(doc_list, epochs=d2v.epochs, total_examples=d2v.corpus_count)
    # vectors of documents 0..24999 are the features; the 5-fold
    # cross-validated score of a logistic regression is the progress metric
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(25000)])
    scores = cross_val_score(LogisticRegression(C=3), X_d2v, labels,
                             cv=5, n_jobs=4)
    print('dbow',scores,np.mean(scores))
d2v.save('./data/' + 'dbow_d2v.model')
print(datetime.now(),'save done')



2018-10-01 14:02:33.488091 pass: 1
dbow [0.5026 0.5096 0.5252 0.546  0.53  ] 0.52268
2018-10-01 14:02:39.072662 pass: 2
dbow [0.5188 0.5254 0.5256 0.532  0.5332] 0.5269999999999999
2018-10-01 14:02:44.568629 pass: 3
dbow [0.548  0.5422 0.5562 0.566  0.5614] 0.55476
2018-10-01 14:02:50.270080 pass: 4
dbow [0.602  0.6094 0.6334 0.6324 0.6294] 0.62132
2018-10-01 14:02:56.033885 pass: 5
dbow [0.6718 0.6656 0.6752 0.6702 0.667 ] 0.66996
2018-10-01 14:03:02.088532 pass: 6
dbow [0.7084 0.707  0.714  0.719  0.6892] 0.7075199999999999
2018-10-01 14:03:08.419933 pass: 7
dbow [0.7406 0.7508 0.7472 0.75   0.7376] 0.74524
2018-10-01 14:03:15.594719 pass: 8
dbow [0.7734 0.777  0.7772 0.7868 0.774 ] 0.7776799999999999
2018-10-01 14:03:23.463741 pass: 9
dbow [0.7892 0.7936 0.8004 0.8154 0.7916] 0.79804
2018-10-01 14:03:32.212353 pass: 10
dbow [0.8016 0.8144 0.811  0.8312 0.809 ] 0.8134399999999999
2018-10-01 14:03:41.502588 pass: 11
dbow [0.8182 0.8252 0.825  0.8384 0.8244] 0.82624
2018-10-01 14:03:51

In [15]:
# DM Doc2Vec (dm=1) on the labeled reviews. The model is created with
# epochs=1 and train() is called once per pass, so the document vectors can be
# scored after every pass.
d2v = Doc2Vec(dm=1, size=300, negative=5, hs=0, min_count=3, window=10,sample=1e-5,
              workers=6,alpha=0.05,min_alpha=0.025,epochs=1)
doc_list = Doc_list('./data/' + 'labeled_train_id.txt')
d2v.build_vocab(doc_list)

# -------------------train dm doc2vec---------------------------------------------
for i in range(50):
    print(datetime.now(),'pass:',i + 1)
    doc_list = Doc_list('./data/' + 'labeled_train_id.txt')
    d2v.train(doc_list,total_examples=d2v.corpus_count, epochs=d2v.epochs)
    # One feature row per labeled document. cross_val_score needs as many
    # rows as there are labels, so the count is taken from `labels` itself.
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(len(labels))])
    scores = cross_val_score(LogisticRegression(C=3),X_d2v,labels,cv=5,n_jobs=4)
    # Tag the log line with the model actually being trained (DM, not DBOW).
    print('dm',scores,np.mean(scores))
d2v.save('./data/' + 'dm_d2v.model')
print(datetime.now(),'save done')



2018-10-01 14:14:06.046697 pass: 1
dbow [0.5036 0.5104 0.5452 0.5664 0.5684] 0.5388000000000001
2018-10-01 14:14:12.480981 pass: 2
dbow [0.5512 0.6006 0.6178 0.6248 0.6232] 0.60352
2018-10-01 14:14:18.932892 pass: 3
dbow [0.6474 0.6664 0.6898 0.696  0.6874] 0.6774000000000001
2018-10-01 14:14:26.339173 pass: 4
dbow [0.7266 0.7394 0.7562 0.7614 0.7478] 0.7462799999999999
2018-10-01 14:14:34.815545 pass: 5
dbow [0.7714 0.786  0.7802 0.7866 0.7716] 0.77916
2018-10-01 14:14:43.397165 pass: 6
dbow [0.7938 0.809  0.809  0.8114 0.8046] 0.80556
2018-10-01 14:14:53.505202 pass: 7
dbow [0.8226 0.817  0.8256 0.8238 0.8282] 0.82344
2018-10-01 14:15:04.129360 pass: 8
dbow [0.834  0.8326 0.8264 0.8298 0.8344] 0.83144
2018-10-01 14:15:15.649132 pass: 9
dbow [0.8454 0.8374 0.834  0.8416 0.839 ] 0.83948
2018-10-01 14:15:27.403376 pass: 10
dbow [0.8502 0.843  0.8398 0.842  0.8406] 0.8431200000000001
2018-10-01 14:15:40.155957 pass: 11
dbow [0.8534 0.851  0.8412 0.8496 0.8456] 0.84816
2018-10-01 14:15:53

# Part 2

用labeled_train_data和unlabeled_train_data进行Doc2vec的训练

In [24]:
# Read the labeled, test and unlabeled review files (tab-separated, header in
# the first row). quoting=3 is csv.QUOTE_NONE, so quote characters stay in the
# field values.
train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t', header=0, quoting=3)
test = pd.read_csv('./data/testData.tsv', sep='\t', header=0, quoting=3)
unlabeled_train = pd.read_csv('./data/unlabeledTrainData.tsv', sep='\t', header=0, quoting=3)

In [22]:
# Shape and first rows of the labeled training frame.
print(train.shape)
train.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [16]:
# Shape and first rows of the test frame.
print(test.shape)
test.head()

(25000, 2)


Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [17]:
# Shape and first rows of the unlabeled training frame.
print(unlabeled_train.shape)
unlabeled_train.head()

(50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [20]:
# Stack the labeled and unlabeled reviews into one frame, labeled rows first.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# halves keep their own 0-based labels, so a lookup such as
# data_train.review[idx] would match two rows for small idx.
# Values missing after the concat (the unlabeled frame has no `sentiment`
# column) are filled with 0.
data_train = pd.concat([train, unlabeled_train], ignore_index=True).fillna(0)
# The index is not written, so the CSV content is the same as before.
data_train.to_csv('./data/data_train.csv', index=False, encoding='utf8')

In [28]:
# Reload the combined frame from ./data/data_train.csv.
data_train = pd.read_csv('./data/data_train.csv', encoding='utf-8')

In [41]:
# Strip the HTML markup from every review in the combined frame and
# normalize it with clean_str, keeping the row order.
train_texts = [
    clean_str(BeautifulSoup(data_train.review[idx], "lxml").get_text())
    for idx in range(data_train.review.shape[0])
]

In [42]:
# First review of the combined set after cleaning.
train_texts[0]

"with all this stuff going down at the moment with mj i 've started listening to his music , watching the odd documentary here and there , watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography , part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj 's feeling towards the press and also the obvious message of drugs are bad m'kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for 20 m

In [43]:
import codecs
from datetime import datetime
# -------------------add row number to query----------------------
# Write one document per line as "_*<row index> <cleaned text>". The leading
# "_*<n>" token carries the document id that Doc_list parses back out.
# The `with` block closes the file even if a write raises.
with codecs.open('./data/' + 'data_train_id.txt', 'w', encoding='utf8') as doc_f:
    for i, reviews in enumerate(train_texts):
        if i % 10000 == 0:
            # progress marker every 10,000 documents
            print(datetime.now(),i)
        doc_f.write('_*{} {}\n'.format(i,reviews))

2018-10-01 16:08:07.578801 0
2018-10-01 16:08:07.620714 10000
2018-10-01 16:08:07.659620 20000
2018-10-01 16:08:07.699507 30000
2018-10-01 16:08:07.739400 40000
2018-10-01 16:08:07.778268 50000
2018-10-01 16:08:07.821178 60000
2018-10-01 16:08:07.860050 70000


In [9]:
from collections import namedtuple

# One training document: its token list and its list of tags.
SentimentDocument = namedtuple('SentimentDocument', 'words tags')


class Doc_list(object):
    """
    Re-iterable view over a tagged corpus file.

    Every line of the file is expected to look like ``_*<int> tok tok ...``.
    The first whitespace-separated field is the tag: its first two characters
    are dropped and the remainder is parsed as an int. The remaining fields
    are the document's words.

    Each call to ``__iter__`` re-opens the file, so one instance can be
    iterated any number of times.

    Parameters
    ----------
    f : str
        Path of the UTF-8 encoded corpus file.
    """
    def __init__(self,f):
        self.f = f

    def __iter__(self):
        # The context manager releases the file handle when iteration ends
        # (or when the generator is closed early) instead of leaking it.
        with codecs.open(self.f,encoding='utf8') as corpus:
            for line in corpus:
                words = line.split()
                tags = [int(words[0][2:])]
                yield SentimentDocument(words[1:],tags)

In [45]:
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# DBOW Doc2Vec (dm=0) on the combined labeled + unlabeled corpus. The model
# is created with epochs=1 and train() is called once per pass, so the
# document vectors can be scored after every pass.
d2v = Doc2Vec(
    dm=0, size=300, window=30, min_count=3, sample=1e-5,
    negative=5, hs=0, workers=6,
    alpha=0.025, min_alpha=0.025, epochs=1,
)
doc_list = Doc_list('./data/' + 'data_train_id.txt')
d2v.build_vocab(doc_list)

# -------------------train dbow doc2vec---------------------------------------------
for pass_no in range(1, 51):
    print(datetime.now(),'pass:',pass_no)
    # a fresh corpus iterable for every pass
    doc_list = Doc_list('./data/' + 'data_train_id.txt')
    d2v.train(doc_list, epochs=d2v.epochs, total_examples=d2v.corpus_count)
    # only the vectors of documents 0..24999 are scored against `labels`
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(25000)])
    scores = cross_val_score(LogisticRegression(C=3), X_d2v, labels,
                             cv=5, n_jobs=4)
    print('dbow',scores,np.mean(scores))
d2v.save('./data/' + 'dbow_d2v_2.model')
print(datetime.now(),'save done')



2018-10-01 16:10:20.332392 pass: 1
dbow [0.5072 0.5086 0.536  0.5364 0.54  ] 0.52564
2018-10-01 16:10:34.063216 pass: 2
dbow [0.5254 0.5406 0.5534 0.5476 0.5322] 0.53984
2018-10-01 16:10:47.913431 pass: 3
dbow [0.6102 0.6186 0.6352 0.6428 0.6246] 0.6262800000000001
2018-10-01 16:11:01.440077 pass: 4
dbow [0.6766 0.6762 0.6744 0.6836 0.663 ] 0.67476
2018-10-01 16:11:15.808891 pass: 5
dbow [0.7278 0.7362 0.7368 0.7376 0.7182] 0.73132
2018-10-01 16:11:30.976330 pass: 6
dbow [0.7726 0.782  0.7904 0.7858 0.7794] 0.78204
2018-10-01 16:11:47.484707 pass: 7
dbow [0.8024 0.8106 0.8138 0.825  0.8062] 0.8116
2018-10-01 16:12:04.575526 pass: 8
dbow [0.8204 0.8252 0.826  0.8326 0.8204] 0.82492
2018-10-01 16:12:22.721560 pass: 9
dbow [0.8388 0.8386 0.8374 0.8494 0.8364] 0.8401200000000001
2018-10-01 16:12:41.326036 pass: 10
dbow [0.8522 0.8472 0.8436 0.8648 0.8486] 0.85128
2018-10-01 16:12:59.494171 pass: 11
dbow [0.8554 0.859  0.8512 0.8672 0.856 ] 0.8577600000000001
2018-10-01 16:13:17.179199 pass

In [46]:
# DM Doc2Vec (dm=1, size=300) on the combined labeled + unlabeled corpus.
# The model is created with epochs=1 and train() is called once per pass, so
# the document vectors can be scored after every pass.
d2v = Doc2Vec(dm=1, size=300, negative=5, hs=0, min_count=3, window=10,sample=1e-5,
              workers=6,alpha=0.05,min_alpha=0.025,epochs=1)
doc_list = Doc_list('./data/' + 'data_train_id.txt')
d2v.build_vocab(doc_list)

# -------------------train dm doc2vec---------------------------------------------
for i in range(50):
    print(datetime.now(),'pass:',i + 1)
    doc_list = Doc_list('./data/' + 'data_train_id.txt')
    d2v.train(doc_list,total_examples=d2v.corpus_count, epochs=d2v.epochs)
    # Only the leading, labeled documents are scored. cross_val_score needs
    # as many rows as there are labels, so the count is taken from `labels`.
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(len(labels))])
    scores = cross_val_score(LogisticRegression(C=3),X_d2v,labels,cv=5,n_jobs=4)
    # Tag the log line with the model actually being trained (DM, not DBOW).
    print('dm',scores,np.mean(scores))
d2v.save('./data/' + 'dm_d2v_2.model')
print(datetime.now(),'save done')



2018-10-01 16:26:06.380045 pass: 1
dbow [0.5012 0.5108 0.5348 0.5692 0.5716] 0.53752
2018-10-01 16:26:21.271950 pass: 2
dbow [0.687  0.7096 0.7366 0.742  0.7192] 0.71888
2018-10-01 16:26:37.673938 pass: 3
dbow [0.7616 0.7714 0.7792 0.7804 0.7806] 0.77464
2018-10-01 16:26:55.519131 pass: 4
dbow [0.7952 0.8038 0.802  0.8002 0.8014] 0.80052
2018-10-01 16:27:14.858856 pass: 5
dbow [0.8188 0.8248 0.8182 0.8238 0.8166] 0.82044
2018-10-01 16:27:34.357457 pass: 6
dbow [0.8326 0.8354 0.827  0.8336 0.8272] 0.83116
2018-10-01 16:27:55.431112 pass: 7
dbow [0.8388 0.8426 0.8374 0.8358 0.8348] 0.83788
2018-10-01 16:28:15.609199 pass: 8
dbow [0.8456 0.8452 0.8426 0.8426 0.8306] 0.84132
2018-10-01 16:28:35.812930 pass: 9
dbow [0.8496 0.8466 0.842  0.8438 0.8334] 0.8430799999999999
2018-10-01 16:28:55.118947 pass: 10
dbow [0.8486 0.8478 0.8422 0.8468 0.8396] 0.8450000000000001
2018-10-01 16:29:12.356837 pass: 11
dbow [0.849  0.8504 0.8436 0.8484 0.8458] 0.84744
2018-10-01 16:29:29.181745 pass: 12
dbow 

In [47]:
# DM Doc2Vec (dm=1, size=400) on the combined labeled + unlabeled corpus.
# The model is created with epochs=1 and train() is called once per pass, so
# the document vectors can be scored after every pass.
d2v = Doc2Vec(dm=1, size=400, negative=5, hs=0, min_count=3, window=10,sample=1e-5,
              workers=6,alpha=0.05,min_alpha=0.025,epochs=1)
doc_list = Doc_list('./data/' + 'data_train_id.txt')
d2v.build_vocab(doc_list)

# -------------------train dm doc2vec---------------------------------------------
for i in range(50):
    print(datetime.now(),'pass:',i + 1)
    doc_list = Doc_list('./data/' + 'data_train_id.txt')
    d2v.train(doc_list,total_examples=d2v.corpus_count, epochs=d2v.epochs)
    # Only the leading, labeled documents are scored. cross_val_score needs
    # as many rows as there are labels, so the count is taken from `labels`.
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(len(labels))])
    scores = cross_val_score(LogisticRegression(C=3),X_d2v,labels,cv=5,n_jobs=4)
    # Tag the log line with the model actually being trained (DM, not DBOW).
    print('dm',scores,np.mean(scores))
d2v.save('./data/' + 'dm_d2v_3.model')
print(datetime.now(),'save done')



2018-10-01 16:43:16.771053 pass: 1
dbow [0.515  0.5092 0.5426 0.564  0.5674] 0.53964
2018-10-01 16:43:32.768131 pass: 2
dbow [0.6874 0.7098 0.7244 0.7334 0.719 ] 0.7148
2018-10-01 16:43:50.502366 pass: 3
dbow [0.757  0.7838 0.7706 0.78   0.7772] 0.7737200000000001
2018-10-01 16:44:09.880086 pass: 4
dbow [0.801  0.804  0.8006 0.812  0.7996] 0.8034399999999999
2018-10-01 16:44:30.825801 pass: 5
dbow [0.8134 0.8302 0.8098 0.8206 0.8172] 0.81824
2018-10-01 16:44:53.859074 pass: 6
dbow [0.8296 0.8424 0.8222 0.8298 0.8262] 0.83004
2018-10-01 16:45:16.009211 pass: 7
dbow [0.8372 0.8458 0.8304 0.8348 0.829 ] 0.83544
2018-10-01 16:45:40.372005 pass: 8
dbow [0.8376 0.8476 0.8346 0.8404 0.8344] 0.8389199999999999
2018-10-01 16:46:05.600841 pass: 9
dbow [0.8446 0.8444 0.8326 0.8416 0.8382] 0.8402800000000001
2018-10-01 16:46:29.385896 pass: 10
dbow [0.8452 0.8474 0.8396 0.8418 0.8404] 0.8428800000000001
2018-10-01 16:46:51.753766 pass: 11
dbow [0.8482 0.8498 0.8382 0.8412 0.8442] 0.84432
2018-10-0

In [12]:
import codecs
from datetime import datetime
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# DM Doc2Vec with dm_concat=1 (size=400) on the combined labeled + unlabeled
# corpus. Only two passes are run here; each pass is one train() call on a
# model created with epochs=1, followed by a scoring step.
d2v = Doc2Vec(dm=1, dm_concat=1, size=400, negative=5, hs=0, min_count=3, window=10, sample=1e-5,
              workers=6, alpha=0.05, min_alpha=0.025, epochs=1)
doc_list = Doc_list('./data/' + 'data_train_id.txt')
d2v.build_vocab(doc_list)

# -------------------train dm doc2vec---------------------------------------------
for i in range(2):
    print(datetime.now(),'pass:',i + 1)
    doc_list = Doc_list('./data/' + 'data_train_id.txt')
    d2v.train(doc_list,total_examples=d2v.corpus_count, epochs=d2v.epochs)
    # Only the leading, labeled documents are scored. cross_val_score needs
    # as many rows as there are labels, so the count is taken from `labels`.
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(len(labels))])
    scores = cross_val_score(LogisticRegression(C=3),X_d2v,labels,cv=5,n_jobs=4)
    # Tag the log line with the model actually being trained (DM, not DBOW).
    print('dm',scores,np.mean(scores))
d2v.save('./data/' + 'dm_d2v_4.model')
print(datetime.now(),'save done')



2018-10-01 17:50:51.895676 pass: 1
dbow [0.5164 0.5164 0.5306 0.5366 0.5196] 0.5239199999999999
2018-10-01 18:00:03.867105 pass: 2
dbow [0.5296 0.542  0.5456 0.5528 0.5382] 0.5416399999999999
2018-10-01 18:10:13.599449 save done


## Dbow-nn

In [17]:
# Number of distinct values in `labels`.
len(pd.value_counts(labels))

2

In [20]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import KFold
from gensim.models import Doc2Vec
from collections import OrderedDict

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
import re

# ----------------------- myfunc -----------------------
def myAcc(y_true,y_pred):
    """
    Accuracy of per-class scores against integer labels.

    Parameters
    ----------
    y_true : array-like
        True class indices, one per row of ``y_pred``.
    y_pred : 2-D array-like
        One row of per-class scores per sample; the column with the largest
        score is taken as the predicted class.

    Returns
    -------
    float
        Fraction of samples whose predicted class equals the true class.
    """
    predicted = np.argmax(y_pred, axis=1)
    hits = (y_true == predicted)
    return np.mean(hits)

# ----------------------- load dataset -----------------
# Labeled reviews plus the Doc2Vec model stored at ./data/dbow_d2v.model.
# NOTE: `labels` is rebound here to a pandas Series, and `model` (the Doc2Vec
# model) is rebound to a Keras Sequential inside the fold loop below.
labeled_train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t', encoding='utf8')
model = Doc2Vec.load('./data/dbow_d2v.model')
# Document vectors for ids 0..24999, one row per document.
X_sp = np.array([model.docvecs[i] for i in range(25000)])
labels = labeled_train['sentiment']

# ----------------------dbowd2v stack -------------------
# Output frame: one row per labeled review, filled in after the fold loop.
df_stack = pd.DataFrame(index=range(len(labeled_train)))
TR = 20000  # first TR rows take part in the K-fold loop; the rest are held out
n = 5       # number of folds

X = X_sp[:TR]
y = labels[:TR]
X_te = X_sp[TR:]
y_te = labels[TR:]

num_class = len(pd.value_counts(labels))
# stack: per-class predictions for each of the first TR rows, written only
#        when the row is in the validation fold.
# stack_te: per-class predictions for the held-out rows, summed over folds.
stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

# KFold is called with the sklearn.cross_validation signature (n, n_folds).
for k,(tr,va) in enumerate(KFold(len(y),n_folds=n)):
    print('{} stack:{}/{}'.format(datetime.now(), k+1, n))
    nb_classes = num_class
    X_train = X[tr]
    y_train = y[tr]
    X_test = X_te
    y_test = y_te
    
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    # One-hot targets for the categorical cross-entropy loss.
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    # A new network per fold: Dense(300) -> Dropout -> tanh -> Dense -> softmax.
    model = Sequential()
    model.add(Dense(300,input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                      optimizer='adadelta',
                      metrics=['accuracy'])

    # validation_data is the held-out block (rows TR onward), not the
    # validation fold, so the val_* figures in the log refer to those rows.
    history = model.fit(X_train, Y_train,shuffle=True,
                            batch_size=128, nb_epoch=35,
                            verbose=2, validation_data=(X_test, Y_test))
    y_pred_va = model.predict_proba(X[va])
    y_pred_te = model.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
# Average the held-out predictions over the n fold models.
stack_te /= n
# Rows 0..TR-1: out-of-fold predictions; remaining rows: fold-averaged ones.
stack_all = np.vstack([stack,stack_te])
# One output column per class.
for l in range(stack_all.shape[1]):
    df_stack['dbowd2v_{}'.format(l)] = stack_all[:,l]
        
df_stack.to_csv('./data/dbowd2v_stack.csv',encoding='utf8',index=None)
print(datetime.now(),'save dbowd2v stack done!')        

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


2018-10-01 23:03:11.215836 stack:1/5




Train on 16000 samples, validate on 5000 samples
Epoch 1/35
 - 3s - loss: 0.3370 - acc: 0.8564 - val_loss: 0.3113 - val_acc: 0.8660
Epoch 2/35
 - 0s - loss: 0.2902 - acc: 0.8782 - val_loss: 0.3125 - val_acc: 0.8692
Epoch 3/35
 - 0s - loss: 0.2865 - acc: 0.8824 - val_loss: 0.3000 - val_acc: 0.8754
Epoch 4/35
 - 0s - loss: 0.2836 - acc: 0.8811 - val_loss: 0.2958 - val_acc: 0.8772
Epoch 5/35
 - 0s - loss: 0.2812 - acc: 0.8817 - val_loss: 0.2953 - val_acc: 0.8766
Epoch 6/35
 - 0s - loss: 0.2775 - acc: 0.8834 - val_loss: 0.3022 - val_acc: 0.8726
Epoch 7/35
 - 0s - loss: 0.2775 - acc: 0.8851 - val_loss: 0.2967 - val_acc: 0.8764
Epoch 8/35
 - 0s - loss: 0.2755 - acc: 0.8859 - val_loss: 0.2962 - val_acc: 0.8762
Epoch 9/35
 - 0s - loss: 0.2736 - acc: 0.8859 - val_loss: 0.2979 - val_acc: 0.8760
Epoch 10/35
 - 0s - loss: 0.2724 - acc: 0.8855 - val_loss: 0.2954 - val_acc: 0.8754
Epoch 11/35
 - 0s - loss: 0.2698 - acc: 0.8862 - val_loss: 0.2953 - val_acc: 0.8754
Epoch 12/35
 - 0s - loss: 0.2671 - a

Epoch 26/35
 - 0s - loss: 0.2093 - acc: 0.9176 - val_loss: 0.2961 - val_acc: 0.8760
Epoch 27/35
 - 0s - loss: 0.2059 - acc: 0.9207 - val_loss: 0.2942 - val_acc: 0.8794
Epoch 28/35
 - 0s - loss: 0.1988 - acc: 0.9223 - val_loss: 0.2999 - val_acc: 0.8766
Epoch 29/35
 - 0s - loss: 0.1962 - acc: 0.9232 - val_loss: 0.2986 - val_acc: 0.8804
Epoch 30/35
 - 0s - loss: 0.1911 - acc: 0.9266 - val_loss: 0.3110 - val_acc: 0.8732
Epoch 31/35
 - 0s - loss: 0.1838 - acc: 0.9287 - val_loss: 0.3126 - val_acc: 0.8750
Epoch 32/35
 - 0s - loss: 0.1763 - acc: 0.9342 - val_loss: 0.3043 - val_acc: 0.8798
Epoch 33/35
 - 0s - loss: 0.1730 - acc: 0.9329 - val_loss: 0.3291 - val_acc: 0.8708
Epoch 34/35
 - 0s - loss: 0.1660 - acc: 0.9380 - val_loss: 0.3058 - val_acc: 0.8798
Epoch 35/35
 - 0s - loss: 0.1602 - acc: 0.9407 - val_loss: 0.3190 - val_acc: 0.8734
va acc: 0.873
te acc: 0.8734
2018-10-01 23:03:53.904864 stack:4/5
Train on 16000 samples, validate on 5000 samples
Epoch 1/35
 - 1s - loss: 0.3381 - acc: 0.8539

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import KFold
from gensim.models import Doc2Vec
from collections import OrderedDict

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
import re

# ----------------------- myfunc -----------------------
def myAcc(y_true,y_pred):
    """
    Accuracy of per-class scores against integer labels.

    Parameters
    ----------
    y_true : array-like
        True class indices, one per row of ``y_pred``.
    y_pred : 2-D array-like
        One row of per-class scores per sample; the column with the largest
        score is taken as the predicted class.

    Returns
    -------
    float
        Fraction of samples whose predicted class equals the true class.
    """
    predicted = np.argmax(y_pred, axis=1)
    hits = (y_true == predicted)
    return np.mean(hits)

# ----------------------- load dataset -----------------
# Labeled reviews plus the Doc2Vec model stored at ./data/dbow_d2v_2.model.
# NOTE: `labels` is rebound here to a pandas Series, and `model` (the Doc2Vec
# model) is rebound to a Keras Sequential inside the fold loop below.
labeled_train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t', encoding='utf8')
model = Doc2Vec.load('./data/dbow_d2v_2.model')
# Document vectors for ids 0..24999, one row per document.
X_sp = np.array([model.docvecs[i] for i in range(25000)])
labels = labeled_train['sentiment']

# ----------------------dbowd2v stack -------------------
# Output frame: one row per labeled review, filled in after the fold loop.
df_stack = pd.DataFrame(index=range(len(labeled_train)))
TR = 20000  # first TR rows take part in the K-fold loop; the rest are held out
n = 5       # number of folds

X = X_sp[:TR]
y = labels[:TR]
X_te = X_sp[TR:]
y_te = labels[TR:]

num_class = len(pd.value_counts(labels))
# stack: per-class predictions for each of the first TR rows, written only
#        when the row is in the validation fold.
# stack_te: per-class predictions for the held-out rows, summed over folds.
stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

# KFold is called with the sklearn.cross_validation signature (n, n_folds).
for k,(tr,va) in enumerate(KFold(len(y),n_folds=n)):
    print('{} stack:{}/{}'.format(datetime.now(), k+1, n))
    nb_classes = num_class
    X_train = X[tr]
    y_train = y[tr]
    X_test = X_te
    y_test = y_te
    
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    # One-hot targets for the categorical cross-entropy loss.
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    # A new network per fold: Dense(300) -> Dropout -> tanh -> Dense -> softmax.
    model = Sequential()
    model.add(Dense(300,input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                      optimizer='adadelta',
                      metrics=['accuracy'])

    # validation_data is the held-out block (rows TR onward), not the
    # validation fold, so the val_* figures in the log refer to those rows.
    history = model.fit(X_train, Y_train,shuffle=True,
                            batch_size=128, nb_epoch=35,
                            verbose=2, validation_data=(X_test, Y_test))
    y_pred_va = model.predict_proba(X[va])
    y_pred_te = model.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
# Average the held-out predictions over the n fold models.
stack_te /= n
# Rows 0..TR-1: out-of-fold predictions; remaining rows: fold-averaged ones.
stack_all = np.vstack([stack,stack_te])
# One output column per class.
for l in range(stack_all.shape[1]):
    df_stack['dbowd2v_2_{}'.format(l)] = stack_all[:,l]
        
df_stack.to_csv('./data/dbowd2v_stack_2.csv',encoding='utf8',index=None)
print(datetime.now(),'save dbowd2v stack done!')        

2018-10-01 23:08:32.398921 stack:1/5




Train on 16000 samples, validate on 5000 samples
Epoch 1/35
 - 1s - loss: 0.3356 - acc: 0.8546 - val_loss: 0.3361 - val_acc: 0.8592
Epoch 2/35
 - 0s - loss: 0.2896 - acc: 0.8794 - val_loss: 0.3058 - val_acc: 0.8736
Epoch 3/35
 - 0s - loss: 0.2856 - acc: 0.8789 - val_loss: 0.2992 - val_acc: 0.8778
Epoch 4/35
 - 0s - loss: 0.2819 - acc: 0.8830 - val_loss: 0.2971 - val_acc: 0.8794
Epoch 5/35
 - 0s - loss: 0.2805 - acc: 0.8826 - val_loss: 0.3046 - val_acc: 0.8736
Epoch 6/35
 - 0s - loss: 0.2773 - acc: 0.8834 - val_loss: 0.2942 - val_acc: 0.8804
Epoch 7/35
 - 0s - loss: 0.2753 - acc: 0.8844 - val_loss: 0.2960 - val_acc: 0.8810
Epoch 8/35
 - 0s - loss: 0.2730 - acc: 0.8862 - val_loss: 0.2950 - val_acc: 0.8818
Epoch 9/35
 - 0s - loss: 0.2712 - acc: 0.8881 - val_loss: 0.2957 - val_acc: 0.8816
Epoch 10/35
 - 0s - loss: 0.2684 - acc: 0.8876 - val_loss: 0.3019 - val_acc: 0.8766
Epoch 11/35
 - 0s - loss: 0.2678 - acc: 0.8886 - val_loss: 0.3053 - val_acc: 0.8728
Epoch 12/35
 - 0s - loss: 0.2666 - a

Epoch 26/35
 - 0s - loss: 0.2161 - acc: 0.9134 - val_loss: 0.3043 - val_acc: 0.8752
Epoch 27/35
 - 0s - loss: 0.2107 - acc: 0.9167 - val_loss: 0.3144 - val_acc: 0.8720
Epoch 28/35
 - 0s - loss: 0.2037 - acc: 0.9194 - val_loss: 0.3052 - val_acc: 0.8764
Epoch 29/35
 - 0s - loss: 0.1972 - acc: 0.9233 - val_loss: 0.3050 - val_acc: 0.8776
Epoch 30/35
 - 0s - loss: 0.1886 - acc: 0.9257 - val_loss: 0.3106 - val_acc: 0.8784
Epoch 31/35
 - 0s - loss: 0.1869 - acc: 0.9286 - val_loss: 0.3108 - val_acc: 0.8788
Epoch 32/35
 - 0s - loss: 0.1790 - acc: 0.9324 - val_loss: 0.3178 - val_acc: 0.8742
Epoch 33/35
 - 0s - loss: 0.1705 - acc: 0.9369 - val_loss: 0.3206 - val_acc: 0.8774
Epoch 34/35
 - 0s - loss: 0.1676 - acc: 0.9364 - val_loss: 0.3439 - val_acc: 0.8686
Epoch 35/35
 - 0s - loss: 0.1597 - acc: 0.9401 - val_loss: 0.3276 - val_acc: 0.8752
va acc: 0.8745
te acc: 0.8752
2018-10-01 23:09:14.120132 stack:4/5
Train on 16000 samples, validate on 5000 samples
Epoch 1/35
 - 1s - loss: 0.3280 - acc: 0.863

In [22]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import KFold
from gensim.models import Doc2Vec
from collections import OrderedDict

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
import re

# ----------------------- myfunc -----------------------
def myAcc(y_true,y_pred):
    """
    Accuracy of per-class scores against integer labels.

    Parameters
    ----------
    y_true : array-like
        True class indices, one per row of ``y_pred``.
    y_pred : 2-D array-like
        One row of per-class scores per sample; the column with the largest
        score is taken as the predicted class.

    Returns
    -------
    float
        Fraction of samples whose predicted class equals the true class.
    """
    predicted = np.argmax(y_pred, axis=1)
    hits = (y_true == predicted)
    return np.mean(hits)

# ----------------------- load dataset -----------------
# Labeled reviews plus the Doc2Vec model stored at ./data/dm_d2v.model.
# NOTE: `labels` is rebound here to a pandas Series, and `model` (the Doc2Vec
# model) is rebound to a Keras Sequential inside the fold loop below.
labeled_train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t', encoding='utf8')
model = Doc2Vec.load('./data/dm_d2v.model')
# Document vectors for ids 0..24999, one row per document.
X_sp = np.array([model.docvecs[i] for i in range(25000)])
labels = labeled_train['sentiment']

# ----------------------dmd2v stack -------------------
# Output frame: one row per labeled review, filled in after the fold loop.
df_stack = pd.DataFrame(index=range(len(labeled_train)))
TR = 20000  # first TR rows take part in the K-fold loop; the rest are held out
n = 5       # number of folds

X = X_sp[:TR]
y = labels[:TR]
X_te = X_sp[TR:]
y_te = labels[TR:]

num_class = len(pd.value_counts(labels))
# stack: per-class predictions for each of the first TR rows, written only
#        when the row is in the validation fold.
# stack_te: per-class predictions for the held-out rows, summed over folds.
stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

# KFold is called with the sklearn.cross_validation signature (n, n_folds).
for k,(tr,va) in enumerate(KFold(len(y),n_folds=n)):
    print('{} stack:{}/{}'.format(datetime.now(), k+1, n))
    nb_classes = num_class
    X_train = X[tr]
    y_train = y[tr]
    X_test = X_te
    y_test = y_te

    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    # One-hot targets for the categorical cross-entropy loss.
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    # A new network per fold: Dense(300) -> Dropout -> tanh -> Dense -> softmax.
    model = Sequential()
    model.add(Dense(300,input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                      optimizer='adadelta',
                      metrics=['accuracy'])

    # validation_data is the held-out block (rows TR onward), not the
    # validation fold, so the val_* figures in the log refer to those rows.
    history = model.fit(X_train, Y_train,shuffle=True,
                            batch_size=128, nb_epoch=35,
                            verbose=2, validation_data=(X_test, Y_test))
    y_pred_va = model.predict_proba(X[va])
    y_pred_te = model.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
# Average the held-out predictions over the n fold models.
stack_te /= n
# Rows 0..TR-1: out-of-fold predictions; remaining rows: fold-averaged ones.
stack_all = np.vstack([stack,stack_te])
# One output column per class. The prefix is 'dmd2v' because these features
# come from the DM model and are saved to dmd2v_stack.csv; a 'dbowd2v' prefix
# would collide with the column names of the DBOW stack file.
for l in range(stack_all.shape[1]):
    df_stack['dmd2v_{}'.format(l)] = stack_all[:,l]

df_stack.to_csv('./data/dmd2v_stack.csv',encoding='utf8',index=None)
print(datetime.now(),'save dmd2v stack done!')

2018-10-01 23:13:51.531822 stack:1/5




Train on 16000 samples, validate on 5000 samples
Epoch 1/35
 - 1s - loss: 0.3700 - acc: 0.8348 - val_loss: 0.3387 - val_acc: 0.8568
Epoch 2/35
 - 0s - loss: 0.3183 - acc: 0.8671 - val_loss: 0.3347 - val_acc: 0.8594
Epoch 3/35
 - 0s - loss: 0.2994 - acc: 0.8737 - val_loss: 0.3407 - val_acc: 0.8592
Epoch 4/35
 - 0s - loss: 0.2872 - acc: 0.8799 - val_loss: 0.3659 - val_acc: 0.8434
Epoch 5/35
 - 0s - loss: 0.2730 - acc: 0.8855 - val_loss: 0.3500 - val_acc: 0.8558
Epoch 6/35
 - 0s - loss: 0.2599 - acc: 0.8918 - val_loss: 0.3506 - val_acc: 0.8574
Epoch 7/35
 - 0s - loss: 0.2457 - acc: 0.8992 - val_loss: 0.3408 - val_acc: 0.8632
Epoch 8/35
 - 0s - loss: 0.2280 - acc: 0.9077 - val_loss: 0.3479 - val_acc: 0.8596
Epoch 9/35
 - 0s - loss: 0.2125 - acc: 0.9161 - val_loss: 0.3466 - val_acc: 0.8590
Epoch 10/35
 - 0s - loss: 0.1951 - acc: 0.9240 - val_loss: 0.3651 - val_acc: 0.8574
Epoch 11/35
 - 0s - loss: 0.1798 - acc: 0.9291 - val_loss: 0.3685 - val_acc: 0.8540
Epoch 12/35
 - 0s - loss: 0.1565 - a

Epoch 26/35
 - 0s - loss: 0.0353 - acc: 0.9890 - val_loss: 0.5152 - val_acc: 0.8618
Epoch 27/35
 - 0s - loss: 0.0306 - acc: 0.9924 - val_loss: 0.5460 - val_acc: 0.8528
Epoch 28/35
 - 0s - loss: 0.0310 - acc: 0.9909 - val_loss: 0.5490 - val_acc: 0.8542
Epoch 29/35
 - 0s - loss: 0.0261 - acc: 0.9929 - val_loss: 0.5365 - val_acc: 0.8602
Epoch 30/35
 - 0s - loss: 0.0223 - acc: 0.9946 - val_loss: 0.5821 - val_acc: 0.8566
Epoch 31/35
 - 0s - loss: 0.0235 - acc: 0.9930 - val_loss: 0.6046 - val_acc: 0.8556
Epoch 32/35
 - 0s - loss: 0.0195 - acc: 0.9949 - val_loss: 0.5684 - val_acc: 0.8598
Epoch 33/35
 - 0s - loss: 0.0175 - acc: 0.9954 - val_loss: 0.5913 - val_acc: 0.8606
Epoch 34/35
 - 0s - loss: 0.0178 - acc: 0.9951 - val_loss: 0.6117 - val_acc: 0.8528
Epoch 35/35
 - 0s - loss: 0.0147 - acc: 0.9958 - val_loss: 0.6080 - val_acc: 0.8594
va acc: 0.84925
te acc: 0.8594
2018-10-01 23:14:33.989746 stack:4/5
Train on 16000 samples, validate on 5000 samples
Epoch 1/35
 - 1s - loss: 0.3729 - acc: 0.83

In [23]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import KFold
from gensim.models import Doc2Vec
from collections import OrderedDict

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
import re

# ----------------------- myfunc -----------------------
def myAcc(y_true, y_pred):
    """Return the share of samples whose highest-scoring class equals the label.

    y_true -- class labels, one per sample, compared element-wise with the
              predicted class indices.
    y_pred -- 2-D array of per-class scores (one row per sample); the
              predicted class is the arg-max along axis 1.
    """
    predicted_labels = np.argmax(y_pred, axis=1)
    hits = y_true == predicted_labels
    return np.mean(hits)

# ----------------------- load dataset -----------------
# Labeled reviews; only the `sentiment` column is used below (as the target).
labeled_train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t', encoding='utf8')
# The Doc2Vec model gets its own name: `model` is rebound to a Keras network
# inside the fold loop, and sharing one name for both objects hides which one
# a later cell is actually looking at.
d2v_model = Doc2Vec.load('./data/dm_d2v_2.model')
# One document vector per labeled review, looked up by integer position
# (assumes the doc tags are the row positions 0..len(labeled_train)-1 -- TODO confirm
# against the cell that trained the model).
X_sp = np.array([d2v_model.docvecs[i] for i in range(len(labeled_train))])
labels = labeled_train['sentiment']

# ----------------------dmd2v stack -------------------
df_stack = pd.DataFrame(index=range(len(labeled_train)))
TR = 20000  # first TR rows: cross-validated part; remaining rows: hold-out part
n = 5       # number of folds

X = X_sp[:TR]
y = labels[:TR]
X_te = X_sp[TR:]
y_te = labels[TR:]

num_class = labels.nunique()
nb_classes = num_class
# Out-of-fold class probabilities for the first TR rows, and the running sum
# (averaged after the loop) of the per-fold predictions for the hold-out rows.
stack = np.zeros((X.shape[0], num_class))
stack_te = np.zeros((X_te.shape[0], num_class))

# The hold-out inputs/targets are the same for every fold, so build them once.
X_test = X_te.astype('float32')
y_test = y_te
Y_test = np_utils.to_categorical(y_test, nb_classes)

# NOTE(review): no random seed is set, so fold results differ between runs.
for k,(tr,va) in enumerate(KFold(len(y),n_folds=n)):
    print('{} stack:{}/{}'.format(datetime.now(), k+1, n))
    # `tr` / `va` are positional row indices, so the label Series is indexed
    # with .iloc instead of relying on its index labels being 0..N-1.
    X_train = X[tr].astype('float32')
    y_train = y.iloc[tr]
    Y_train = np_utils.to_categorical(y_train, nb_classes)

    # A fresh one-hidden-layer classifier per fold.
    model = Sequential()
    model.add(Dense(300,input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                      optimizer='adadelta',
                      metrics=['accuracy'])

    # `epochs` is the Keras 2 name of this argument; `nb_epoch` is the
    # deprecated Keras 1 spelling. No callbacks are passed, so the hold-out
    # data given as validation_data is only used for the logged val_* metrics.
    history = model.fit(X_train, Y_train,shuffle=True,
                            batch_size=128, epochs=25,
                            verbose=2, validation_data=(X_test, Y_test))
    y_pred_va = model.predict_proba(X[va])
    y_pred_te = model.predict_proba(X_te)
    print('va acc:',myAcc(y.iloc[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
# Average the n per-fold predictions for the hold-out rows.
stack_te /= n
stack_all = np.vstack([stack,stack_te])
# NOTE(review): the column prefix says "dbow" although this cell stacks the DM
# model and writes dmd2v_stack_2.csv -- looks like a copy-paste leftover. Left
# unchanged because readers of the CSV are not visible here; confirm and rename.
for l in range(stack_all.shape[1]):
    df_stack['dbowd2v_2_{}'.format(l)] = stack_all[:,l]

df_stack.to_csv('./data/dmd2v_stack_2.csv',encoding='utf8',index=False)
print(datetime.now(),'save dmd2v stack done!')

2018-10-01 23:21:47.005353 stack:1/5




Train on 16000 samples, validate on 5000 samples
Epoch 1/25
 - 1s - loss: 0.3693 - acc: 0.8368 - val_loss: 0.3633 - val_acc: 0.8480
Epoch 2/25
 - 0s - loss: 0.3057 - acc: 0.8709 - val_loss: 0.3474 - val_acc: 0.8526
Epoch 3/25
 - 0s - loss: 0.2790 - acc: 0.8826 - val_loss: 0.3526 - val_acc: 0.8570
Epoch 4/25
 - 0s - loss: 0.2554 - acc: 0.8939 - val_loss: 0.3453 - val_acc: 0.8602
Epoch 5/25
 - 0s - loss: 0.2317 - acc: 0.9067 - val_loss: 0.3548 - val_acc: 0.8520
Epoch 6/25
 - 0s - loss: 0.2057 - acc: 0.9157 - val_loss: 0.3680 - val_acc: 0.8544
Epoch 7/25
 - 0s - loss: 0.1808 - acc: 0.9308 - val_loss: 0.3748 - val_acc: 0.8514
Epoch 8/25
 - 0s - loss: 0.1545 - acc: 0.9442 - val_loss: 0.3790 - val_acc: 0.8594
Epoch 9/25
 - 0s - loss: 0.1347 - acc: 0.9520 - val_loss: 0.3904 - val_acc: 0.8560
Epoch 10/25
 - 0s - loss: 0.1121 - acc: 0.9626 - val_loss: 0.4123 - val_acc: 0.8520
Epoch 11/25
 - 0s - loss: 0.0938 - acc: 0.9687 - val_loss: 0.4273 - val_acc: 0.8524
Epoch 12/25
 - 0s - loss: 0.0808 - a

Epoch 20/25
 - 0s - loss: 0.0283 - acc: 0.9919 - val_loss: 0.5242 - val_acc: 0.8576
Epoch 21/25
 - 0s - loss: 0.0246 - acc: 0.9928 - val_loss: 0.5559 - val_acc: 0.8570
Epoch 22/25
 - 0s - loss: 0.0207 - acc: 0.9949 - val_loss: 0.5651 - val_acc: 0.8576
Epoch 23/25
 - 0s - loss: 0.0176 - acc: 0.9958 - val_loss: 0.5769 - val_acc: 0.8534
Epoch 24/25
 - 0s - loss: 0.0168 - acc: 0.9963 - val_loss: 0.5784 - val_acc: 0.8578
Epoch 25/25
 - 0s - loss: 0.0143 - acc: 0.9970 - val_loss: 0.6166 - val_acc: 0.8526
va acc: 0.84325
te acc: 0.8526
2018-10-01 23:22:30.078991 stack:5/5
Train on 16000 samples, validate on 5000 samples
Epoch 1/25
 - 1s - loss: 0.3765 - acc: 0.8347 - val_loss: 0.3454 - val_acc: 0.8536
Epoch 2/25
 - 0s - loss: 0.3047 - acc: 0.8709 - val_loss: 0.3452 - val_acc: 0.8530
Epoch 3/25
 - 0s - loss: 0.2795 - acc: 0.8829 - val_loss: 0.3395 - val_acc: 0.8556
Epoch 4/25
 - 0s - loss: 0.2570 - acc: 0.8937 - val_loss: 0.3784 - val_acc: 0.8476
Epoch 5/25
 - 0s - loss: 0.2326 - acc: 0.9068 -

In [24]:
# Sanity check: one row per labeled review, one column per class probability.
df_stack.shape

(25000, 2)

In [25]:
# Preview the first rows of the stacked class-probability features.
df_stack.head()

Unnamed: 0,dbowd2v_0,dbowd2v_1
0,0.08995,0.91005
1,0.004742,0.995258
2,0.892094,0.107906
3,0.999977,2.3e-05
4,0.991922,0.008078


# Part 3

用全部数据进行Doc2vec的训练，以及对测试集进行预测

In [31]:
# Stack the three frames row-wise, replace every missing value with 0, and
# save the result without the index column.
# NOTE(review): `train`, `unlabeled_train` and `test` are not defined in this
# part of the notebook -- confirm they exist on a fresh kernel (Restart & Run All).
# NOTE(review): fillna(0) presumably turns the missing `sentiment` of the
# unlabeled/test rows into 0, which is indistinguishable from a negative
# label -- confirm downstream code only trusts `sentiment` for labeled rows.
data_all = pd.concat([train,unlabeled_train,test]).fillna(0)
data_all.to_csv('./data/data_all.csv',index=None,encoding='utf8')

In [35]:
# Reload the combined table from disk; rows get a fresh default integer index.
data_all = pd.read_csv('./data/data_all.csv', encoding='utf-8')

In [37]:
# Preview the first rows of the combined table.
data_all.head()

Unnamed: 0,id,review,sentiment
0,"""5814_8""","""With all this stuff going down at the moment ...",1.0
1,"""2381_9""","""\""The Classic War of the Worlds\"" by Timothy ...",1.0
2,"""7759_3""","""The film starts with a manager (Nicholas Bell...",0.0
3,"""3630_4""","""It must be assumed that those who praised thi...",0.0
4,"""9495_8""","""Superbly trashy and wondrously unpretentious ...",1.0


In [38]:
# Preview the last rows of the combined table.
data_all.tail()

Unnamed: 0,id,review,sentiment
99995,"""2155_10""","""Sony Pictures Classics, I'm looking at you! S...",0.0
99996,"""59_10""","""I always felt that Ms. Merkerson had never go...",0.0
99997,"""2531_1""","""I was so disappointed in this movie. I am ver...",0.0
99998,"""7772_8""","""From the opening sequence, filled with black ...",0.0
99999,"""11465_10""","""This is a great horror film for people who do...",0.0


"watching time chasers , it obvious that it was made by a bunch of friends maybe they were sitting around one day in film school and said , hey , let 's pool our money together and make a really bad movie ! or something like that what ever they said , they still ended up making a really bad movie dull story , bad script , lame acting , poor cinematography , bottom of the barrel stock music , etc all corners were cut , except the one that would have prevented this film 's release life 's like that"