In [1]:
import sys
sys.path.insert(0, '..')
import os

import numpy as np
import pandas as pd

In [2]:
import zipfile
with zipfile.ZipFile('./data/labeledTrainData.tsv.zip', 'r') as z:
    z.extractall('./data/')

In [3]:
data_train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t')

In [4]:
print(data_train.shape)
data_train.head(5)

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
data_train['review'][0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [6]:
data_train.review.shape[0]

25000

In [7]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets 
    Every dataset is lower cased
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()

stops = set(stopwords.words("english"))  
def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, "lxml").get_text()  
    words = clean_str(review_text)      
    meaningful_words = [w for w in words.split(' ') if not w in stops]   
    return(" ".join(meaningful_words))   

In [8]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
review_to_words(data_train['review'][0])

"stuff going moment mj 've started listening music , watching odd documentary , watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography , part feature film remember going see cinema originally released subtle messages mj 's feeling towards press also obvious message drugs bad m'kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts 20 minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans \\? nah , joe pesci 's character ranted wanted people know supplying drugs etc dunno , maybe hates mj 's music lots cool things like mj turning car robot whole speed demon sequence also , director must patience saint came filming kiddy bad sequ

In [10]:
texts = []
labels = []
for idx in range(data_train.review.shape[0]):
    text = review_to_words(data_train.review[idx])
    texts.append(text)
    labels.append(data_train.sentiment[idx])

In [11]:
texts[0]

"stuff going moment mj 've started listening music , watching odd documentary , watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography , part feature film remember going see cinema originally released subtle messages mj 's feeling towards press also obvious message drugs bad m'kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts 20 minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans \\? nah , joe pesci 's character ranted wanted people know supplying drugs etc dunno , maybe hates mj 's music lots cool things like mj turning car robot whole speed demon sequence also , director must patience saint came filming kiddy bad sequ

# Machine Learning

## CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=1.0, min_df=5)
train_data_features = vectorizer.fit_transform(texts)

In [16]:
train_data_features.shape

(25000, 27215)

In [132]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators =250, n_jobs=-1)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8614

In [124]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=500)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.863

In [142]:
clf = AdaBoostClassifier(n_estimators=1000)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8642

In [143]:
clf = AdaBoostClassifier(n_estimators=2000)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8606

In [135]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=500)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8558

In [137]:
clf = GradientBoostingClassifier(n_estimators=1000)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8676

In [138]:
clf = GradientBoostingClassifier(n_estimators=1500)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8686

In [139]:
clf = GradientBoostingClassifier(n_estimators=2000)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8722

In [140]:
clf = GradientBoostingClassifier(n_estimators=3000)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8748

In [141]:
clf = GradientBoostingClassifier(n_estimators=5000)
clf.fit(train_data_features[:20000],labels[:20000]).score(train_data_features[20000:], labels[20000:])

0.8772

## TfidfVectorizer

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=3)
tfidf_features = tfidf_vectorizer.fit_transform(texts)

In [23]:
tfidf_features.shape

(25000, 27215)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestClassifier(n_estimators = 100),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.8508 0.8534 0.8414 0.8588 0.8526] 0.8514000000000002


In [58]:
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(C=3),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.8962 0.8986 0.8892 0.9014 0.8902] 0.89512


In [72]:
from sklearn.naive_bayes import MultinomialNB

scores = cross_val_score(MultinomialNB(alpha=1),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.8636 0.8678 0.8642 0.8718 0.8596] 0.8654


In [84]:
from sklearn.naive_bayes import BernoulliNB

scores = cross_val_score(BernoulliNB(alpha=1),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.842  0.8448 0.8532 0.8526 0.8432] 0.8471599999999999


In [94]:
from sklearn.neighbors import KNeighborsClassifier

scores = cross_val_score(KNeighborsClassifier(n_neighbors=100),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.8124 0.8186 0.8134 0.8256 0.8112] 0.81624


In [107]:
from sklearn.svm import LinearSVC

scores = cross_val_score(LinearSVC(C=0.3),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.8988 0.8998 0.8872 0.9004 0.8918] 0.8956


In [95]:
from sklearn.svm import SVC

scores = cross_val_score(SVC(C=1),
                         tfidf_features, labels, cv=5, n_jobs=-1)
print(scores,np.mean(scores))

[0.615  0.6334 0.674  0.6322 0.6326] 0.63744


## GridSearchCV

In [49]:
from sklearn.model_selection import GridSearchCV, cross_val_score
Cs = np.array([0.01,0.1,1,2,3,4,5])
lr = LogisticRegression()
clf = GridSearchCV(estimator=lr, param_grid=dict(C=Cs),
                    n_jobs=-1)
clf.fit(tfidf_features[:20000], labels[:20000])

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([0.01, 0.1 , 1.  , 2.  , 3.  , 4.  , 5.  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
print(clf.best_score_)
print(clf.best_estimator_)
print(clf.best_estimator_.C)

0.8884
LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
5.0


In [52]:
parameters = {'C':[1,3,5,8,10], 'tol':[1e-6,1e-5,1e-4,1e-3]}
clf = GridSearchCV(lr, parameters, cv=5, n_jobs=-1)
clf.fit(tfidf_features[:20000], labels[:20000])

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [1, 3, 5, 8, 10], 'tol': [1e-06, 1e-05, 0.0001, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
print(clf.best_score_)
print(clf.best_estimator_)
print(clf.best_estimator_.C)

0.891
LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)
5


## Pipeline

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
text_clf = Pipeline([('vect', TfidfVectorizer(max_df=0.95, min_df=3)),
                     ('clf', LogisticRegression(C=3)),
])
text_clf.fit(texts[:20000], labels[:20000])  
text_clf.score(texts[20000:],labels[20000:])

0.8892

In [25]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'clf__C': (1,3),
}

In [26]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(texts[:20000], labels[:20000])
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.89045
clf__C: 3
vect__ngram_range: (1, 2)


## Stacking

In [145]:
from sklearn.model_selection import KFold
from datetime import datetime

def myAcc(y_true,y_pred):
    y_pred = np.argmax(y_pred,axis=1)
    return np.mean(y_true == y_pred)

df_stack = pd.DataFrame(index=range(len(data_train)))

tfv = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)
X_sp = tfv.fit_transform(texts)

lable = np.array(data_train['sentiment'])

num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
n = 5

X = X_sp[:TR]
y = lable[:TR]
X_te = X_sp[TR:]
y_te = lable[TR:]

stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

for i,(tr,va) in enumerate(KFold(n_splits=n).split(X)):
    print('%s stack:%d/%d'%(str(datetime.now()),i+1,n))
    clf = LogisticRegression(C=3)
    clf.fit(X[tr],y[tr])
    y_pred_va = clf.predict_proba(X[va])
    y_pred_te = clf.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
stack_te /= n
stack_all = np.vstack([stack,stack_te])
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:,i]
df_stack.to_csv('./data/' + 'tfidf_lr.csv',index=None,encoding='utf8')
print(datetime.now(),'save lr stack done!')

2018-09-26 22:33:18.870093 stack:1/5
va acc: 0.8905
te acc: 0.8888
2018-09-26 22:33:19.202204 stack:2/5
va acc: 0.89575
te acc: 0.89
2018-09-26 22:33:19.523345 stack:3/5
va acc: 0.8845
te acc: 0.8852
2018-09-26 22:33:19.846480 stack:4/5
va acc: 0.8895
te acc: 0.889
2018-09-26 22:33:20.169615 stack:5/5
va acc: 0.89175
te acc: 0.8844
2018-09-26 22:33:20.574535 save lr stack done!


In [60]:
from sklearn.model_selection import KFold
from datetime import datetime

def myAcc(y_true,y_pred):
    y_pred = np.argmax(y_pred,axis=1)
    return np.mean(y_true == y_pred)

df_stack = pd.DataFrame(index=range(len(data_train)))

tfv = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)
X_sp = tfv.fit_transform(texts)

lable = np.array(data_train['sentiment'])

num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
n = 5

X = X_sp[:TR]
y = lable[:TR]
X_te = X_sp[TR:]
y_te = lable[TR:]

stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

for i,(tr,va) in enumerate(KFold(n_splits=n).split(X)):
    print('%s stack:%d/%d'%(str(datetime.now()),i+1,n))
    clf = LogisticRegression(C=1)
    clf.fit(X[tr],y[tr])
    y_pred_va = clf.predict_proba(X[va])
    y_pred_te = clf.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
stack_te /= n
stack_all = np.vstack([stack,stack_te])
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr1_{}'.format(i)] = stack_all[:,i]
df_stack.to_csv('./data/' + 'tfidf_lr1.csv',index=None,encoding='utf8')
print(datetime.now(),'save lr1 stack done!')

2018-09-27 17:41:06.863807 stack:1/5
va acc: 0.88875
te acc: 0.8802
2018-09-27 17:41:07.103139 stack:2/5
va acc: 0.8955
te acc: 0.8834
2018-09-27 17:41:07.342498 stack:3/5
va acc: 0.8805
te acc: 0.8834
2018-09-27 17:41:07.578555 stack:4/5
va acc: 0.8825
te acc: 0.887
2018-09-27 17:41:07.817914 stack:5/5
va acc: 0.89125
te acc: 0.882
2018-09-27 17:41:08.127642 save lr1 stack done!


In [64]:
from sklearn.model_selection import KFold
from datetime import datetime

def myAcc(y_true,y_pred):
    y_pred = np.argmax(y_pred,axis=1)
    return np.mean(y_true == y_pred)

df_stack = pd.DataFrame(index=range(len(data_train)))

tfv = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)
X_sp = tfv.fit_transform(texts)

lable = np.array(data_train['sentiment'])

num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
n = 5

X = X_sp[:TR]
y = lable[:TR]
X_te = X_sp[TR:]
y_te = lable[TR:]

stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

for i,(tr,va) in enumerate(KFold(n_splits=n).split(X)):
    print('%s stack:%d/%d'%(str(datetime.now()),i+1,n))
    clf = LogisticRegression(C=5)
    clf.fit(X[tr],y[tr])
    y_pred_va = clf.predict_proba(X[va])
    y_pred_te = clf.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
stack_te /= n
stack_all = np.vstack([stack,stack_te])
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr2_{}'.format(i)] = stack_all[:,i]
df_stack.to_csv('./data/' + 'tfidf_lr2.csv',index=None,encoding='utf8')
print(datetime.now(),'save lr2 stack done!')

2018-09-27 17:44:03.558771 stack:1/5
va acc: 0.89075
te acc: 0.887
2018-09-27 17:44:03.922797 stack:2/5
va acc: 0.89675
te acc: 0.8894
2018-09-27 17:44:04.334694 stack:3/5
va acc: 0.88325
te acc: 0.884
2018-09-27 17:44:04.753577 stack:4/5
va acc: 0.88925
te acc: 0.8896
2018-09-27 17:44:05.164477 stack:5/5
va acc: 0.8925
te acc: 0.8828
2018-09-27 17:44:05.690091 save lr2 stack done!


In [47]:
df_stack.head(5)

Unnamed: 0,tfidf_lr_0,tfidf_lr_1
0,0.549401,0.450599
1,0.16572,0.83428
2,0.723276,0.276724
3,0.746109,0.253891
4,0.745254,0.254746


In [144]:
from sklearn.model_selection import KFold
from datetime import datetime

def myAcc(y_true,y_pred):
    y_pred = np.argmax(y_pred,axis=1)
    return np.mean(y_true == y_pred)

df_stack = pd.DataFrame(index=range(len(data_train)))

tfv = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)
X_sp = tfv.fit_transform(texts)

lable = np.array(data_train['sentiment'])

num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
n = 5

X = X_sp[:TR]
y = lable[:TR]
X_te = X_sp[TR:]
y_te = lable[TR:]

stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

for i,(tr,va) in enumerate(KFold(n_splits=n).split(X)):
    print('%s stack:%d/%d'%(str(datetime.now()),i+1,n))
    clf = MultinomialNB(alpha=1)
    clf.fit(X[tr],y[tr])
    y_pred_va = clf.predict_proba(X[va])
    y_pred_te = clf.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
stack_te /= n
stack_all = np.vstack([stack,stack_te])
for i in range(stack_all.shape[1]):
    df_stack['tfidf_nb_{}'.format(i)] = stack_all[:,i]
df_stack.to_csv('./data/' + 'tfidf_nb.csv',index=None,encoding='utf8')
print(datetime.now(),'save nb stack done!')

2018-09-26 22:32:55.138456 stack:1/5
va acc: 0.8605
te acc: 0.8546
2018-09-26 22:32:55.176384 stack:2/5
va acc: 0.863
te acc: 0.8588
2018-09-26 22:32:55.212259 stack:3/5
va acc: 0.86225
te acc: 0.8578
2018-09-26 22:32:55.248190 stack:4/5
va acc: 0.86675
te acc: 0.8626
2018-09-26 22:32:55.284067 stack:5/5
va acc: 0.8665
te acc: 0.86
2018-09-26 22:32:55.410785 save nb stack done!


In [146]:
from sklearn.model_selection import KFold
from datetime import datetime

def myAcc(y_true,y_pred):
    y_pred = np.argmax(y_pred,axis=1)
    return np.mean(y_true == y_pred)

df_stack = pd.DataFrame(index=range(len(data_train)))

tfv = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)
X_sp = tfv.fit_transform(texts)

lable = np.array(data_train['sentiment'])

num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
n = 5

X = X_sp[:TR]
y = lable[:TR]
X_te = X_sp[TR:]
y_te = lable[TR:]

stack = np.zeros((X.shape[0],num_class))
stack_te = np.zeros((X_te.shape[0],num_class))

for i,(tr,va) in enumerate(KFold(n_splits=n).split(X)):
    print('%s stack:%d/%d'%(str(datetime.now()),i+1,n))
    clf = GradientBoostingClassifier(n_estimators=2000)
    clf.fit(X[tr],y[tr])
    y_pred_va = clf.predict_proba(X[va])
    y_pred_te = clf.predict_proba(X_te)
    print('va acc:',myAcc(y[va],y_pred_va))
    print('te acc:',myAcc(y_te,y_pred_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
stack_te /= n
stack_all = np.vstack([stack,stack_te])
for i in range(stack_all.shape[1]):
    df_stack['tfidf_gbdt_{}'.format(i)] = stack_all[:,i]
df_stack.to_csv('./data/' + 'tfidf_gbdt.csv',index=None,encoding='utf8')
print(datetime.now(),'save nb stack done!')

2018-09-26 22:34:12.305507 stack:1/5
va acc: 0.86375
te acc: 0.864
2018-09-26 22:46:45.367303 stack:2/5
va acc: 0.868
te acc: 0.8646
2018-09-26 22:59:23.062333 stack:3/5
va acc: 0.85425
te acc: 0.8608
2018-09-26 23:11:49.302414 stack:4/5
va acc: 0.85475
te acc: 0.8594
2018-09-26 23:24:18.212878 stack:5/5
va acc: 0.86725
te acc: 0.8624
2018-09-26 23:36:51.409984 save nb stack done!


In [148]:
import xgboost as xgb

def xgb_acc_score(preds,dtrain):
    y_true = dtrain.get_label()
    y_pred = np.argmax(preds,axis=1)
    return [('acc',np.mean(y_true == y_pred))]

df_lr = pd.read_csv('./data/' + 'tfidf_lr.csv')
df_nb = pd.read_csv('./data/' + 'tfidf_nb.csv')
df_gbdt = pd.read_csv('./data/' + 'tfidf_gbdt.csv')

df = pd.concat([df_lr,df_nb,df_gbdt], axis=1)
print(df.columns)

lable = np.array(data_train['sentiment'])
num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
seed = 10

X = df.iloc[:TR]
y = lable[:TR]
X_te = df.iloc[TR:]
y_te = lable[TR:]


esr = 25
evals = 1
n_trees = 1000

ss = 0.5
mc = 0.8
md = 7
gm = 1
# n_trees = 25

params = {
    "objective": "multi:softprob",
    "booster": "gbtree",
    # "eval_metric": "merror",
    "num_class":num_class,
    'max_depth':md,
    'min_child_weight':mc,
    'subsample':ss,
    'colsample_bytree':1,
    'gamma':gm,
    "eta": 0.01,
    "lambda":0,
    'alpha':0,
    "silent": 1,
    # 'seed':seed,
}

dtrain = xgb.DMatrix(X, y)
dvalid = xgb.DMatrix(X_te, y_te)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(params, dtrain, n_trees, evals=watchlist, feval=xgb_acc_score, maximize=True,
                early_stopping_rounds=esr, verbose_eval=evals)

Index(['tfidf_lr_0', 'tfidf_lr_1', 'tfidf_nb_0', 'tfidf_nb_1', 'tfidf_gbdt_0',
       'tfidf_gbdt_1'],
      dtype='object')
[0]	train-merror:0.10065	eval-merror:0.1156	train-acc:0.89935	eval-acc:0.8844
Multiple eval metrics have been passed: 'eval-acc' will be used for early stopping.

Will train until eval-acc hasn't improved in 25 rounds.
[1]	train-merror:0.09905	eval-merror:0.1128	train-acc:0.90095	eval-acc:0.8872
[2]	train-merror:0.0977	eval-merror:0.113	train-acc:0.9023	eval-acc:0.887
[3]	train-merror:0.0982	eval-merror:0.112	train-acc:0.9018	eval-acc:0.888
[4]	train-merror:0.09775	eval-merror:0.112	train-acc:0.90225	eval-acc:0.888
[5]	train-merror:0.0975	eval-merror:0.1108	train-acc:0.9025	eval-acc:0.8892
[6]	train-merror:0.0974	eval-merror:0.1126	train-acc:0.9026	eval-acc:0.8874
[7]	train-merror:0.0975	eval-merror:0.1114	train-acc:0.9025	eval-acc:0.8886
[8]	train-merror:0.09795	eval-merror:0.1114	train-acc:0.90205	eval-acc:0.8886
[9]	train-merror:0.09765	eval-merror:0.1114	trai

In [63]:
import xgboost as xgb

def xgb_acc_score(preds,dtrain):
    y_true = dtrain.get_label()
    y_pred = np.argmax(preds,axis=1)
    return [('acc',np.mean(y_true == y_pred))]

df_lr = pd.read_csv('./data/' + 'tfidf_lr.csv')
df_nb = pd.read_csv('./data/' + 'tfidf_lr1.csv')
df_gbdt = pd.read_csv('./data/' + 'tfidf_lr2.csv')

df = pd.concat([df_lr,df_nb,df_gbdt], axis=1)
print(df.columns)

lable = np.array(data_train['sentiment'])
num_class = len(pd.value_counts(data_train['sentiment']))
TR = 20000
seed = 10

X = df.iloc[:TR]
y = lable[:TR]
X_te = df.iloc[TR:]
y_te = lable[TR:]


esr = 25
evals = 1
n_trees = 1000

ss = 0.5
mc = 0.8
md = 7
gm = 1
# n_trees = 25

params = {
    "objective": "multi:softprob",
    "booster": "gbtree",
    # "eval_metric": "merror",
    "num_class":num_class,
    'max_depth':md,
    'min_child_weight':mc,
    'subsample':ss,
    'colsample_bytree':1,
    'gamma':gm,
    "eta": 0.01,
    "lambda":0,
    'alpha':0,
    "silent": 1,
    # 'seed':seed,
}

dtrain = xgb.DMatrix(X, y)
dvalid = xgb.DMatrix(X_te, y_te)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(params, dtrain, n_trees, evals=watchlist, feval=xgb_acc_score, maximize=True,
                early_stopping_rounds=esr, verbose_eval=evals)

Index(['tfidf_lr_0', 'tfidf_lr_1', 'tfidf_lr1_0', 'tfidf_lr1_1', 'tfidf_lr2_0',
       'tfidf_lr2_1'],
      dtype='object')
[0]	train-merror:0.10205	eval-merror:0.1142	train-acc:0.89795	eval-acc:0.8858
Multiple eval metrics have been passed: 'eval-acc' will be used for early stopping.

Will train until eval-acc hasn't improved in 25 rounds.
[1]	train-merror:0.10165	eval-merror:0.1116	train-acc:0.89835	eval-acc:0.8884
[2]	train-merror:0.102	eval-merror:0.1136	train-acc:0.898	eval-acc:0.8864
[3]	train-merror:0.10125	eval-merror:0.1126	train-acc:0.89875	eval-acc:0.8874
[4]	train-merror:0.1008	eval-merror:0.1126	train-acc:0.8992	eval-acc:0.8874
[5]	train-merror:0.10075	eval-merror:0.1128	train-acc:0.89925	eval-acc:0.8872
[6]	train-merror:0.10055	eval-merror:0.1136	train-acc:0.89945	eval-acc:0.8864
[7]	train-merror:0.1002	eval-merror:0.114	train-acc:0.8998	eval-acc:0.886
[8]	train-merror:0.0997	eval-merror:0.1134	train-acc:0.9003	eval-acc:0.8866
[9]	train-merror:0.09995	eval-merror:0.1138	