In [1]:

import os
import shelve
import time

import numpy as np
from mlxtend.classifier import StackingClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Reference timestamp taken right after the imports; the name `start` is read
# by the "Total Time" print in the last cell.
start = time.time()

In [2]:
# Load the train/test splits from the shelve database and time the load.
load_start = time.time()
# Open read-only inside a context manager: the handle is closed as soon as the
# three entries are read, and a missing database raises instead of silently
# creating an empty one.
with shelve.open('../data/sample', flag='r') as shelve_file:
    X_train = shelve_file['X_train']
    y_train = shelve_file['y_train']
    X_test = shelve_file['X_test']
print('Time consumption on loading:',time.time()-load_start)

# TF-IDF over 1- to 4-grams. The boolean options are passed as real bools
# (the 0/1 ints used before can be rejected by the stricter parameter
# validation in newer scikit-learn releases).
tfidf = TfidfVectorizer(ngram_range=(1,4), min_df=3, max_df=0.9, use_idf=True, smooth_idf=True, sublinear_tf=True)
# fit_transform learns the vocabulary and vectorizes the training documents in
# one pass instead of analyzing the corpus twice (fit, then transform).
X_train = tfidf.fit_transform(X_train)
# The test set is only transformed, using the vocabulary/IDF learned above.
X_test = tfidf.transform(X_test)

Time consumption on loading: 1.3846826553344727


In [3]:
# Build the base learners and the stacking ensemble, fit the ensemble on the
# training matrix, and report metrics on that same training data.
#
# The timer has its own name: reusing `start` here would overwrite the
# timestamp taken in the first cell, and the "Total Time" print in the last
# cell would then leave out data loading and vectorization.
fit_start = time.time()

# Fixed seed so the randomized learners give repeatable results across runs.
SEED = 42

# Candidate models; only the ones bound to clf1..clf3 below are stacked.
sgd = SGDClassifier(loss="modified_huber", alpha=0.00002, n_jobs=-1, penalty='l2', random_state=SEED)
lsvc = LinearSVC(C=0.8, loss='squared_hinge', intercept_scaling=1, tol=0.0002, multi_class='ovr')
rfc = RandomForestClassifier(random_state=SEED)
knn = KNeighborsClassifier()
svc = SVC(kernel='linear', C=0.8, probability=True)
mnb = MultinomialNB(alpha=0.001)
lr = LogisticRegression(penalty='l2', C=10)
dtc = DecisionTreeClassifier(random_state=SEED)
gdbt = GradientBoostingClassifier(random_state=SEED)

clf1 = sgd
clf2 = lr
clf3 = rfc

# `lr` is used both as a base learner and as the meta-classifier.
# NOTE(review): presumably mlxtend clones the estimators before fitting, so the
# two roles do not share fitted state - confirm for the installed version.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True, verbose=1)
sclf.fit(X_train, y_train)

# These predictions are made on the data the model was just fitted on, so the
# report below describes training fit, not generalization.
prediction = sclf.predict(X_train)
print(metrics.classification_report(y_train, prediction))  # classification report
print(metrics.confusion_matrix(y_train, prediction))  # confusion matrix

# Optional per-model comparison (3-fold macro-F1 for each base learner and the
# stack). Off by default; set the flag to True to run it.
COMPARE_BASE_MODELS = False
if COMPARE_BASE_MODELS:
    for clf, label in zip([clf1, clf2, clf3, sclf],
                          ['SGD', 'LR', 'RFC', 'StackingClassifier']):
        scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='f1_macro')
        print("f1_score: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))

print('time comsumption is',time.time()-fit_start)



Fitting 3 classifiers...
Fitting classifier1: sgdclassifier (1/3)
Fitting classifier2: logisticregression (2/3)
Fitting classifier3: multinomialnb (3/3)
             precision    recall  f1-score   support

          1       0.98      0.98      0.98      5341
         10       0.99      0.99      0.99      4927
         11       0.99      0.99      0.99      3514
         12       0.99      0.99      0.99      5242
         13       0.99      0.99      0.99      7818
         14       0.99      0.99      0.99      6693
         15       1.00      1.00      1.00      7082
         16       0.99      0.98      0.99      2981
         17       0.98      0.99      0.99      3058
         18       1.00      1.00      1.00      6970
         19       0.98      0.99      0.98      5483
          2       0.99      0.99      0.99      2889
          3       1.00      1.00      1.00      8217
          4       0.99      0.99      0.99      3751
          5       0.99      0.98      0.99      233

In [4]:
# Cross-validation: 3-fold macro-F1 of the stacking ensemble on the training data.
vali_start = time.time()
scores = cross_val_score(
    estimator=sclf,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=3,
)
# Per-fold scores.
print(scores)
# Mean score, followed by twice the standard deviation across folds.
print(scores.mean(), '+/-', 2 * scores.std())
print("Time consumption on cv", time.time() - vali_start)

Fitting 3 classifiers...
Fitting classifier1: sgdclassifier (1/3)
Fitting classifier2: logisticregression (2/3)
Fitting classifier3: multinomialnb (3/3)
Fitting 3 classifiers...
Fitting classifier1: sgdclassifier (1/3)
Fitting classifier2: logisticregression (2/3)
Fitting classifier3: multinomialnb (3/3)
Fitting 3 classifiers...
Fitting classifier1: sgdclassifier (1/3)
Fitting classifier2: logisticregression (2/3)
Fitting classifier3: multinomialnb (3/3)
[ 0.76148277  0.7617295   0.76030189]
0.761171385819 +/- 0.0012460435034
Time consumption on cv 3966.495723247528


In [5]:

# Persist the fitted stacking model with joblib (compression level 3) and
# report how long the save took.
MODEL_PATH = 'model/master.m'
# joblib.dump writes straight to the given path; create the target directory
# first so a missing `model/` folder cannot fail the save after a long fit.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

save_start = time.time()
joblib.dump(sclf, MODEL_PATH, compress=3)  # save the model
print("Time consumption on save",time.time( ) - save_start)


Time consumption on save 107.34028768539429


In [6]:
# Predict labels for the test matrix and time the prediction step.
test_start = time.time()
prediction = sclf.predict(X_test)
print('Time comsumption on test',time.time()-test_start)

# Save the results as "id,class" rows, where id is the row position of the
# sample in X_test. The context manager closes the file even if a write fails.
with open('stacking.csv', 'w') as f_out:
    f_out.write("id,class"+"\n")
    for i, label in enumerate(prediction):
        f_out.write(str(i)+","+str(label)+'\n')

# Elapsed time since the `start` timestamp set earlier in the notebook.
print('Total Time comsumption:',time.time()-start)

Time comsumption on test 22.84069037437439
Total Time comsumption: 6257.039233207703
