In [1]:
import jieba.posseg as pseg
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import codecs
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

In [3]:
with codecs.open(u'stopwords.txt','r','utf8') as f:
   stopwords=set([line.strip() for line in f])
   
raw_data=pd.read_excel('data_by_grade/middle_essay.xlsx')

target=raw_data.styles
data=raw_data.answer_clean.tolist()

In [14]:
target.value_counts()

写景     1700
状物     1700
写人     1700
散文     1700
叙事     1700
议论文    1700
Name: styles, dtype: int64

In [17]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': text.count(u'。')}
                for text in posts]

class TokenPosGenerator(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('Tokened', object), ('Pos', object),('raw_text',object)])
        for i, text in enumerate(posts):
            pos_list=[]
            word_list=[]
            for w,p in pseg.cut(text):
                word_list.append(w)
                pos_list.append(p)

            features['Tokened'][i] = ' '.join(word_list)
            features['Pos'][i] = pos_list
            features['raw_text'][i] = text
        return features


class PosTranVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, pos_list=None):
        if pos_list is not None:
            self.pos_list = pos_list
        else:
            self.pos_list = [u'a', u'ad', u'ag', u'an', u'b', u'c', u'd', u'df', u'dg', u'e', 
                             u'eng', u'f', u'g', u'h', u'i', u'j', u'k', u'l', u'm', u'mg', 
                             u'mq', u'n', u'ng', u'nr', u'nrfg', u'nrt', u'ns', u'nt', u'nz', 
                             u'o', u'p', u'q', u'r', u'rg', u'rr', u'rz', u's', u't', u'tg', 
                             u'u', u'ud', u'ug', u'uj', u'ul', u'uv', u'uz', u'v', u'vd', u'vg', 
                             u'vi', u'vn', u'vq', u'x', u'y', 'yg', u'z', u'zg']

        self.pos2id = dict((w, i) for i, w in enumerate(self.pos_list))


    def fit(self, x, y=None):
        return self
    
    
    def transform(self, posts):
        N = len(self.pos2id)
        self.pos_features_name = ['_'.join(w) for w in zip(np.repeat(self.pos_list, len(self.pos_list)).tolist(),
                                                           self.pos_list * len(self.pos_list))]
        pos_features = np.empty((len(posts),N**2),dtype=float)
        for i, poslst in enumerate(posts):
            pos_features[i] = self._pos_transition(poslst, N)
        return pos_features
    
    
    def get_feature_names(self):
        return self.pos_features_name
    
    
    def _pos_transition(self, pos, N):
        mat = np.zeros((N, N), dtype=float)
        for i in range(len(pos) - 1):
            index1 = self.pos2id.get(pos[i], None)
            index2 = self.pos2id.get(pos[i + 1], None)
            if index1 is not None and index2 is not None:
                mat[index1, index2] = + 1
        return (mat / np.sum(mat)).reshape((1, N ** 2))

#pos_stater=PosTranVectorizer()
#pos_stater.transform([poslst])
#pos_stater.get_feature_names()

class PosStatVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, pos_list=None):
        if pos_list is not None:
            self.pos_list = pos_list
        else:
            self.pos_list = [u'a', u'ad', u'ag', u'an', u'b', u'c', u'd', u'df', u'dg', u'e', 
                             u'eng', u'f', u'g', u'h', u'i', u'j', u'k', u'l', u'm', u'mg', 
                             u'mq', u'n', u'ng', u'nr', u'nrfg', u'nrt', u'ns', u'nt', u'nz', 
                             u'o', u'p', u'q', u'r', u'rg', u'rr', u'rz', u's', u't', u'tg', 
                             u'u', u'ud', u'ug', u'uj', u'ul', u'uv', u'uz', u'v', u'vd', u'vg', 
                             u'vi', u'vn', u'vq', u'x', u'y', 'yg', u'z', u'zg']
        self.pos2id = dict((w, i) for i, w in enumerate(self.pos_list))
    
    def fit(self, x, y=None):
        return self
        
    def transform(self, posts):
        N = len(self.pos2id)
        pos_features = np.zeros((len(posts),N),dtype=float)
        for i, poslst in enumerate(posts):
            for p in poslst:
                pos_features[i,self.pos2id[p]] += 1
                pos_features[i] /= pos_features[i].sum()  
        return pos_features
        
    def get_feature_names(self):
        return self.pos_list

class Sparse2Mat(BaseEstimator, TransformerMixin): 
    def fit(self, x, y=None):
        return self
    
    def transform(self,sparseMatrix):
        return sparseMatrix.toarray()

#pos_stater=PosStatVectorizer()
#pos_stater.transform([poslst])
#pos_stater.get_feature_names()

In [7]:
feature_union_pipeline = Pipeline([
    #preprocess
    ('textToken', TokenPosGenerator()),
    #feature union
    ('union', FeatureUnion(
        n_jobs=4,
        transformer_list = [
            #extract feature from raw_text
            ('text_stat',Pipeline([
                ('selector', ItemSelector(key = 'raw_text')),
                ('stats', TextStats()),
                ('vect', DictVectorizer())
            ])),
            #extract feature from Tokened
            ('token_tfidf',Pipeline([
                ('selector', ItemSelector(key = 'Tokened')),
                ('tfidf',TfidfVectorizer(max_df=0.90,min_df=5,max_features=1000,stop_words=stopwords))
            ])),
            #extract feature from Pos
            ('pos_tran',Pipeline([
                ('selector', ItemSelector(key = 'Pos')),
                ('pos', PosTranVectorizer())
            ])),
            ('pos_stat',Pipeline([
                ('selector', ItemSelector(key = 'Pos')),
                ('pos', PosStatVectorizer())
            ])),
            
        ]
    )),
    #convert a sparse matrix to array
    ('converter',Sparse2Mat()),
    #normalizatin
    ('scaler', StandardScaler()),
    #feature selection
    
    #classifier
])

In [8]:
feature_data=feature_union_pipeline.fit_transform(data)

In [9]:
unioner=feature_union_pipeline.named_steps['union']
F1 = unioner.transformer_list[0][1].named_steps['vect'].get_feature_names()
F2 = unioner.transformer_list[1][1].named_steps['tfidf'].get_feature_names()
F3 = unioner.transformer_list[2][1].named_steps['pos'].get_feature_names()
feature_names = F1+F2+F3

In [11]:
rf_pipeline = Pipeline([
    ('selector', SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
    ('classifier', RandomForestClassifier(n_estimators=500,oob_score=True))
]) 

X_train, X_test, y_train, y_test = train_test_split(feature_data, target,  
                                                    train_size=0.75,  
                                                    test_size=0.25) 

In [12]:
rf_pipeline.fit(X_train,y_train) 
rf_ret_proba=rf_pipeline.predict_proba(X_test)  
rf_ret=rf_pipeline.predict(X_test)     
             
metrics.accuracy_score(y_test, rf_ret)

rf_model=rf_pipeline.named_steps['classifier']
selet_model=rf_pipeline.named_steps['selector']  
selet_model.get_support()#保留的特征

conf_matrix=pd.DataFrame(metrics.confusion_matrix(y_test, rf_ret),index=rf_model.classes_,columns=rf_model.classes_)

In [13]:
metrics.accuracy_score(y_test, rf_ret)

0.57725490196078433

In [15]:
conf_matrix

Unnamed: 0,写人,写景,叙事,散文,状物,议论文
写人,270,12,67,24,24,54
写景,4,344,8,16,28,7
叙事,94,34,198,23,18,46
散文,39,92,43,104,38,117
状物,21,109,17,19,220,37
议论文,24,13,25,18,7,336


In [16]:
selet_model.get_support().sum()#保留的特征

2678

# 初中pipeline

In [None]:
middle_style_classification= Pipeline([
    #preprocess
    ('textToken', TokenPosGenerator()),
    #feature union
    ('union', FeatureUnion(
        n_jobs=4,
        transformer_list = [
            #extract feature from raw_text
            ('text_stat',Pipeline([
                ('selector', ItemSelector(key = 'raw_text')),
                ('stats', TextStats()),
                ('vect', DictVectorizer())
            ])),
            #extract feature from Tokened
            ('token_tfidf',Pipeline([
                ('selector', ItemSelector(key = 'Tokened')),
                ('tfidf',TfidfVectorizer(max_df=0.90,min_df=5,max_features=1000,stop_words=stopwords))
            ])),
            #extract feature from Pos
            ('pos_tran',Pipeline([
                ('selector', ItemSelector(key = 'Pos')),
                ('pos', PosTranVectorizer())
            ])),
            ('pos_stat',Pipeline([
                ('selector', ItemSelector(key = 'Pos')),
                ('pos', PosStatVectorizer())
            ])),
            
        ]
    )),
    #convert a sparse matrix to array
    ('converter',Sparse2Mat()),
    #normalizatin
    ('scaler', StandardScaler()),
    #feature selection
    ('selector', SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
    #classifier
    ('classifier', RandomForestClassifier(n_estimators=500,oob_score=True))
])

In [19]:
middle_style_classification.fit(data, target)

Pipeline(steps=[('textToken', TokenPosGenerator()), ('union', FeatureUnion(n_jobs=4,
       transformer_list=[('text_stat', Pipeline(steps=[('selector', ItemSelector(key='raw_text')), ('stats', TextStats()), ('vect', DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True))...imators=500, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False))])

In [21]:
rf_model=middle_style_classification.named_steps['classifier']
selet_model=middle_style_classification.named_steps['selector']  
selet_model.get_support().sum()#保留的特征

2927

In [23]:
pd.Series(rf_model.classes_)

0     写人
1     写景
2     叙事
3     散文
4     状物
5    议论文
dtype: object

In [25]:
middle_style_classification.predict_proba(data[:3])

array([[ 0.758,  0.014,  0.088,  0.04 ,  0.07 ,  0.03 ],
       [ 0.684,  0.01 ,  0.11 ,  0.076,  0.056,  0.064],
       [ 0.752,  0.016,  0.094,  0.036,  0.042,  0.06 ]])

In [26]:
from sklearn.externals import joblib
joblib.dump(middle_style_classification,'styleModel/middle_style_classification.model',compress=3)

['styleModel/middle_style_classification.model']