## 1. 导入库

In [31]:
import sklearn

In [32]:
sklearn.__version__

'0.19.1'

In [33]:
# https://blog.csdn.net/xiaosa_kun/article/details/84868437

In [34]:
import sys
sys.path.append("D:/ML_Study/2017-CCF-BDCI-AIJudge")
import pandas as pd
from config.db_config import Config
import numpy as np
from utils import LOGGER

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import codecs

## 2.加载数据

In [35]:
config = Config()
config.data_csv_path = "D:/ML_Study/2017-CCF-BDCI-AIJudge/data/output/corpus/data.csv"

In [36]:
train_df = pd.read_csv(config.data_csv_path,encoding='utf-8',sep=',')
train_df = train_df.dropna()
train_df['penalty'] = train_df['penalty'].astype(int)
train_df.head()

Unnamed: 0,id,content,laws,penalty
0,16,公诉 机关 霍邱县 人民检察院 被告人 许某 1975 日生 2012 因涉嫌 危险 驾驶 ...,1337273,3
1,32,公诉 机关 海口市 龙华区 人民检察院 被告人 王某 海口市 龙华区 人民检察院 海龙 检公...,347675264,1
2,41,公诉 机关 广东省 潮州市 人民检察院 被告人 覃学彬 1980 出生 广西壮族自治区 大新...,2632552535556,5
3,57,公诉 机关 榆林市 榆阳区 人民检察院 上诉人 原审 被告人 2012 因涉嫌 盗窃罪 榆林...,2645253677273,5
4,60,公诉 机关 榆阳区 人民检察院 上诉人 原审 被告人 刘某 汉族 陕西省 横山县 小学文化 ...,2242526275272,7


In [37]:
train_df.shape

(200, 4)

## 查看训练集每个分类的名字以及样本数量

In [38]:
# 类别：数据不均衡，需要进行采样 to do
for name,group in train_df.groupby(by='penalty'):
    print(name,len(group))

1 36
2 50
3 18
4 14
5 16
6 12
7 48
8 6


In [39]:
words_list=[]

In [40]:
for document in train_df['content']:
    words_list.append(document.split())

In [41]:
print(words_list[0:2])

[['公诉', '机关', '霍邱县', '人民检察院', '被告人', '许某', '1975', '日生', '2012', '因涉嫌', '危险', '驾驶', '霍邱县', '公安局', '取保候审', '2013', '日经', '本院', '取保候审', '霍邱县', '人民检察院', '以霍检', '刑诉', '2013', '起诉书', '指控', '被告人', '许某', '甲犯', '危险', '驾驶', '2013', '日向', '本院', '提起公诉', '本院', '依法', '简易程序', '实行', '独任', '审判', '2013', '公开', '开庭审理', '本案', '霍邱县', '人民检察院', '检察员', '胡涛', '被告人', '许某', '到庭', '参加', '诉讼', '现已', '审理', '终结', '霍邱县', '人民检察院', '指控', '2012', '被告人', '许某', '酒后', '驾驶', '二轮', '摩托车', '沿霍寿路', '由南向北', '行驶', '霍寿路', '公园路', '交叉口', '路边', '行人', '相撞', '公安民警', '查获', '六安市', '疾病', '预防', '控制中心', '鉴定', '许某', '血液', '乙醇', '含量', '169.64', 'mg', '100ml', '上述事实', '被告人', '开庭审理', '过程', '无异议', '被害人', '杨正响', '陈述', '证人', '李某', '证言', '六安市', '疾病', '预防', '控制中心', '检验', '报告', '六安市', '疾控交', '检字', '2012', '155', '霍邱县', '公安局', '交通管理', '大队', '呼吸', '酒精', '检测', '抽取', '当事人', '血样', '登记表', '驾驶', '信息', '查询', '道路', '交通事故', '赔偿', '调解', '协议书', '经济', '赔偿', '凭证', '谅解', '被告人', '户籍', '信息', '证据', '证明', '足以认定'], ['公诉', '机关', '海口市', '龙华区', '人民检察院', '被告人', '王某', '海口市

## 3. 特征工程
TfidfVectorizer模型

In [42]:
#调用sklearn.feature_extraction.text库的TfidfVectorizer方法实例化模型对象。
#TfidfVectorizer方法需要4个参数。
#第1个参数是分词结果，数据类型为列表，其中的元素也为列表；
#第2个关键字参数stop_words是停顿词，数据类型为列表；
#第3个关键字参数min_df是词频低于此值则忽略，数据类型为int或float;
#第4个关键字参数max_df是词频高于此值则忽略，数据类型为Int或float。
#查看TfidfVectorizer方法的更多参数用法，
#官方文档链接：http://sklearn.apachecn.org/cn/0.19.0/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [43]:
tfidf=TfidfVectorizer(words_list,min_df=3,max_df=0.95)

In [44]:
#第1行代码查看向量化的维数，即特征的维数；
#第2行代码调用TfidfVectorizer对象的fit_transform方法获得特征矩阵赋值给X；
#第3行代码查看特征矩阵的形状。
X=tfidf.fit_transform(train_df['content'])
print('词表大小:', len(tfidf.vocabulary_))
print(X.shape)

词表大小: 1343
(200, 1343)


In [45]:
#tfidf.vocabulary_

In [46]:
X.toarray()[0:2]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.26373434, 0.        ,
        0.14269755]])

## 4. 模型训练

### 4.1 标签编码

In [47]:
np.unique(train_df['penalty'])

array([1, 2, 3, 4, 5, 6, 7, 8])

In [48]:
#调用sklearn.preprocessing库的LabelEncoder方法对文章分类做标签编码。
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['penalty'])

In [49]:
y.shape

(200,)

In [50]:
y

array([2, 0, 4, 4, 6, 0, 6, 1, 1, 1, 3, 7, 1, 7, 4, 1, 5, 0, 6, 6, 6, 1,
       6, 2, 3, 1, 1, 6, 7, 1, 2, 6, 0, 6, 1, 3, 1, 6, 0, 0, 1, 1, 6, 2,
       6, 3, 1, 2, 5, 6, 0, 1, 6, 4, 1, 6, 1, 1, 0, 0, 1, 1, 3, 5, 1, 4,
       1, 0, 5, 2, 0, 1, 0, 2, 4, 0, 5, 4, 6, 6, 5, 6, 6, 0, 4, 2, 1, 6,
       6, 3, 2, 1, 6, 0, 0, 6, 6, 0, 3, 0, 2, 0, 4, 4, 6, 0, 6, 1, 1, 1,
       3, 7, 1, 7, 4, 1, 5, 0, 6, 6, 6, 1, 6, 2, 3, 1, 1, 6, 7, 1, 2, 6,
       0, 6, 1, 3, 1, 6, 0, 0, 1, 1, 6, 2, 6, 3, 1, 2, 5, 6, 0, 1, 6, 4,
       1, 6, 1, 1, 0, 0, 1, 1, 3, 5, 1, 4, 1, 0, 5, 2, 0, 1, 0, 2, 4, 0,
       5, 4, 6, 6, 5, 6, 6, 0, 4, 2, 1, 6, 6, 3, 2, 1, 6, 0, 0, 6, 6, 0,
       3, 0], dtype=int64)

### 4.2 逻辑回归模型
调用sklearn.linear_model库的LogisticRegression方法实例化模型对象。  <br>
调用sklearn.model_selection库的train_test_split方法划分训练集和测试集。

In [51]:
def micro_avg_f1(y_true, y_pred):
    '''
    分类 评估函数
    :param y_true: 样本实际类别
    :param y_pred: 预测类别
    :return:
    '''
    return f1_score(y_true, y_pred, average='micro')

In [52]:
X_train,X_test,y_train,y_test= train_test_split(X, y, test_size=0.3)

In [53]:
X_train.shape

(140, 1343)

发现X_train 中在小量的数据集合中1343 维度，当数据比较大的时候，维度会极大增长，可以通过LR等模型进行特征抽取并特征融合，最终不同模型抽取
的特征进行xgboost模型训练，可以起到很好的效果

In [54]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
lr.fit(X_train, y_train)
y_pred = lr.predict_proba(X_test)
y_pred_score = micro_avg_f1(y_test,lr.predict(X_test))
y_pred_score

0.6166666666666667

In [55]:
y_pred

array([[0.25172941, 0.1163091 , 0.0441257 , 0.03587257, 0.04831882,
        0.04172943, 0.42599942, 0.03591555],
       [0.1112396 , 0.4791083 , 0.05380893, 0.0727586 , 0.06158119,
        0.05658974, 0.12834826, 0.03656538],
       [0.07822829, 0.55665559, 0.17920372, 0.03018259, 0.04340848,
        0.02989947, 0.05318464, 0.02923722],
       [0.20160887, 0.18495437, 0.07713558, 0.05319286, 0.15353556,
        0.06819826, 0.21529086, 0.04608364],
       [0.11231275, 0.12139835, 0.06456061, 0.04548143, 0.05955594,
        0.07963159, 0.45002465, 0.06703468],
       [0.17124143, 0.1770872 , 0.07022411, 0.04965321, 0.167869  ,
        0.07659912, 0.24256407, 0.04476186],
       [0.07473505, 0.57600465, 0.15502355, 0.03104757, 0.04422462,
        0.03090738, 0.05836906, 0.02968811],
       [0.16511812, 0.29619346, 0.06178228, 0.04482196, 0.06622193,
        0.09717071, 0.22517938, 0.04351216],
       [0.08905819, 0.52533818, 0.17355755, 0.03240536, 0.05166649,
        0.03301305, 0.063204

In [56]:
lr.score(X_test,y_test)

0.6166666666666667

### 4. 3 交叉验证

In [57]:
skf = StratifiedKFold(n_splits=2, random_state=config.seed)

In [58]:
for train_index, test_index in skf.split(X_train, y_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_data,val_data=X_train[train_index],X_train[test_index]
    train_y,val_y=y_train[train_index],y_train[test_index]
    print("Train_dataset...")
    print(train_data.toarray(),train_y)
    print("Validate_dataset...")
    print(val_data.toarray(),val_y)
    lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    lr.fit(train_data, train_y)
    y_pred_val=lr.predict(val_data)
    print("y_true y_pred.....")
    print(val_y,y_pred_val)
    y_pred_score = micro_avg_f1(val_y,y_pred_val)
    y_pred_score
    print("y_pred_score={}".format(y_pred_score))

TRAIN: [ 58  62  66  71  72  73  74  75  76  78  79  80  81  82  84  85  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139] TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 59 60 61 63 64 65 67 68 69 70 77 83 86]
Train_dataset...
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [0 2 2 1 0 3 1 0 2 4 0 3 2 1 0 1 4 1 0 1 1 5 6 0 0 5 0 0 1 6 0 6 6 6 3 6 6
 7 5 6 0 6 1 6 1 1 6 0 1 3 4 1 1 6 7 5 1 0 2 7 2 6 4 0 3 6 1 4 1]
Validate_dataset...
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

## 4.4 基于模型的特征提取

tf-idf 提供的原始样本后，然后通过LR。。。等模型提取 特征。  通过多个模型提取特征，进行特征融合，然后再进行训练

In [59]:
num_class = len(np.unique(train_df['penalty']))
num_class

8

In [60]:
# 初始化stack ： 通过模型提取特征矩阵的数值
stack=np.zeros((X_train.shape[0],num_class))
stack_te = np.zeros((X_test.shape[0], num_class))
stack

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [61]:
k=2
score_va = 0
score_te = 0
skf = StratifiedKFold(n_splits=k, random_state=config.seed)
for train_index, val_index in skf.split(X_train, y_train):
    #print("TRAIN:", train_index, "TEST:", test_index)
    train_data,val_data=X_train[train_index],X_train[val_index]
    train_y,val_y=y_train[train_index],y_train[val_index]
    lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    lr.fit(train_data, train_y)
    y_pred_val=lr.predict(val_data)
    val_accuracy = micro_avg_f1(val_y,y_pred_val)
    test_accuacy = micro_avg_f1(y_test,lr.predict(X_test))
    LOGGER.log('val_dataset accuacy:%f' % val_accuracy)
    LOGGER.log('test_dataset accuacy:%f' % test_accuacy)
    score_va += val_accuracy
    score_te += test_accuacy
    print("extract feature based lr.")
    y_pred_val_prob = lr.predict_proba(val_data)
    y_pred_te_prob = lr.predict_proba(X_test)
    stack[val_index]+=y_pred_val_prob
    stack_te+=y_pred_te_prob
score_va /= k
score_te /= k
LOGGER.log('val_dataset avg acc:%f' % score_va)
LOGGER.log('test_dataset avg acc:%f' % score_te)

2019-03-25 17:29:34 val_dataset accuacy:0.450704
2019-03-25 17:29:34 test_dataset accuacy:0.533333
extract feature based lr.
2019-03-25 17:29:34 val_dataset accuacy:0.565217
2019-03-25 17:29:34 test_dataset accuacy:0.650000
extract feature based lr.
2019-03-25 17:29:34 val_dataset avg acc:0.507961
2019-03-25 17:29:34 test_dataset avg acc:0.591667


In [62]:
stack_all = np.vstack([stack/k,stack_te/k])
print(stack_all.shape,stack_all.shape[1])
stack_all[0:4]

(200, 8) 8


array([[0.13563476, 0.13719248, 0.030327  , 0.0253883 , 0.05564552,
        0.02432604, 0.06717233, 0.02431357],
       [0.04900977, 0.12568846, 0.08445302, 0.05948503, 0.03772673,
        0.02368232, 0.10294757, 0.01700711],
       [0.09295997, 0.18761264, 0.02843153, 0.02509351, 0.03274937,
        0.03849071, 0.07668153, 0.01798074],
       [0.04623509, 0.15607241, 0.08523052, 0.07463287, 0.04312961,
        0.01720253, 0.06239845, 0.01509852]])

In [63]:
train_df.shape

(200, 4)

In [64]:
df_stack = pd.DataFrame(index=range(len(train_df)))

In [65]:
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:, i]

In [66]:
df_stack.head(10)

Unnamed: 0,tfidf_lr_0,tfidf_lr_1,tfidf_lr_2,tfidf_lr_3,tfidf_lr_4,tfidf_lr_5,tfidf_lr_6,tfidf_lr_7
0,0.135635,0.137192,0.030327,0.025388,0.055646,0.024326,0.067172,0.024314
1,0.04901,0.125688,0.084453,0.059485,0.037727,0.023682,0.102948,0.017007
2,0.09296,0.187613,0.028432,0.025094,0.032749,0.038491,0.076682,0.017981
3,0.046235,0.156072,0.085231,0.074633,0.04313,0.017203,0.062398,0.015099
4,0.111274,0.111634,0.045155,0.029082,0.037598,0.039935,0.104618,0.020703
5,0.259456,0.070825,0.027218,0.019684,0.024603,0.024704,0.058181,0.015329
6,0.12958,0.149111,0.0305,0.026943,0.032482,0.030487,0.082044,0.018853
7,0.135493,0.122287,0.028288,0.024525,0.038407,0.031539,0.100964,0.018497
8,0.058786,0.253379,0.049045,0.022862,0.025377,0.020135,0.049474,0.020943
9,0.094105,0.071807,0.026306,0.022724,0.033885,0.032913,0.200135,0.018126


In [67]:
# 保持lr-提取的特征
config.feat_tfidf_lr_prob = 'D:/ML_Study/2017-CCF-BDCI-AIJudge/data/output/feature/tfidf/lr_prob_12w.csv'
df_stack.to_csv(config.feat_tfidf_lr_prob, index=None, encoding='utf8')

## 4.4 基于LR模型的特征提取

In [68]:
#调用pickle库的dump方法保存模型，需要2个参数。
#第1个参数是保存的对象，可以为任意数据类型，因为有3个模型需要保存，所以下面代码第1个参数是字典。
#第2个参数是保存的文件对象，数据类型为_io.BufferedWriter

### 5. 特征提取-stack

In [69]:
# 1. 获取类别-个数=新的特征维度
num_class = len(np.unique(train_df['penalty']))
print("num_class={0}".format(num_class))
# 2.初始化stack ： 通过模型提取特征矩阵的数值
print("dataset size X_train={0},X_test={1},dim={2}".format(X_train.shape[0],X_test.shape[0],X_test.shape[1]))
stack=np.zeros((X_train.shape[0],num_class))
stack_te = np.zeros((X_test.shape[0], num_class))

num_class=8
dataset size X_train=140,X_test=60,dim=1343


### 5.1 lr 特征提取

In [70]:
# 1. 交叉验证- 特征得分平均
k=2 
print("cross_validate k={0}".format(k))
score_va = 0
score_te = 0
skf = StratifiedKFold(n_splits=k, random_state=config.seed)
for i,(train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    print("cross_validate.....i={0}".format(i+1))
    train_data,val_data=X_train[train_index],X_train[val_index]
    train_y,val_y=y_train[train_index],y_train[val_index]
    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    clf.fit(train_data, train_y)
    y_pred_val=clf.predict(val_data)
    val_accuracy = micro_avg_f1(val_y,y_pred_val)
    test_accuacy = micro_avg_f1(y_test,clf.predict(X_test))
    LOGGER.log('val_dataset accuacy:%f' % val_accuracy)
    LOGGER.log('test_dataset accuacy:%f' % test_accuacy)
    score_va += val_accuracy
    score_te += test_accuacy
    y_pred_val_prob = clf.predict_proba(val_data)
    y_pred_te_prob = clf.predict_proba(X_test)
    stack[val_index]+=y_pred_val_prob
    stack_te+=y_pred_te_prob
score_va /= k
score_te /= k
print("cross_validate success. print avg acc ...")
LOGGER.log('val_dataset avg acc:%f' % score_va)
LOGGER.log('test_dataset avg acc:%f' % score_te)
# 2. lr... 提取的特征存储
stack_all = np.vstack([stack/k,stack_te/k])
df_stack = pd.DataFrame(index=range(len(train_df)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:, i]

config.feat_tfidf_lr_prob = 'D:/ML_Study/2017-CCF-BDCI-AIJudge/data/output/feature/tfidf/lr_prob_12w.csv'
df_stack.to_csv(config.feat_tfidf_lr_prob, index=None, encoding='utf8')
df_stack.head()

cross_validate k=2
cross_validate.....i=1
2019-03-25 17:29:41 val_dataset accuacy:0.450704
2019-03-25 17:29:41 test_dataset accuacy:0.533333
cross_validate.....i=2
2019-03-25 17:29:41 val_dataset accuacy:0.565217
2019-03-25 17:29:41 test_dataset accuacy:0.650000
cross_validate success. print avg acc ...
2019-03-25 17:29:41 val_dataset avg acc:0.507961
2019-03-25 17:29:41 test_dataset avg acc:0.591667


Unnamed: 0,tfidf_lr_0,tfidf_lr_1,tfidf_lr_2,tfidf_lr_3,tfidf_lr_4,tfidf_lr_5,tfidf_lr_6,tfidf_lr_7
0,0.135635,0.137192,0.030327,0.025388,0.055646,0.024326,0.067172,0.024314
1,0.04901,0.125688,0.084453,0.059485,0.037727,0.023682,0.102948,0.017007
2,0.09296,0.187613,0.028432,0.025094,0.032749,0.038491,0.076682,0.017981
3,0.046235,0.156072,0.085231,0.074633,0.04313,0.017203,0.062398,0.015099
4,0.111274,0.111634,0.045155,0.029082,0.037598,0.039935,0.104618,0.020703


### 5.2 BernoulliNB 特征提取

In [71]:
# 1. 交叉验证- 特征得分平均
k=5 
print("cross_validate k={0}".format(k))
score_va = 0
score_te = 0
skf = StratifiedKFold(n_splits=k, random_state=config.seed)
for i,(train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    print("cross_validate.....i={0}".format(i+1))
    train_data,val_data=X_train[train_index],X_train[val_index]
    train_y,val_y=y_train[train_index],y_train[val_index]
    clf = BernoulliNB()
    clf.fit(train_data, train_y)
    y_pred_val=clf.predict(val_data)
    val_accuracy = micro_avg_f1(val_y,y_pred_val)
    test_accuacy = micro_avg_f1(y_test,clf.predict(X_test))
    LOGGER.log('val_dataset accuacy:%f' % val_accuracy)
    LOGGER.log('test_dataset accuacy:%f' % test_accuacy)
    score_va += val_accuracy
    score_te += test_accuacy
    y_pred_val_prob = clf.predict_proba(val_data)
    y_pred_te_prob = clf.predict_proba(X_test)
    stack[val_index]+=y_pred_val_prob
    stack_te+=y_pred_te_prob
score_va /= k
score_te /= k
print("cross_validate success. print avg acc ...")
LOGGER.log('val_dataset avg acc:%f' % score_va)
LOGGER.log('test_dataset avg acc:%f' % score_te)
# 2. lr... 提取的特征存储
stack_all = np.vstack([stack/k,stack_te/k])
df_stack = pd.DataFrame(index=range(len(train_df)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_bnb_{}'.format(i)] = stack_all[:, i]

config.feat_tfidf_gnb_prob = 'D:/ML_Study/2017-CCF-BDCI-AIJudge/data/output/feature/tfidf/bnb_prob_12w.csv'
df_stack.to_csv(config.feat_tfidf_bnb_prob, index=None, encoding='utf8')
df_stack.head()

cross_validate k=5
cross_validate.....i=1
2019-03-25 17:29:58 val_dataset accuacy:0.466667
2019-03-25 17:29:58 test_dataset accuacy:0.600000
cross_validate.....i=2
2019-03-25 17:29:58 val_dataset accuacy:0.482759
2019-03-25 17:29:58 test_dataset accuacy:0.600000
cross_validate.....i=3
2019-03-25 17:29:58 val_dataset accuacy:0.535714
2019-03-25 17:29:58 test_dataset accuacy:0.566667
cross_validate.....i=4
2019-03-25 17:29:58 val_dataset accuacy:0.607143
2019-03-25 17:29:58 test_dataset accuacy:0.616667
cross_validate.....i=5
2019-03-25 17:29:58 val_dataset accuacy:0.520000
2019-03-25 17:29:58 test_dataset accuacy:0.550000
cross_validate success. print avg acc ...
2019-03-25 17:29:58 val_dataset avg acc:0.522456
2019-03-25 17:29:58 test_dataset avg acc:0.586667


AttributeError: 'Config' object has no attribute 'feat_tfidf_bnb_prob'

## 5.3  MultinomialNB 特征提取

In [181]:
# 1. 交叉验证- 特征得分平均
k=2 
print("cross_validate k={0}".format(k))
score_va = 0
score_te = 0
skf = StratifiedKFold(n_splits=k, random_state=config.seed)
for i,(train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    print("cross_validate.....i={0}".format(i+1))
    train_data,val_data=X_train[train_index],X_train[val_index]
    train_y,val_y=y_train[train_index],y_train[val_index]
    clf = MultinomialNB()
    clf.fit(train_data, train_y)
    y_pred_val=clf.predict(val_data)
    val_accuracy = micro_avg_f1(val_y,y_pred_val)
    test_accuacy = micro_avg_f1(y_test,clf.predict(X_test))
    LOGGER.log('val_dataset accuacy:%f' % val_accuracy)
    LOGGER.log('test_dataset accuacy:%f' % test_accuacy)
    score_va += val_accuracy
    score_te += test_accuacy
    y_pred_val_prob = clf.predict_proba(val_data)
    y_pred_te_prob = clf.predict_proba(X_test)
    stack[val_index]+=y_pred_val_prob
    stack_te+=y_pred_te_prob
score_va /= k
score_te /= k
print("cross_validate success. print avg acc ...")
LOGGER.log('val_dataset avg acc:%f' % score_va)
LOGGER.log('test_dataset avg acc:%f' % score_te)
# 2. lr... 提取的特征存储
stack_all = np.vstack([stack/k,stack_te/k])
df_stack = pd.DataFrame(index=range(len(train_df)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_mnb_{}'.format(i)] = stack_all[:, i]

config.feat_tfidf_mnb_prob = 'D:/ML_Study/2017-CCF-BDCI-AIJudge/data/output/feature/tfidf/mnb_prob_12w.csv'
df_stack.to_csv(config.feat_tfidf_mnb_prob, index=None, encoding='utf8')
df_stack.head()

cross_validate k=2
cross_validate.....i=1
2019-03-25 16:00:43 val_dataset accuacy:0.630137
2019-03-25 16:00:43 test_dataset accuacy:0.416667
cross_validate.....i=2
2019-03-25 16:00:43 val_dataset accuacy:0.597015
2019-03-25 16:00:43 test_dataset accuacy:0.383333
cross_validate success. print avg acc ...
2019-03-25 16:00:43 val_dataset avg acc:0.613576
2019-03-25 16:00:43 test_dataset avg acc:0.400000


Unnamed: 0,tfidf_mnb_0,tfidf_mnb_1,tfidf_mnb_2,tfidf_mnb_3,tfidf_mnb_4,tfidf_mnb_5,tfidf_mnb_6,tfidf_mnb_7
0,4.661676,0.686872,0.199022,0.098458,0.100374,0.066655,0.585861,0.101082
1,0.934575,2.129631,0.260996,0.183068,0.144609,0.098875,2.613859,0.134388
2,2.613455,1.747695,0.205373,0.136668,0.121855,0.097442,1.461603,0.11591
3,0.454932,0.75542,0.310109,0.129176,0.135088,0.08756,4.499,0.128714
4,0.414802,4.042746,0.625174,0.098356,0.086652,0.058404,1.089347,0.084519
