# 基于 TF-IDF 的 LR/BernoulliNB/MultinomialNB 算法抽取特征与训练

* 使用 TF-IDF 将文章转换为高维稀疏向量
* 对高维稀疏向量进行降维，使用 LR/BernoulliNB/MultinomialNB stack 方式抽取特征
* 针对组合特征，使用xgboost进行多分类训练

In [1]:
import os
import sys

# Project checkout that is added to the import path so the project-local
# imports below (`utils`, `config.db_config`) can be resolved. The default is
# the original author's location; set the AIJUDGE_ROOT environment variable to
# run the notebook from a different checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    "AIJUDGE_ROOT", "/Users/zhengwenjie/AI/work/ML_3/2017-CCF-BDCI-AIJudge"
)
# Guarded append: re-running this cell must not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from utils import LOGGER
from config.db_config import Config

import warnings

# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
# Project configuration object; later cells read CSV paths from its attributes.
config = Config()

## 1. 加载特征&标签

### 1.1 加载特征

In [2]:
# Read the three feature tables whose locations are taken from `config`
# (LR, BernoulliNB and MultinomialNB variants, going by the attribute names).
df_tfidf_lr, df_tfidf_bnb, df_tfidf_mnb = (
    pd.read_csv(csv_path)
    for csv_path in (
        config.feat_tfidf_lr_prob,
        config.feat_tfidf_bnb_prob,
        config.feat_tfidf_mnb_prob,
    )
)
# Print the three shapes on one line as a quick sanity check.
print(*(frame.shape for frame in (df_tfidf_lr, df_tfidf_bnb, df_tfidf_mnb)))

(100, 8) (100, 8) (100, 8)


In [3]:
# Preview the first two rows of the LR feature table.
df_tfidf_lr.iloc[:2]

Unnamed: 0,tfidf_lr_0,tfidf_lr_1,tfidf_lr_2,tfidf_lr_3,tfidf_lr_4,tfidf_lr_5,tfidf_lr_6,tfidf_lr_7
0,0.039598,0.036783,0.014341,0.003581,0.014108,0.010981,0.069892,0.010715
1,0.026864,0.02907,0.010598,0.002971,0.012282,0.00635,0.104891,0.006972


In [4]:
# Preview the first two rows of the BernoulliNB feature table.
df_tfidf_bnb.iloc[:2]

Unnamed: 0,tfidf_bnb_0,tfidf_bnb_1,tfidf_bnb_2,tfidf_bnb_3,tfidf_bnb_4,tfidf_bnb_5,tfidf_bnb_6,tfidf_bnb_7
0,3.798501e-18,2.231955e-21,2.697699e-33,9.408594e-79,1.874799e-36,1.055956e-37,0.2,3.589278e-47
1,5.750392e-22,9.352165e-21,2.369561e-37,5.416003e-83,3.29351e-40,2.992593e-56,0.2,1.064033e-55


In [5]:
# Preview the first two rows of the MultinomialNB feature table.
df_tfidf_mnb.iloc[:2]

Unnamed: 0,tfidf_mnb_0,tfidf_mnb_1,tfidf_mnb_2,tfidf_mnb_3,tfidf_mnb_4,tfidf_mnb_5,tfidf_mnb_6,tfidf_mnb_7
0,0.038015,0.037619,0.011851,0.002589,0.010582,0.007695,0.084521,0.007129
1,0.02601,0.029862,0.010599,0.002772,0.010975,0.005744,0.108052,0.005986


### 1.2 加载标签

In [24]:
# Load the dataset that carries the `penalty` label column.
df_data = pd.read_csv(config.data_csv_path)
# Row count is reused by the train/test split further down.
rows = len(df_data)
print(rows)
print(df_data.shape)
# Distinct label values before re-basing.
print(np.unique(df_data['penalty']))
# Shift labels down by one so they start at 0 (i.e. fall in [0, 8)).
df_data['penalty'] = df_data['penalty'].sub(1)
# Distinct label values after re-basing.
print(np.unique(df_data['penalty']))
df_data.head()

100
(100, 4)
[1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7]


Unnamed: 0,id,content,laws,penalty
0,16,公诉 机关 霍邱县 人民检察院 被告人 许某 1975 日生 2012 因涉嫌 危险 驾驶 ...,1337273,2
1,32,公诉 机关 海口市 龙华区 人民检察院 被告人 王某 海口市 龙华区 人民检察院 海龙 检公...,347675264,0
2,41,公诉 机关 广东省 潮州市 人民检察院 被告人 覃学彬 1980 出生 广西壮族自治区 大新...,2632552535556,4
3,57,公诉 机关 榆林市 榆阳区 人民检察院 上诉人 原审 被告人 2012 因涉嫌 盗窃罪 榆林...,2645253677273,4
4,60,公诉 机关 榆阳区 人民检察院 上诉人 原审 被告人 刘某 汉族 陕西省 横山县 小学文化 ...,2242526275272,6


### 1.3 特征组合

In [23]:
# Place the three feature tables side by side (column-wise concatenation,
# rows matched on the index) to form the combined feature matrix.
df_feat = pd.concat(
    (df_tfidf_lr, df_tfidf_bnb, df_tfidf_mnb),
    axis='columns',
)
print(df_feat.shape)
df_feat.head()

(100, 24)


Unnamed: 0,tfidf_lr_0,tfidf_lr_1,tfidf_lr_2,tfidf_lr_3,tfidf_lr_4,tfidf_lr_5,tfidf_lr_6,tfidf_lr_7,tfidf_bnb_0,tfidf_bnb_1,...,tfidf_bnb_6,tfidf_bnb_7,tfidf_mnb_0,tfidf_mnb_1,tfidf_mnb_2,tfidf_mnb_3,tfidf_mnb_4,tfidf_mnb_5,tfidf_mnb_6,tfidf_mnb_7
0,0.039598,0.036783,0.014341,0.003581,0.014108,0.010981,0.069892,0.010715,3.798501e-18,2.231955e-21,...,0.2,3.589278e-47,0.038015,0.037619,0.011851,0.002589,0.010582,0.007695,0.084521,0.007129
1,0.026864,0.02907,0.010598,0.002971,0.012282,0.00635,0.104891,0.006972,5.750392e-22,9.352165e-21,...,0.2,1.064033e-55,0.02601,0.029862,0.010599,0.002772,0.010975,0.005744,0.108052,0.005986
2,0.026383,0.053365,0.015289,0.005954,0.017155,0.007126,0.067509,0.007219,2.29175e-17,2.035139e-10,...,0.2,8.214956000000001e-67,0.022397,0.075922,0.010636,0.002758,0.010284,0.003593,0.070938,0.003472
3,0.020936,0.119039,0.018465,0.002682,0.009245,0.005295,0.018011,0.006327,4.229793e-39,0.2,...,9.99252e-53,1.923538e-96,0.011274,0.163518,0.008235,0.000863,0.00344,0.001747,0.008948,0.001975
4,0.044806,0.063714,0.012909,0.003594,0.014628,0.00834,0.04399,0.008018,0.1971429,0.002857002,...,7.897082e-08,3.5474539999999997e-57,0.044297,0.070741,0.012572,0.00323,0.012673,0.007278,0.042619,0.00659


## 2. 切分训练集和测试集数据

In [25]:
# Row index at which the data is cut: 70% of the rows, truncated to an int.
splits_size = int(0.7 * rows)
splits_size

70

In [None]:
# The original cell ended in a dangling `X,y = `, which is a SyntaxError.
# Completed here as: X = combined feature frame, y = zero-based penalty labels.
# Assumes df_feat and df_data are row-aligned (same order, default index) --
# TODO confirm against how the feature CSVs were produced.
X, y = df_feat, df_data['penalty']
assert len(X) == len(y), "feature and label row counts differ"

# Sequential, unshuffled split at `splits_size`: the leading rows are used for
# training and the remaining rows for testing.
X_train, X_test = X.iloc[:splits_size], X.iloc[splits_size:]
y_train, y_test = y.iloc[:splits_size], y.iloc[splits_size:]
assert len(X_train) + len(X_test) == len(X)
X_train.shape, X_test.shape

## 4. 模型训练