In [1]:
import sys

sys.path.append("D:/ML_Study/2017-CCF-BDCI-AIJudge")
import pandas as pd
from config.db_config import Config
import numpy as np
from utils import LOGGER

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:

def micro_avg_f1(y_true, y_pred):
    '''
    分类 评估函数
    :param y_true: 样本实际类别
    :param y_pred: 预测类别
    :return:
    '''
    return f1_score(y_true, y_pred, average='micro')


def load_data(data_path):
    '''
    加载数据
    :param data_path:
    :return:
    '''
    train_df = pd.read_csv(data_path, encoding='utf-8', sep=',')
    train_df = train_df.dropna()
    train_df['penalty'] = train_df['penalty'].astype(int)
    return train_df


def get_words_list(raw_documents):
    '''
        获取tfidf 数值化的输入数据
    :return: 数据类型为列表，其中的元素也为列表  [[word1 word2.....],[word11 word12....]]
    '''
    words_list = []
    for document in raw_documents:
        words_list.append(document.split())
    return words_list


def words_list2tfidf_feature(raw_documents):
    words_list = get_words_list(raw_documents)
    ## 1. TfidfVectorizer模型
    '''
    调用sklearn.feature_extraction.text库的TfidfVectorizer方法实例化模型对象。
    TfidfVectorizer方法需要4个参数。
    第1个参数是分词结果，数据类型为列表，其中的元素也为列表；
    第2个关键字参数stop_words是停顿词，数据类型为列表；
    第3个关键字参数min_df是词频低于此值则忽略，数据类型为int或float;
    第4个关键字参数max_df是词频高于此值则忽略，数据类型为Int或float。
    查看TfidfVectorizer方法的更多参数用法，
    官方文档链接：http://sklearn.apachecn.org/cn/0.19.0/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    '''
    tfidf = TfidfVectorizer(words_list, min_df=3, max_df=0.95)
    '''
    第1行代码查看向量化的维数，即特征的维数；
    第2行代码调用TfidfVectorizer对象的fit_transform方法获得特征矩阵赋值给X；
    第3行代码查看特征矩阵的形状。
    '''
    X = tfidf.fit_transform(raw_documents)
    print('词表大小:', len(tfidf.vocabulary_))
    print(X.shape)
    return X


def df_target2label(target):
    '''
    调用sklearn.preprocessing库的LabelEncoder方法对文章分类做标签编码。
    :param target:
    :return:
    '''
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(target)
    return y


In [3]:
## 2. lr 特征提取
def tfidf_clf_features(clf, X_train, X_test, y_train, y_test, num_class, seed=2019):
    # 初始化stack ： 通过模型提取特征矩阵的数值
    print("dataset size X_train={0},X_test={1},dim={2}".format(X_train.shape[0], X_test.shape[0], X_test.shape[1]))
    stack = np.zeros((X_train.shape[0], num_class))
    stack_te = np.zeros((X_test.shape[0], num_class))
    # 1. 交叉验证- 特征得分平均
    k = 5
    print("cross_validate k={0}".format(k))
    score_va = 0
    score_te = 0
    skf = StratifiedKFold(n_splits=k, random_state=seed)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        print("cross_validate.....i={0}".format(i + 1))
        train_data, val_data = X_train[train_index], X_train[val_index]
        train_y, val_y = y_train[train_index], y_train[val_index]
        # clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
        clf.fit(train_data, train_y)
        y_pred_val = clf.predict(val_data)
        val_accuracy = micro_avg_f1(val_y, y_pred_val)
        test_accuacy = micro_avg_f1(y_test, clf.predict(X_test))
        LOGGER.log('val_dataset accuacy:%f' % val_accuracy)
        LOGGER.log('test_dataset accuacy:%f' % test_accuacy)
        score_va += val_accuracy
        score_te += test_accuacy
        y_pred_val_prob = clf.predict_proba(val_data)
        y_pred_te_prob = clf.predict_proba(X_test)
        stack[val_index] += y_pred_val_prob
        stack_te += y_pred_te_prob
    score_va /= k
    score_te /= k
    print("cross_validate success. print avg acc ...")
    LOGGER.log('val_dataset avg acc:%f' % score_va)
    LOGGER.log('test_dataset avg acc:%f' % score_te)
    # 2. lr... 提取的特征存储
    stack_all = np.vstack([stack / k, stack_te / k])
    return stack_all

In [7]:
config = Config()
# 1. 提取tfidf 格式特征 X,y
train_df = load_data(config.data_csv_path)
count = len(train_df)
print("data_size={0}".format(count))
raw_documents = train_df['content']
X = words_list2tfidf_feature(raw_documents)
y = df_target2label(train_df['penalty'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
num_class = len(np.unique(y))
print("num_class={0}".format(num_class))

data_size=200
词表大小: 1343
(200, 1343)
num_class=8


In [9]:
# 2. 提取特征
# 2.1 lr 提取特征
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
stack_all = tfidf_clf_features(clf, X_train, X_test, y_train, y_test, num_class)
df_stack = pd.DataFrame(index=range(count))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:, i]
df_stack.to_csv(config.feat_tfidf_lr_prob, index=None, encoding='utf8')
df_stack.head()

dataset size X_train=140,X_test=60,dim=1343
cross_validate k=5
cross_validate.....i=1
2019-03-25 17:33:15 val_dataset accuacy:0.516129
2019-03-25 17:33:15 test_dataset accuacy:0.583333
cross_validate.....i=2
2019-03-25 17:33:15 val_dataset accuacy:0.533333
2019-03-25 17:33:15 test_dataset accuacy:0.600000
cross_validate.....i=3
2019-03-25 17:33:15 val_dataset accuacy:0.518519
2019-03-25 17:33:15 test_dataset accuacy:0.616667
cross_validate.....i=4
2019-03-25 17:33:15 val_dataset accuacy:0.615385
2019-03-25 17:33:15 test_dataset accuacy:0.633333
cross_validate.....i=5
2019-03-25 17:33:15 val_dataset accuacy:0.576923
2019-03-25 17:33:15 test_dataset accuacy:0.616667
cross_validate success. print avg acc ...
2019-03-25 17:33:15 val_dataset avg acc:0.552058
2019-03-25 17:33:15 test_dataset avg acc:0.610000


Unnamed: 0,tfidf_lr_0,tfidf_lr_1,tfidf_lr_2,tfidf_lr_3,tfidf_lr_4,tfidf_lr_5,tfidf_lr_6,tfidf_lr_7
0,0.01197,0.125226,0.027079,0.005949,0.008986,0.005228,0.010474,0.005089
1,0.014266,0.059491,0.025005,0.047042,0.0125,0.006166,0.030519,0.005011
2,0.108593,0.032227,0.009247,0.006541,0.009368,0.008013,0.020794,0.005217
3,0.031528,0.032429,0.009179,0.01329,0.015672,0.015025,0.07579,0.007087
4,0.028078,0.085904,0.012173,0.008916,0.014038,0.009866,0.033342,0.007683


In [10]:
 # 2.2 BernoulliNB 提取特征
clf = BernoulliNB()
stack_all = tfidf_clf_features(clf, X_train, X_test, y_train, y_test, num_class)
df_stack = pd.DataFrame(index=range(count))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_bnb_{}'.format(i)] = stack_all[:, i]
df_stack.to_csv(config.feat_tfidf_bnb_prob, index=None, encoding='utf8')
df_stack.head()

dataset size X_train=140,X_test=60,dim=1343
cross_validate k=5
cross_validate.....i=1
2019-03-25 17:33:31 val_dataset accuacy:0.516129
2019-03-25 17:33:31 test_dataset accuacy:0.583333
cross_validate.....i=2
2019-03-25 17:33:31 val_dataset accuacy:0.500000
2019-03-25 17:33:31 test_dataset accuacy:0.583333
cross_validate.....i=3
2019-03-25 17:33:31 val_dataset accuacy:0.518519
2019-03-25 17:33:31 test_dataset accuacy:0.583333
cross_validate.....i=4
2019-03-25 17:33:31 val_dataset accuacy:0.615385
2019-03-25 17:33:31 test_dataset accuacy:0.650000
cross_validate.....i=5
2019-03-25 17:33:31 val_dataset accuacy:0.538462
2019-03-25 17:33:31 test_dataset accuacy:0.616667
cross_validate success. print avg acc ...
2019-03-25 17:33:31 val_dataset avg acc:0.537699
2019-03-25 17:33:31 test_dataset avg acc:0.603333


Unnamed: 0,tfidf_bnb_0,tfidf_bnb_1,tfidf_bnb_2,tfidf_bnb_3,tfidf_bnb_4,tfidf_bnb_5,tfidf_bnb_6,tfidf_bnb_7
0,3.9471409999999997e-50,0.2,1.4203210000000001e-33,4.158327e-67,4.797524e-79,4.132211e-97,2.054234e-78,3.374777e-128
1,2.7928149999999998e-30,0.2,2.6371160000000002e-27,3.539774e-15,5.040736e-61,1.119635e-79,4.309427e-45,4.5160709999999994e-111
2,0.2,1.586325e-33,1.9445570000000001e-53,5.119947e-66,9.064126e-68,5.4611e-74,7.675490000000001e-55,1.118855e-111
3,2.525483e-07,0.04104715,3.8925520000000003e-31,2.5291040000000003e-27,3.2237e-33,1.563185e-38,0.1589526,1.318779e-76
4,6.404584e-15,0.2,1.346761e-33,5.517362e-42,3.0227329999999997e-38,5.110156999999999e-50,3.0752110000000003e-23,4.450799e-89


In [11]:
# 2.3 BernoulliNB 提取特征
clf = MultinomialNB()
stack_all = tfidf_clf_features(clf, X_train, X_test, y_train, y_test, num_class)
df_stack = pd.DataFrame(index=range(count))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_mnb_{}'.format(i)] = stack_all[:, i]
df_stack.to_csv(config.feat_tfidf_mnb_prob, index=None, encoding='utf8')
df_stack.head()

dataset size X_train=140,X_test=60,dim=1343
cross_validate k=5
cross_validate.....i=1
2019-03-25 17:33:46 val_dataset accuacy:0.483871
2019-03-25 17:33:46 test_dataset accuacy:0.566667
cross_validate.....i=2
2019-03-25 17:33:46 val_dataset accuacy:0.533333
2019-03-25 17:33:46 test_dataset accuacy:0.583333
cross_validate.....i=3
2019-03-25 17:33:46 val_dataset accuacy:0.481481
2019-03-25 17:33:46 test_dataset accuacy:0.600000
cross_validate.....i=4
2019-03-25 17:33:46 val_dataset accuacy:0.538462
2019-03-25 17:33:46 test_dataset accuacy:0.566667
cross_validate.....i=5
2019-03-25 17:33:46 val_dataset accuacy:0.538462
2019-03-25 17:33:46 test_dataset accuacy:0.583333
cross_validate success. print avg acc ...
2019-03-25 17:33:46 val_dataset avg acc:0.515122
2019-03-25 17:33:46 test_dataset avg acc:0.580000


Unnamed: 0,tfidf_mnb_0,tfidf_mnb_1,tfidf_mnb_2,tfidf_mnb_3,tfidf_mnb_4,tfidf_mnb_5,tfidf_mnb_6,tfidf_mnb_7
0,0.003829,0.180469,0.008034,0.001521,0.00185,0.000902,0.002686,0.00071
1,0.008375,0.123705,0.012833,0.02448,0.004474,0.001678,0.023398,0.001057
2,0.135864,0.029412,0.005488,0.004134,0.004834,0.004239,0.013828,0.0022
3,0.037439,0.049738,0.005405,0.008839,0.008999,0.007237,0.079697,0.002646
4,0.029211,0.098473,0.010282,0.008347,0.010358,0.00732,0.031354,0.004655
