## 调用`sklearn.feature_selection.SelectKBest`使用卡方检验的方法进行数据预处理

### 特征维度从约 47 万（469138）降到 50000，最终 sklearn 模型的结果有所提升

### 问题：sklearn 的 SelectKBest 本身不保存词典（特征词表），因此这里先把训练集和测试集合并，统一做向量化和特征选择，然后再重新划分，训练集比例为 0.3；与 baseline 相比，特征维度不超过 5 万。注意：这种做法在特征选择阶段用到了测试集的文本和标签，存在数据泄漏，评估结果可能偏乐观

In [1]:
import pandas as pd
import os
import pickle

def openDataFrame(path):
    """Load a pickled list of tokenised records into a DataFrame.

    The pickle at ``path`` is expected to hold row-like records with three
    fields each; they become the columns ``index``, ``content`` and ``tag``.

    NOTE: ``pickle.load`` executes arbitrary code from the file, so only
    open pickles that were produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        records = pickle.load(handle)
    return pd.DataFrame(records, columns=['index', 'content', 'tag'])

# Load the pre-tokenised training and test corpora and show their layout.
trainFrame = openDataFrame('DB/trainDB/words.pkl')
testFrame = openDataFrame('DB/testDB/words.pkl')
print(trainFrame.info())
print(testFrame.info())

# Stack both corpora into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it each source keeps its own RangeIndex and the merged
# frame ends up with duplicate row labels, which makes label-based lookups
# ambiguous.
mergedFrame = pd.concat([trainFrame, testFrame], ignore_index=True)
mergedFrame.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549972 entries, 0 to 549971
Data columns (total 3 columns):
index      549972 non-null object
content    549972 non-null object
tag        549972 non-null object
dtypes: object(3)
memory usage: 12.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739840 entries, 0 to 739839
Data columns (total 3 columns):
index      739840 non-null object
content    739840 non-null object
tag        739840 non-null object
dtypes: object(3)
memory usage: 16.9+ MB
None


Unnamed: 0,index,content,tag
0,1,"[标题, 月薪, 年头, 金融, 学历, 留学生, 月薪, 话题, 留学生, 高校, 毕业生...",教育
1,2,"[综合报道, 澳大利亚, 节目, 事业, SBS, 报告, 英国, 政府, 外国, 留学生,...",教育
2,3,"[中国, 行业, 论坛, 素质, 行业, 论坛, 总部, 大厦, 论坛, 主题, 行业, 素...",教育
3,4,"[桃李, 芬芳, 书院, 时期, 潍坊, 一中, 时代, 面貌, 部长, 教育部, 党组, ...",教育
4,5,"[渠道, ETS, 世贸, 媒体, 见面会, 媒体, 老师, 中国, 现状, 观点, ETS...",教育


In [3]:
# Build the numeric class labels and the space-joined documents.

label_map = {'财经':0, '股票':1, '教育':2, '科技':3, '时政':4,'体育':5,'游戏':6,'娱乐':7,'汽车':8,'社会':9,'军事':10}
mergedFrame['y_target'] = mergedFrame['tag'].map(label_map)

# Series.map yields NaN for any tag that is not a key of label_map. Fail here
# with the offending tags instead of passing NaN labels on to later steps.
unknown_tags = mergedFrame.loc[mergedFrame['y_target'].isna(), 'tag'].unique().tolist()
if unknown_tags:
    raise ValueError('tags missing from label_map: {}'.format(unknown_tags))

y_target = mergedFrame.y_target.tolist()
# Each row's content is a list of tokens; join them with spaces so every
# document becomes a single string.
mergeList = [' '.join(x) for x in mergedFrame.content.tolist()]


In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectKBest

# Turn the space-joined documents into a sparse term-count matrix.
# min_df and max_df are given as floats, which scikit-learn interprets as
# document-frequency proportions (see the CountVectorizer documentation).
# NOTE(review): mergeList is built from the merged train + test frame, so the
# printed 'train shape' actually describes the whole merged corpus.
vecMethod = CountVectorizer(min_df=0.000001, max_df=0.4)
Matrix = vecMethod.fit_transform(mergeList)
print('train shape:', Matrix.shape)

train shape: (1289812, 469138)


In [45]:
# Display the current dimensions of Matrix as the cell result.
Matrix.shape

(1289812, 469138)

In [46]:
# Reverse lookup from numeric class id back to the category name.
# The position of each name in the list is its class id.
relabel_map = dict(enumerate([
    '财经',
    '股票',
    '教育',
    '科技',
    '时政',
    '体育',
    '游戏',
    '娱乐',
    '汽车',
    '社会',
    '军事',
]))
relabel_map

{0: '财经',
 1: '股票',
 2: '教育',
 3: '科技',
 4: '时政',
 5: '体育',
 6: '游戏',
 7: '娱乐',
 8: '汽车',
 9: '社会',
 10: '军事'}

In [47]:
def featureSelection(matrix, y_target, k=50000):
    """Scale a feature matrix and keep the k best columns by chi-squared score.

    Args:
        matrix: feature matrix (for example the sparse term-count matrix
            produced by CountVectorizer), one row per sample.
        y_target: class label for every row of ``matrix``.
        k: number of features to keep, or ``'all'``. Defaults to 50000. A
            value larger than the number of available columns is clamped to
            that number instead of being passed to SelectKBest unchanged.

    Returns:
        The reduced matrix containing only the selected columns. Its shape is
        also printed.
    """
    from sklearn.preprocessing import MaxAbsScaler
    from sklearn.feature_selection import SelectKBest, chi2

    # Divide every column by its largest absolute value before scoring.
    scaled = MaxAbsScaler().fit_transform(matrix)

    # Never ask for more columns than the matrix actually has.
    if k != 'all':
        k = min(k, scaled.shape[1])

    selector = SelectKBest(chi2, k=k)
    reduced = selector.fit_transform(scaled, y_target)
    print(reduced.shape)
    return reduced

# Rebind Matrix to the reduced matrix returned by featureSelection; the
# name no longer refers to the full count matrix after this line.
Matrix = featureSelection(Matrix, y_target)

(1289812, 50000)


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix

# Stratified split: 30% of the rows go to training, the rest to testing.
# A fixed random_state makes the split, and therefore the pickles and scores
# derived from it, reproducible across notebook runs.
trainMatrix, testMatrix, trainTarget, testTarget = train_test_split(
    Matrix, y_target, train_size=0.3, stratify=y_target, random_state=42)

print(trainMatrix.shape, testMatrix.shape, len(trainTarget), len(testTarget))

In [49]:
# NOTE: alpha is MultinomialNB's additive (Laplace/Lidstone) smoothing
# parameter, not a convergence threshold. The model is estimated from feature
# counts and has no iterative stopping criterion.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
nbMethod = MultinomialNB(alpha=0.1)
# Keep the value returned by fit() as the trained classifier used below.
predictMachine = nbMethod.fit(trainMatrix, trainTarget)

In [51]:
# Bunch's public home is sklearn.utils; sklearn.datasets.base is a private
# module path that newer scikit-learn releases no longer provide. Fall back to
# the old location only for releases that predate the public one.
try:
    from sklearn.utils import Bunch
except ImportError:
    from sklearn.datasets.base import Bunch

# Bundle the training matrix with its labels and persist it.
bunchObj = Bunch(matrix = trainMatrix, y_target = trainTarget)
# Mode 'xb' is exclusive creation: this raises FileExistsError rather than
# overwriting a previously saved file.
with open('DB/trainDB/matrix/KCSV.pkl', 'xb') as f:
    pickle.dump(bunchObj, f)

In [52]:
# Persist the fitted classifier. Mode 'xb' is exclusive creation, so an
# existing file is never overwritten (FileExistsError is raised instead).
# The with-statement closes the file; no explicit close() is needed.
with open('result/SK_NBmethod.pkl', 'xb') as f:
    pickle.dump(predictMachine, f)

In [53]:
from sklearn.metrics import confusion_matrix
import time
# Predict a class id for every row of the held-out test matrix, then display
# the resulting array as the cell result.
predict = predictMachine.predict(testMatrix)
predict

array([ 3,  4,  3, ...,  9, 10,  9])

In [59]:
# Build the bundle before opening the file: with exclusive-create mode 'xb',
# a failure after the open would leave an empty file behind that blocks the
# next run with FileExistsError.
# NOTE(review): the labels are stored under the key 'target' here, while the
# training bundle is written with the key 'y_target' -- confirm that readers
# of both files expect this difference.
bunchObj = Bunch(matrix = testMatrix, target = testTarget)
# The with-statement closes the file; no explicit close() is needed.
with open('DB/testDB/matrix/KCSV.pkl', 'xb') as f:
    pickle.dump(bunchObj, f)

In [62]:
# Persist the (true labels, predicted labels) pair for later evaluation.
# Mode 'xb' is exclusive creation, so an existing file is never overwritten.
# The with-statement closes the file; no explicit close() is needed.
with open('result/SK_tgt_pre.pkl', 'xb') as f:
    pickle.dump((testTarget, predict), f)