# 使用`CountVectorizer`简单的词向量化
## 选择`min_df = 0.000001`(按文档比例计算,约合 1.29 篇)去掉只在 1 篇文档中出现的词汇
## 选择`max_df = 0.4`去掉高频无用词

In [1]:
import pandas as pd
import os
import pickle

def openDataFrame(path):
    """Load a pickled collection of tokenised documents into a DataFrame.

    Args:
        path: Location of the pickle file to read.

    Returns:
        A ``pandas.DataFrame`` whose columns are ``index``, ``content`` and
        ``tag``, built from the unpickled records.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at trusted, locally produced files.
    """
    with open(path, 'rb') as pickle_file:
        records = pickle.load(pickle_file)
    return pd.DataFrame(records, columns=['index', 'content', 'tag'])

# Load the pre-tokenised training and test corpora from their pickles.
trainFrame = openDataFrame('DB/trainDB/words.pkl')
testFrame = openDataFrame('DB/testDB/words.pkl')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; passing its result to print() would add a stray "None".
trainFrame.info()
testFrame.info()

# Stack both corpora into one frame.
# NOTE(review): each input keeps its own row labels, so the merged index
# contains duplicates; pass ignore_index=True if unique labels are required.
mergedFrame = pd.concat([trainFrame, testFrame])
mergedFrame.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549972 entries, 0 to 549971
Data columns (total 3 columns):
index      549972 non-null object
content    549972 non-null object
tag        549972 non-null object
dtypes: object(3)
memory usage: 12.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739840 entries, 0 to 739839
Data columns (total 3 columns):
index      739840 non-null object
content    739840 non-null object
tag        739840 non-null object
dtypes: object(3)
memory usage: 16.9+ MB
None


Unnamed: 0,index,content,tag
0,1,"[标题, 月薪, 年头, 金融, 学历, 留学生, 月薪, 话题, 留学生, 高校, 毕业生...",教育
1,2,"[综合报道, 澳大利亚, 节目, 事业, SBS, 报告, 英国, 政府, 外国, 留学生,...",教育
2,3,"[中国, 行业, 论坛, 素质, 行业, 论坛, 总部, 大厦, 论坛, 主题, 行业, 素...",教育
3,4,"[桃李, 芬芳, 书院, 时期, 潍坊, 一中, 时代, 面貌, 部长, 教育部, 党组, ...",教育
4,5,"[渠道, ETS, 世贸, 媒体, 见面会, 媒体, 老师, 中国, 现状, 观点, ETS...",教育


In [2]:
# Prepare the processed words and the numeric labels.

# Category tag -> integer class id; ids follow the order of this list.
label_map = {
    tag: code
    for code, tag in enumerate(
        ['财经', '股票', '教育', '科技', '时政', '体育',
         '游戏', '娱乐', '汽车', '社会', '军事']
    )
}
# Translate every row's category tag into its integer class id.
mergedFrame['y_target'] = mergedFrame['tag'].map(label_map)
y_target = mergedFrame['y_target'].tolist()
# Re-join each document's token list with single spaces so that every
# document is one whitespace-delimited string.
mergeList = [' '.join(tokens) for tokens in mergedFrame['content']]


In [5]:
# Persist the merged frame to disk as a UTF-8 encoded CSV file.
# NOTE(review): the `content` cells appear to be Python lists, so they would
# be written as their string representation — confirm that any reader of this
# CSV parses them back into lists.
mergedFrame.to_csv('DB/raw/words.csv', encoding='utf-8')

In [6]:
# Display the (rows, columns) size of the merged frame.
mergedFrame.shape

(1289812, 4)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectKBest

# Bag-of-words term counts over all documents in mergeList.
# Float min_df / max_df values are interpreted as *proportions of documents*:
# terms below the min_df fraction or above the max_df fraction (40%) of
# documents are excluded from the vocabulary.
# NOTE(review): the default token_pattern only keeps tokens of two or more
# word characters, so single-character words are presumably dropped — confirm
# this is intended for the pre-segmented Chinese text.
vecMethod = CountVectorizer(min_df=0.000001, max_df=0.4)
Matrix = vecMethod.fit_transform(mergeList)
# Print the (documents, vocabulary size) dimensions of the count matrix.
print(Matrix.shape)

(1289812, 469138)


In [10]:
# Inverse lookup: integer class id -> category tag. The ids are the positions
# of the tags in this list.
relabel_map = dict(enumerate(
    ['财经', '股票', '教育', '科技', '时政', '体育',
     '游戏', '娱乐', '汽车', '社会', '军事']
))
relabel_map

{0: '财经',
 1: '股票',
 2: '教育',
 3: '科技',
 4: '时政',
 5: '体育',
 6: '游戏',
 7: '娱乐',
 8: '汽车',
 9: '社会',
 10: '军事'}

In [12]:
# Bunch is exposed from sklearn.utils; the sklearn.datasets.base module path
# was deprecated and later removed, so try the current location first and
# fall back to the legacy one only on old installations.
try:
    from sklearn.utils import Bunch
except ImportError:
    from sklearn.datasets.base import Bunch

# Bundle the count matrix, the numeric labels and the fitted vocabulary
# (term -> column index) into a single object.
bunchObj = Bunch(matrix=Matrix, y_target=y_target, vocabulary=vecMethod.vocabulary_)
# Mode 'xb' is exclusive creation: it raises FileExistsError instead of
# overwriting an already saved BOW.pkl.
with open('DB/matrix/BOW.pkl', 'xb') as f:
    pickle.dump(bunchObj, f)