In [18]:
import os
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# 停用词列表
stopwords = []

# 加载停用词列表
with open('../stopwords/cn_stopwords.txt', encoding='utf-8') as f:
    for line in f:
        stopwords.append(line.strip())

with open('../stopwords/hit_stopwords.txt', encoding='utf-8') as f:
    for line in f:
        stopwords.append(line.strip())

with open('../stopwords/baidu_stopwords.txt', encoding='utf-8') as f:
    for line in f:
        stopwords.append(line.strip())

with open('../stopwords/scu_stopwords.txt', encoding='utf-8') as f:
    for line in f:
        stopwords.append(line.strip())

##### 对中文分词和虚词过滤的简单测试

In [3]:
def preprocess(text):
    # 分词
    words = jieba.lcut(text)
    # 去除虚词
    words = [word for word in words if word not in stopwords]
    # 返回处理后的词语列表
    return words

text = '这书是我昨天在杭电图书馆借的的。'
words = preprocess(text)
print(words)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.808 seconds.
Prefix dict has been built successfully.


['这书', '昨天', '杭电', '图书馆']


中文分词和虚词过滤的顺序是有一定影响的。通常情况下，应该先进行中文分词，再进行虚词过滤。原因如下：
首先，如果我们先进行虚词过滤，将虚词删除后再进行中文分词，可能会导致一些实质意义的词语被拆分开来，进而影响分析的准确性。

In [5]:
# 读取train数据集
data = []
labels = []
for root, dirs, files in os.walk('./Chinese_Text_Classification_Task_Dataset/train'):
    print('dealing with {}'.format(root.split('/')[-1]))
    for file in files:
        with open(os.path.join(root, file), 'r', encoding='GB2312',errors='ignore') as f:
            content = f.read()
            # 中文分词
            words = jieba.lcut(content)
            # 过滤停用词和数字
            words = [word for word in words if word not in stopwords and not word.isdigit()]

            data.append(' '.join(words))
            labels.append(root.split('/')[-1])

dealing with train
dealing with C19-Computer
dealing with C3-Art
dealing with C39-Sports
dealing with C31-Enviornment
dealing with C38-Politics
dealing with C34-Economy


In [6]:
len(data)

1200

In [23]:
# 使用TF-IDF算法将文本转换成向量
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)
y = labels

['C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer',
 'C19-Computer

In [8]:
selector = SelectKBest(chi2, k=1000)  # 选择1000个特征
selector.fit(X, y)
X = selector.transform(X)

In [9]:
# 计算特征权重
features = vectorizer.get_feature_names_out() 
feature_weights = selector.scores_
sorted_idx = feature_weights.argsort()[::-1][:1000]
for idx in sorted_idx:
    print(features[idx], feature_weights[idx])

政治 232.18988696511764
体育 171.50252478876223
艺术 152.14737372724562
经济 111.86045413700113
民主 67.7423807227464
运动员 59.81317596191297
文艺 57.73554666621497
土壤 55.42793106987776
干部 46.73408747440439
社会主义 45.90343055805633
文学 44.441673746553356
作品 41.60925430683575
训练 40.957317911242434
企业 40.77487805968941
运动 40.74484369375309
知识经济 40.69489579010183
增长 36.66636339978162
投资 36.657844935363165
全球化 36.530872101813344
算法 35.241281975648924
创作 31.581492626507906
数据库 31.53248020086708
agent 30.628798395771543
奥运会 30.096371342512917
比赛 29.528253119897656
小说 29.429348587832717
健身 28.665530032812786
竞技 28.645102508433187
数据 27.69283225804056
服务器 27.365403149939773
权力 27.255297961674426
电影 27.241404269392437
领导 27.15920420999553
建设 26.75544965994526
审美 25.639874328671883
社会 25.046767072049978
用户 24.922787064855417
系统 24.517096878494836
发展 23.81508808467983
学生 23.197911804650168
人民 22.9928288222716
邓小平 22.85024997825185
浓度 22.735867477948062
环境 22.599375192271506
制度 22.46495487460147
艺术家 22.29009476118

In [16]:
# 构建分类器
svm_clf = SVC(kernel='linear')
svm_clf.fit(X, y)

In [19]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X, y)

In [20]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X, y)

##### 模型测试

In [21]:
# 对新的文本进行分类
test_text = '创作艺术需要社会的包容'
words = preprocess(test_text)
print(words)
X_new = vectorizer.transform([' '.join(words)])
X_new = selector.transform(X_new)
y_new = clf.predict(X_new)
print('Predicted Label:', y_new[0])

['创作', '艺术', '社会', '包容']
Predicted Label: C3-Art


In [12]:
# 读取train数据集
test_data = []
test_labels = []
for root, dirs, files in os.walk('./Chinese_Text_Classification_Task_Dataset/test'):
    print('dealing with {}'.format(root.split('/')[-1]))
    for file in files:
        with open(os.path.join(root, file), 'r', encoding='GB2312',errors='ignore') as f:
            content = f.read()
            # 中文分词
            words = jieba.lcut(content)
            # 过滤停用词和数字
            words = [word for word in words if word not in stopwords and not word.isdigit()]

            test_data.append(' '.join(words))
            test_labels.append(root.split('/')[-1])

dealing with test
dealing with C19-Computer
dealing with C3-Art
dealing with C39-Sports
dealing with C31-Enviornment
dealing with C38-Politics
dealing with C34-Economy


In [24]:
y_pred = []
for i in range(len(test_data)):
    X_new = vectorizer.transform([' '.join(words)])
    X_new = selector.transform(X_new)
    y_new = rf_clf.predict(X_new)
    y_pred.append(y_new[0])

In [25]:
cm = confusion_matrix(test_labels, y_pred, labels=labels)
print("Confusion matrix:")
print(cm)

Confusion matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
