# Task2：正则关键词与文本分类
正则表达式（Regular Expressions，简称regex）是一种用于字符串搜索和操作的强大工具。它使用单个字符串来描述、匹配一系列符合某个句法规则的字符串。正则表达式在计算机科学、编程、数据挖掘和文本处理中有着广泛的应用。

- **定义规则**：根据分类需求，定义一组正则表达式规则。
- **预处理文本**：对输入文本进行清洗，如去除标点符号、转换为小写等。
- **模式匹配**：使用正则表达式在文本中搜索定义的模式。
- **分类决策**：根据匹配结果，将文本分配到相应的类别。

使用正则表达式进行文本分类时，确定关键词是一个关键步骤，因为它直接影响到分类的准确性和效率。可以从分析中找出每个类别的高频词汇，或考虑类别相关的专业术语或行业特定的词汇。

In [2]:
# 数据加载
# 读取数据集，这里是直接联网读取，也可以通过下载文件，再读取
import pandas as pd

data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

#  统计训练集的Intent分布
print(train_data[1].value_counts())
print(train_data[1].unique())
# 统计训练集的文本长度分布
print(train_data[0].apply(len).describe())

1
FilmTele-Play            1355
Video-Play               1334
Music-Play               1304
Radio-Listen             1285
Alarm-Update             1264
Weather-Query            1229
Travel-Query             1220
HomeAppliance-Control    1215
Calendar-Query           1214
TVProgram-Play            240
Audio-Play                226
Other                     214
Name: count, dtype: int64
['Travel-Query' 'Music-Play' 'FilmTele-Play' 'Video-Play' 'Radio-Listen'
 'HomeAppliance-Control' 'Weather-Query' 'Alarm-Update' 'Calendar-Query'
 'TVProgram-Play' 'Audio-Play' 'Other']
count    12100.000000
mean        15.138678
std          4.872583
min          4.000000
25%         12.000000
50%         15.000000
75%         18.000000
max         54.000000
Name: 0, dtype: float64


In [3]:
# 关键词词典构建
from collections import Counter 
import re 
import string 
import jieba 

def to_patterns(data):
    content = ''.join(list(data[0]))
    # 使用正则表达式去除所有空格、换行符和标点符号
    # 将所有的标点符号加入到正则表达式中
    punctuation = re.escape(string.punctuation) + "，。" # 对特殊字符进行转义

    # 创建正则表达式以匹配所有空格、换行符和标点符号
    content = re.sub(f'[{punctuation}\s]', '', content)

    words = jieba.lcut(content) 
    # 读取中文停用词
    cn_stopwords = ''.join(pd.read_csv('https://mirror.coggle.club/stopwords/baidu_stopwords.txt', header=None)[0])
    words = [x for x in words if x not in cn_stopwords]
    words = [x for x in words if len(x)>1]       
    words = [x for x in words if not x.isdigit()]
    
    word_counts = Counter(words)
    
    train_words = [x for x in words if word_counts[x]>5]  
    return train_words  
     
train_words = to_patterns(train_data)

# 构建词频词典
train_words_prior = {} 

for row in train_data.iloc[:].itertuples():    
    text , label  = row[1] , row[2]
    words = jieba.lcut(text)
    words = [x for x in words if x in train_words]  
    
    if len(words) == 0 : 
        continue 
    
    for word in words : 
        if word not in train_words_prior : 
            train_words_prior[word] = {"total" : 0 }  
            
        if label not in train_words_prior[word] : 
            train_words_prior[word][label] = 0 
        
        train_words_prior[word]["total"] += 1 
        train_words_prior[word][label] += 1  

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/fx/x5xt5flx4_s_bhn2f7sx897r0000gn/T/jieba.cache
Loading model cost 0.225 seconds.
Prefix dict has been built successfully.


In [4]:
train_words_prior  = pd.DataFrame(train_words_prior).T 
train_words_prior.fillna(0,inplace= True) 

In [5]:
for category  in train_data[1].unique():
    train_word_prior[category] /= 
     
    


Unnamed: 0,total,Travel-Query,Music-Play,Alarm-Update,Video-Play,FilmTele-Play,Radio-Listen,Audio-Play,Other,TVProgram-Play,Weather-Query,HomeAppliance-Control,Calendar-Query
汽车票,39.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
回家,20.0,8.0,3.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
随便,91.0,0.0,30.0,0.0,26.0,15.0,9.0,6.0,4.0,1.0,0.0,0.0,0.0
播放,1729.0,0.0,464.0,0.0,310.0,589.0,180.0,58.0,6.0,69.0,48.0,5.0,0.0
一首,406.0,0.0,391.0,0.0,0.0,0.0,0.0,3.0,12.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
放来,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
一年,6.0,1.0,1.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
有人,6.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0
媒体,6.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0
