# ecom_senti语料准备

##  特征词/情感词 阅读理解语料生成




### 准备相关文件

1. 为本次任务指定文件夹, 文件夹名一般就是 `<task_name>`, 后面模型训练和评估会用到
2. 准备原始标注语料 general.json, 可以通过nlptools dump_dataset直接导出, 上传到步骤1目录
3. 从nlp_label数据库下载当前品类的相关排除词文件 excludes.csv，上传到步骤1目录


In [None]:
import json
import os

# Task name; it is also the name of the working directory under
# /data/projects/bert_pytorch that the rest of this notebook reads and writes.
task_name = 'ecom_912'
test_dir = f'/data/projects/bert_pytorch/{task_name}'
# general.json: the annotated JSON-lines corpus of the current training task.
corpus_path = os.path.join(test_dir, 'general.json')
# excludes.csv: the exclusion terms of the current category.
excludes_path = os.path.join(test_dir, 'excludes.csv')

# SUBTYPE.json must contain an entry describing the subtypes of this category.
category_name = 'general'
# Open through a context manager so the handle is closed right away (the bare
# open() used before leaked it) and decode explicitly as UTF-8 rather than
# with the platform default encoding.
with open('SUBTYPE.json', encoding='utf-8') as subtype_file:
    trans_sub = json.load(subtype_file)[category_name]
trans_sub

### 按照各个subtype提取正样本


In [None]:
# Run the extraction script once per subtype to collect its positive samples.
for subtype in trans_sub:
    print(f'为{subtype}提取正样本')
    # Encode the subtype so it is safe as a single path/argument token:
    # ' ' -> '_' and '/' -> '.'. The later cells decode directory names with
    # the inverse mapping.
    note = subtype.replace(' ', '_').replace('/','.')
    command = f'./extract_subtype_ecom.sh {note} bert_pytorch/{task_name}'
    status = os.system(command)
    # os.system returns the shell's exit status; surface failures instead of
    # silently continuing with a missing or partial positive-sample file.
    if status != 0:
        print(f'{subtype} 正样本提取失败, 退出状态 {status}')

### 根据各个subtype提取负样本

#### 从混淆subtype寻找负样本
1. 同属于一个type的其他subtype样本
2. 不属于同一个type、但容易混淆的其他subtype样本
3. 其他subtype的样本

In [None]:
# nega_sample_dict maps a subtype to extra subtypes that negative samples
# should be mined from. It is meant for pairs that are easy to confuse, e.g.
# 'Logistics Service' vs. 'Shop/Customer Service', so that the model can learn
# to tell them apart. The mining loop adds these to the other subtypes of the
# same level-1 type; a subtype without an entry here only uses those siblings.
_PACKAGE_ASPECTS = (
    'Cleanliness',
    'Design',
    'General',
    'Integrity',
    'Material',
    'Printing',
)

nega_sample_dict = {
    'Wrong Delivery': ['Logistics Fee', 'Promotion'],
    'Logistics Package': [f'Package {aspect}' for aspect in _PACKAGE_ASPECTS],
    'Logistics Service': ['Shop/Customer Service'],
}

# TODO: fold this table into SUBTYPE.json.
# Commented-out alternative table (product-level subtypes), kept for reference:
# general_subtypes = [
#      ("Product","Fat Granule", "脂肪粒"  ),
#      ("Product", "Greasy", "油腻"),
#      ("Product", "Irritation", "刺激"),
#      ("Product", "Moisturization", "保湿"),
#      ("Product", "Smell", "气味"),
#      ("Product", "Whitening", "肤色改善")]

# Subtypes grouped by their level-1 type: (type, [(subtype, Chinese label), ...]).
_SUBTYPES_BY_TYPE = [
    ('Branding', [
        ('Brand Equity', '品牌资产'),
        ('Loyalty', '品牌忠诚度'),
        ('New User', '品牌新用户'),
        ('WOM', '品牌口碑'),
    ]),
    ('Authenticity', [
        ('Fake Concern', '假货'),
    ]),
    ('Inventory', [
        ('Inventory', '库存'),
        ('Expiration Date', '保质期'),
    ]),
    ('Logistics', [
        ('Logistics Speed', '快递送货速度'),
        ('Pick-up Speed', '快递发货速度'),
        ('Wrong Delivery', '快递错发漏发'),
        ('Logistics Fee', '快递费用'),
        ('Logistics Service', '快递服务'),
        ('Logistics Company', '快递公司'),
        ('Logistics Package', '快递包装'),
        ('Logistics Damage', '快递破损'),
    ]),
    ('Package', [
        ('Package Cleanliness', '包装清洁度'),
        ('Package Design', '包装设计'),
        ('Package Integrity', '包装完整度'),
        ('Package Material', '包装材质'),
        ('Package Printing', '包装印刷'),
        ('Package General', '包装概览'),
    ]),
    ('Price', [
        ('Price Satisfaction', '价格满意度'),
        ('Price Sensitivity', '价格敏感度'),
    ]),
    ('Promotion', [
        ('Promotion', '促销'),
    ]),
    ('Service', [
        ('Shop/Customer Service', '店铺或客服服务'),
        ('Return Exchange', '退换货服务'),
    ]),
]

# Flat list of (level-1 type, subtype, Chinese label) tuples, in table order.
general_subtypes = [
    (level_1, name, label_zh)
    for level_1, members in _SUBTYPES_BY_TYPE
    for name, label_zh in members
]

# Mine negative samples for every subtype directory from the corpus lines of
# confusable subtypes: the other subtypes of the same level-1 type plus the
# extra ones listed in nega_sample_dict.

# Subtype name -> level-1 type, built once instead of scanning the table for
# every directory.
level_1_by_subtype = {name: level_1 for level_1, name, _ in general_subtypes}

for dirname in os.listdir(test_dir):
    if not os.path.isdir(os.path.join(test_dir, dirname)):
        continue
    # Directory names encode the subtype with ' ' -> '_' and '/' -> '.'.
    subtype = dirname.replace('.','/').replace('_',' ')
    level_1 = level_1_by_subtype.get(subtype)
    if level_1 is None:
        # A directory that is not a known subtype used to abort the whole
        # cell with an IndexError; skip it instead.
        print(f'跳过 {dirname}: 未在 general_subtypes 中找到对应subtype')
        continue
    print('寻找负样本', subtype, level_1)

    siblings = [
        name for parent, name, _ in general_subtypes
        if parent == level_1 and name != subtype
    ]
    # Order-preserving de-duplication: a subtype listed both as a sibling and
    # in nega_sample_dict (e.g. 'Logistics Fee' for 'Wrong Delivery') would
    # otherwise be grepped twice and its lines appended twice.
    samples = []
    for name in siblings + nega_sample_dict.get(subtype, []):
        if name not in samples:
            samples.append(name)
    print(samples)

    nega_file_path = os.path.join(test_dir, dirname, 'nega_samples.json')
    # Create an empty placeholder so the file exists even when nothing matches.
    os.system(f'touch {nega_file_path}')
    for s in samples:
        # Keep corpus lines that mention the confusable subtype `s` but not
        # the current subtype.
        # NOTE(review): grep matches the names anywhere in the line, so a
        # subtype whose name also occurs in another field (e.g. 'Inventory',
        # which is also a level-1 type name) may filter more than intended -
        # confirm against the corpus schema.
        command = f'grep -v "{subtype}" {corpus_path}| grep "{s}" >> {nega_file_path}'
        os.system(command)
        

#### 根据排除词寻找各个subtype的负样本

In [None]:
import pandas as pd
# Load the exclusion terms and keep only the rows whose subtype belongs to the
# current category (trans_sub).
exclude_table = pd.read_csv(excludes_path)
in_category = exclude_table.aspect_subtype.isin(trans_sub)
df_exclude = exclude_table[in_category]
# Displayed as the cell result: number of rows per subtype.
df_exclude.groupby('aspect_subtype').count()

In [None]:

import shlex

# For every subtype directory, collect the corpus lines that match one of the
# subtype's exclusion terms into <dir>/exclude.json.
for dirname in os.listdir(test_dir):
    if not os.path.isdir(os.path.join(test_dir, dirname)):
        continue
    # Directory names encode the subtype with ' ' -> '_' and '/' -> '.'.
    subtype = dirname.replace('.','/').replace('_',' ')
    # Drop missing terms: a NaN cell is a float and has no .strip().
    terms = df_exclude[df_exclude.aspect_subtype == subtype]['term'].dropna()
    print(f'正在处理subtype {subtype}, 找到特征词 {len(terms)}')
    result_path= os.path.join(test_dir, dirname, 'exclude.json')
    # Create an empty placeholder for subtypes without exclusion terms.
    os.system(f'touch {result_path}')
    for term in terms:
        # A space inside a term means "up to 20 characters in between".
        # Plain grep uses basic regular expressions, where the interval must
        # be written \{0,20\}; the unescaped '.{0,20}?' used before was read
        # as literal text, so multi-word terms never matched as intended.
        re_exp = str(term).strip(' ').replace('  ',' ').replace(' ', r'.\{0,20\}')
        if not re_exp:
            # An empty pattern matches every line of the corpus.
            continue
        # shlex.quote keeps quotes and other shell metacharacters in a term
        # from breaking the command; -e protects patterns starting with '-'.
        command = f"grep -e {shlex.quote(re_exp)} {corpus_path} >> {result_path}"
        os.system(command)


### 负样本合并到训练集
负样本来源
1. 排除词
2. 混淆subtype
3. 非相关subtype

In [None]:
import os
def _count_lines(path):
    """Return the number of lines in the file at `path`."""
    with open(path, 'rb') as handle:
        return sum(1 for _ in handle)


# For every subtype directory: assemble the negative set, merge it with the
# positive samples, shuffle, and split the result into train.json / dev.json.
for dirname in os.listdir(test_dir):
    if not os.path.isdir(os.path.join(test_dir, dirname)):
        continue
    # Directory names encode the subtype with ' ' -> '_' and '/' -> '.'.
    subtype = dirname.replace('.','/').replace('_',' ')

    new_file = os.path.join(test_dir, dirname, 'nega_total.json')
    old_file = os.path.join(test_dir, dirname, f'{dirname}.json')
    ext_file1 = os.path.join(test_dir, dirname, f'exclude.json')
    ext_file2 = os.path.join(test_dir, dirname, f'nega_samples.json')
    add_file =  os.path.join(test_dir, dirname, f'nega_add.json')
    result_file = os.path.join(test_dir, dirname, f'shuf_{dirname}.json')

    train_file = os.path.join(test_dir, dirname, 'train.json')
    dev_file = os.path.join(test_dir, dirname, 'dev.json')

    if not os.path.isfile(old_file):
        # Without the positive-sample file there is nothing to balance
        # against; previously this crashed while parsing the `wc` error text.
        print(f'跳过 {dirname}: 未找到正样本文件 {old_file}')
        continue
    # Make sure both negative sources exist so `sort` cannot fail on a
    # missing input file.
    for placeholder in (ext_file1, ext_file2):
        open(placeholder, 'a').close()

    # Slightly fewer negatives than positives (10:9).
    posi_num = _count_lines(old_file)
    nega_num = int(posi_num* 0.9)
    print('正样本', posi_num, subtype)

    # Count the DISTINCT negatives of the two primary sources. The raw line
    # counts used before included duplicates (the same line matched by several
    # terms/subtypes, or appended again on a re-run), so a shortage could go
    # unnoticed although the merged set below is de-duplicated.
    os.system(f"sort '{ext_file1}' '{ext_file2}' | uniq > '{new_file}'")
    uniq_num = _count_lines(new_file)

    sources = f"'{ext_file1}' '{ext_file2}'"
    if uniq_num < nega_num:
        # Not enough negatives: top up with randomly chosen corpus lines that
        # do not mention the subtype. Twice the shortfall is drawn because
        # some of them may duplicate lines already present.
        add_num = nega_num - uniq_num
        print(f'{subtype} subtype负样本不足, 补充{add_num}' )
        command = f'grep -v "{subtype}" {corpus_path} | shuf -n {add_num*2} > {add_file}'
        os.system(command)
        sources += f" '{add_file}'"
    command = f"sort {sources} |uniq|head -n {nega_num} > {new_file}"
    os.system(command)

    # Merge positives and negatives, de-duplicate and shuffle.
    command = f"sort {new_file} {old_file}| uniq| shuf  > {result_file}"
    os.system(command)

    # Split 8:2 based on the ACTUAL number of merged lines. Sizes derived from
    # the expected counts made head/tail overlap whenever de-duplication or a
    # shortage of negatives shrank the file, leaking dev lines into train.
    total_num = _count_lines(result_file)
    train_num = int(total_num * 0.8)
    os.system(f'head -n {train_num} {result_file} > {train_file}')
    # `tail -n +K` prints from line K onwards, i.e. everything after train.
    os.system(f'tail -n +{train_num + 1} {result_file} > {dev_file}')

        

In [None]:
## 检查训练/测试集 正负样本数, 及时发现异常
for dirname in os.listdir(test_dir):
    if os.path.isdir(os.path.join(test_dir, dirname)):
        subtype = dirname.replace('.','/').replace('_',' ')
        
        train_file = os.path.join(test_dir, dirname, 'train.json')
        dev_file = os.path.join(test_dir, dirname, 'dev.json')

        command = f"grep -v '{subtype}' {train_file}|wc -l"
        r = !{command}
        print(r, subtype, '负','train')
        command = f"grep  '{subtype}' {train_file}|wc -l"
        r = !{command}
        print(r, subtype, '正','train')
        
        command = f"grep -v '{subtype}' {dev_file}|wc -l"
        r = !{command}
        print(r, subtype, '负','dev')
        command = f"grep  '{subtype}' {dev_file}|wc -l"
        r = !{command}
        print(r, subtype, '正', 'dev')
        


以上, 任务目录下各个subtype文件夹已经生成, train.json和dev.json分别对应了训练和测试集的语料

## 情感极性分类语料生成



#### 准备数据集

In [None]:
import json
from utils_ecom_senti import find_positions
from data_preprocess import convert_text

# Build the polarity-classification corpus: one
# (opinion term, cleaned text, polarity, subtype) tuple per opinion whose term
# can be located in the cleaned text.
polar_corpus = []

with open(corpus_path, 'r') as corpus_file:
    for raw_line in corpus_file:
        record = json.loads(raw_line)
        clear_text = convert_text(record['text'])
        for opinion in record['opinions']:
            start, end = find_positions(clear_text, opinion['opinionTerm'])
            # NOTE(review): -2 is presumably find_positions' "not found"
            # marker - confirm in utils_ecom_senti.
            if start == -2:
                continue
            subtype = opinion['aspectSubtype']
            polar = opinion['polarity']
            term = clear_text[start:end]
            polar_corpus.append((term, clear_text, polar, subtype))

In [None]:
# Tabulate the polarity corpus; the columns follow the tuple layout of
# polar_corpus: (opinion term, cleaned text, polarity label, subtype).
polar_columns = ['opterm', 'textb', 'label', 'subtype']
df_polar = pd.DataFrame(polar_corpus, columns=polar_columns)
# Displayed as the cell result: number of polarity samples.
len(df_polar)

In [None]:
import os
# Write a shuffled train.csv / dev.csv pair of polarity samples into every
# subtype directory that belongs to the current category.
ratio = 0.8  # share of the samples that goes to train (8:2 train/dev split)
corpus_dir = os.path.dirname(corpus_path)
for dirname in os.listdir(corpus_dir):
    output_dir = os.path.join(corpus_dir, dirname)
    # Directory names encode the subtype with ' ' -> '_' and '/' -> '.'.
    subtype = dirname.replace('.', '/').replace('_'," ")
    if subtype in trans_sub:
        print(dirname)
        # sample(frac=1) returns all rows of the subtype in random order.
        shuffled = df_polar[df_polar.subtype == subtype].sample(frac=1)
        train_limit = int(ratio * len(shuffled))
        train_csv = os.path.join(output_dir, 'train.csv')
        shuffled.iloc[:train_limit].to_csv(train_csv, index=False)
        dev_csv = os.path.join(output_dir, 'dev.csv')
        shuffled.iloc[train_limit:].to_csv(dev_csv, index=False)


以上, 任务目录下各个subtype文件夹已经生成, train.csv 和dev.csv分别对应了训练和测试集的语料