<h1>This notebook extracts the detailed reason keywords from each case file.</h1>

In [17]:
import pandas as pd
import numpy as np
import json
import os
import spacy
import zh_core_web_trf
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language
from spacy.attrs import ORTH
import re
from spacy.tokens import Doc
from spacy.pipeline import EntityRuler
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.symbols import POS
from spacy.strings import StringStore
from spacy.pipeline import Tagger

In [4]:
# Root folder of the case archive. The three lists below name the nested
# directory levels that are joined under it (see process_multiple_cases).
base_url = "/Users/starice/OwnFiles/cityu/RA/"
# Top-level case-type folders: 'type1' .. 'type4'.
pre_dir = ['type' + str(n) for n in range(1, 5)]
# Second-level folders: '2014' .. '2020'.
dir_name = [str(year) for year in range(2014, 2021)]
# Third-level folders: '1' .. '12' (presumably months -- TODO confirm).
dir_sname = [str(n) for n in range(1, 13)]

In [5]:
# 目前发现的问题
# 1. 如果法院认为内容中的法律条款内容里出现了reason keywords，并不能说明该案件就和出现的keyword有关联，需要再做区分才行。
#     例如type1 2014 1 json: 5911d51ec3666e1b15607ff5

In [6]:
# 想到的解决方法
# 1. 原文内容匹配，从文本中找出食品安全法相关的文本并和食品安全法原文做匹配，进一步探寻原因
#     有些案件只标明违反了第九十六条，具体原因不明
# 2. 老老实实用关键字匹配，不是所有的文本都标明了违反的具体条款
#     用匹配到的关键字出现的次数来决定最终属于哪几类违法原因（在有多于两个案件原因的情况下，
#     如果某些原因的关键字出现次数少于两次，就认为是混淆项，可以排除掉）

<h2>Build Reason Dictionary</h2>

In [13]:
# Keyword dictionary: maps each violation-reason label to the keywords that signal it.
#
# reason1: label, ingredient-list and outer-packaging violations (including false or
#          exaggerated information, format and naming-convention problems, etc.)
# reason2: counterfeit products
# reason3: shelf life, production date
# reason4: raw materials, additives
# reason5: trademarks
# reason6: entry/exit inspection, quarantine certificates, proof of origin
#          (imported food or wild animals/plants)
# reason7: production licence, production standards, supporting documents
# reason8: catering food-safety and hygiene standards (catering only)
# reason9: whether the buyer is a genuine consumer (suspected professional counterfeit-hunter)
# reason10: no national standard exists yet for the food-related product, or the
#           product has not passed a safety assessment
# reason11: product failed quality inspection / has quality problems
# keyword1: imported food (not a category of its own; only used to tell these cases
#           apart from the violation reasons of ordinary food)
#
# Key order matters: the pattern-building cell takes every key except the LAST one
# as a reason, and the matcher cell names the rules "reason1".."reasonN" by
# position. Keep the reasonN keys in numeric order and keep "keyword1" last.
# NOTE(review): '人身安全' and '财产安全' appear under both reason8 and reason11 --
# confirm that this overlap is intended.

reasons = {
    "reason1": 
        ['标签', '产品标签营养成分表内容', '外包装食品标签', 
         '标记', '标识', '标题', '条形码', '食品外包装食品标签', 
         '标注', '卷标', '产品说明标签', '标签不符合', '标签不合格', 
         '未标注配料表', '食品配料表', '外包装配料表', '配料表未标注', 
         '外包装', '虚假', '虚假欺诈', '夸大', '夸大宣传', 
         '欺骗误导', '吹捧', '贬低', '夸大其词', '标签格式', '中文标签格式', 
         '中文标签版面格式', '营养标签格式', '文字格式包装', 
         '标签字体', '名称规范', '产品标准号', '说明书'], 
    "reason2": 
        ['假冒产品', '假冒伪劣产品', '假冒侵权产品', '伪劣产品'], 
    "reason3": 
        ['保质期', '保质期限', '生产日期', '过期'], 
    "reason4": 
        ['原始配料', '原料', '原材料', '原料未标注', 
         '非药食同源物质', '食品添加剂', '添加剂', '非食用', 
         '非食品', '食品原料'], 
    "reason5": 
        ['商标', '商标专用权', '商标违法', '商标侵权'], 
    "reason6": 
        ['出入境检验', '出入境检疫', '入境', '边检', 
         '入境检验', '入境检验检疫', '海关检疫', '检疫', '来源证明'], 
    "reason7": 
        ['生产许可', '许可生产', '生产经营许可'], 
    "reason8": 
        ['餐饮', '餐饮服务', '餐饮服务场所', '餐饮服务提供者', 
         '餐饮服务许可', '餐饮店', '餐饮食品安全卫生标准', 
         '食物中毒', '食源性疾病', '小吃', '饭店', '饭馆', 
         '火锅', '烧烤', '酒店', '人身安全', '财产安全'], 
    "reason9": 
        ['真实消费者', '是否为消费者', '是否是消费者'], 
    "reason10": 
        ['无国家标准', '安全性评估', '安全性评估审查', 
         '相关安全性评估', '安全性评估材料', '安全性评估报告', 
         '安全性评价', '安全评估', '质量评估'], 
    "reason11": 
        ['质检不合格', '质量不符合', '不符合质量', 
         '质量品质', '质量保证书', '质量合格证', '性状', 
         '外观性状', '感官性状', '化学性状', '产品性状', 
         '基本自然性状', '品系', '人身安全', '财产安全', 
         '检验', '检测', '证明材料', '供应商资质'], 
    "keyword1": 
        ['进口', '进出口', '进出口许可', 
         '进出口资质', '对外贸易', '食品进口', 
         '进口食品', '进口普通食品']
}

In [136]:
# One single-token pattern per violation reason: a token matches when its exact
# text (ORTH) is one of that reason's keywords. The last key of `reasons`
# ("keyword1") is skipped here and gets its own pattern just below.
rePatterns = [
    [{"ORTH": {"IN": reasons[reason_key]}}]
    for reason_key in list(reasons.keys())[:-1]
]
# Pattern for the import-related keywords.
importPattern = [{"ORTH": {"IN": reasons['keyword1']}}]

In [137]:
nlp = spacy.load("zh_core_web_lg")

# Matcher for the violation reasons: rule "reason<k>" holds the k-th pattern.
reMatcher = Matcher(nlp.vocab)
reason_keys = list(reasons.keys())
for position, pattern in enumerate(rePatterns, start=1):
    reMatcher.add("reason" + str(position), [pattern])
    # Register the same keywords in the tokenizer's pkuseg user dictionary --
    # presumably so each keyword is segmented as a single token, which the
    # single-token ORTH patterns need in order to fire (TODO confirm).
    nlp.tokenizer.pkuseg_update_user_dict(reasons[reason_keys[position - 1]])

# Separate matcher that only flags import-related keywords.
keywordMatcher = Matcher(nlp.vocab)
keywordMatcher.add("isImport", [importPattern])
nlp.tokenizer.pkuseg_update_user_dict(reasons['keyword1'])

In [154]:
def extract_reasons(path, noise_count=2, max_gap=3):
    """Count reason-keyword hits in one case file and keep the dominant reasons.

    Only segments whose ``type`` is ``"ASCERTAIN"``, ``"COURT_HELD"`` or
    ``"courtHeld"`` are scanned. Each segment's text is run through the
    module-level ``nlp`` pipeline; ``reMatcher`` hits are tallied per rule
    name and any ``keywordMatcher`` hit sets the import flag.

    Parameters
    ----------
    path : str
        Path of the case JSON file. The payload may be a JSON object, or a
        JSON string that itself contains the JSON object (double-encoded).
    noise_count : int, optional
        A reason hit this many times or fewer is treated as noise once some
        other reason exceeds this count. Defaults to 2.
    max_gap : int, optional
        A reason is dropped when it trails the most frequent reason by more
        than this many hits. Defaults to 3.

    Returns
    -------
    tuple
        ``(case_id, reasonList, isImport)``. ``reasonList`` is empty when no
        reason keyword matched. When no reason exceeds ``noise_count`` hits,
        it holds only the single most frequent reason. Otherwise it holds the
        surviving reasons in first-seen order.
    """
    # Read and close the file before the (slow) NLP work starts.
    with open(path, encoding="UTF-8") as f:
        data = json.load(f)
    # Double-encoded files decode to a str first; plain JSON objects are
    # accepted as they are instead of raising a TypeError.
    dict_data = json.loads(data) if isinstance(data, str) else data
    case_id = dict_data['returnData']['id']

    relevant_types = ("ASCERTAIN", "COURT_HELD", "courtHeld")
    reasonDict = {}
    isImport = False
    for segment in dict_data['returnData']['segments']:
        if segment['type'] not in relevant_types:
            continue
        doc = nlp(segment['text'])
        # Tally every pattern hit under its rule name ("reason1", ...).
        for match_id, _start, _end in reMatcher(doc):
            string_id = nlp.vocab.strings[match_id]  # Get string representation
            reasonDict[string_id] = reasonDict.get(string_id, 0) + 1
        if len(keywordMatcher(doc)) > 0:
            isImport = True

    if not reasonDict:
        # No reason keyword matched; such rows can be filtered out later in
        # pandas. The detected import flag is reported rather than a
        # hard-coded False, so an import-keyword hit is not lost.
        return case_id, [], isImport

    # Stable sort: reasons with equal counts keep their first-seen order.
    sortReasons = sorted(reasonDict.items(), key=lambda item: item[1], reverse=True)
    maxTimes = sortReasons[0][1]
    if maxTimes <= noise_count:
        # Nothing stands out, so fall back to the single most frequent reason.
        return case_id, [sortReasons[0][0]], isImport

    # Keep reasons that are both above the noise level and close to the top count.
    reasonList = [
        name for name, times in reasonDict.items()
        if times > noise_count and maxTimes - times <= max_gap
    ]
    return case_id, reasonList, isImport

In [155]:
# Smoke test: run extract_reasons on one known case file and display the result.
sample_case_path = '/Users/starice/OwnFiles/cityu/RA/type1/2014/1/json/57baba77c2265c5f452d2f3a.json'
extract_reasons(sample_case_path)

('57baba77c2265c5f452d2f3a', ['reason11'], False)

In [156]:
def bath_extract_reasons(dir_path):
    """Run ``extract_reasons`` on every ``.json`` file directly inside ``dir_path``.

    Entries whose extension is not ``json`` and sub-directories are skipped.
    Files are visited in sorted name order so the row order is reproducible.

    Parameters
    ----------
    dir_path : str
        Directory that holds the case JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per processed file with the columns ``case_id``,
        ``case_reason`` (list of reason labels) and ``is_import``. The frame
        is empty, but still has these columns, when nothing was processed.
    """
    columns = ["case_id", "case_reason", "is_import"]
    records = []
    for file in sorted(os.listdir(dir_path)):
        if os.path.splitext(file)[-1][1:] != "json":
            continue
        file_path = os.path.join(dir_path, file)
        # Check the full path: a bare file name would be resolved against the
        # current working directory, not against dir_path.
        if os.path.isdir(file_path):
            continue
        case_id, reasonList, isImport = extract_reasons(file_path)
        records.append({"case_id": case_id, "case_reason": reasonList, "is_import": isImport})
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # re-copies the whole frame on every call.
    return pd.DataFrame(records, columns=columns)

In [164]:
def process_multiple_cases(pre_dir, dir_name, dir_sname, base_dir=None,
                           out_dir="/Users/starice/Desktop/case_reasons/"):
    """Extract reasons for every ``<base>/<pre>/<name>/<sname>/json`` folder and save one CSV each.

    Parameters
    ----------
    pre_dir, dir_name, dir_sname : iterable of str
        Names of the first, second and third directory level to combine.
    base_dir : str, optional
        Root folder of the case archive. Defaults to the module-level
        ``base_url`` when omitted.
    out_dir : str, optional
        Folder that receives the ``<pre>_<name>_<sname>.csv`` files. It is
        created if it does not exist yet.

    Returns
    -------
    None
        Results are written to disk; missing input folders are reported and skipped.
    """
    root = base_url if base_dir is None else base_dir
    # Create the output folder up front so a missing directory fails fast
    # instead of after a whole folder has been processed.
    os.makedirs(out_dir, exist_ok=True)
    for i in pre_dir:
        for j in dir_name:
            for k in dir_sname:
                dir_path = os.path.join(root, i, j, k, "json")
                print("processing directory: ", dir_path)
                if not os.path.exists(dir_path):
                    print("路径不存在！", dir_path)
                    continue
                reasonsPd = bath_extract_reasons(dir_path)
                csv_name = str(i) + "_" + str(j) + "_" + str(k) + ".csv"
                reasonsPd.to_csv(os.path.join(out_dir, csv_name))

In [171]:
# Process only 'type2' / '2017', across every numbered sub-directory.
process_multiple_cases(
    pre_dir=pre_dir[1:2],
    dir_name=dir_name[3:4],
    dir_sname=dir_sname[:],
)

processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/1/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/2/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/3/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/4/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/5/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/6/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/7/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/8/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/9/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/10/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/11/json
processing directory:  /Users/starice/OwnFiles/cityu/RA/type2/2017/12/json
