In [29]:
# Re-execute the already-imported `business` module so that edits saved to
# business.py are picked up by this kernel without a restart.
import importlib
import business
importlib.reload(business)

<module 'business' from '/Users/a92461/Desktop/硕汉/短信审核/business.py'>

In [26]:
# 第一轮规则审核代码 
import pandas as pd
import os
from datetime import datetime
import re
from typing import Tuple, Dict, List
from business import validate_business
from tqdm import tqdm

def first_round_rule_audit(input_file: str, save_output: bool = True) -> pd.DataFrame:
    """
    Run the first-round rule audit over every SMS in an Excel sheet.

    Each row is handed to ``validate_business``. The verdict is stored in the
    ``总体操作类型`` column (``放行`` / ``失败`` / ``处理错误``) and the reason
    returned by the validator (or the error text) in ``业务操作类型``.
    Summary statistics are printed; when the sheet also carries the human
    verdict column ``操作类型`` the agreement with it is printed as well.

    Args:
        input_file: Path of the Excel file to audit. Rows are read through the
            ``产品类型``, ``短信内容`` and ``短信签名`` columns plus the optional
            ``账户类型`` column.
        save_output: When True (the default) the annotated frame is written to
            ``规则审核_<timestamp>.xlsx`` in the current working directory.

    Returns:
        The DataFrame read from ``input_file`` with the two result columns added.
    """

    def _pct(count: int, whole: int) -> float:
        """Share of ``count`` in ``whole`` as a percentage; 0.0 for an empty sheet."""
        return count / whole * 100 if whole else 0.0

    # Load the sheet
    print(f"读取文件: {input_file}")
    df = pd.read_excel(input_file)
    total = len(df)
    print(f"成功读取文件，共 {total} 条记录")

    # Verdicts are collected in plain lists and written back once, instead of
    # one df.loc assignment per row.
    verdicts = []
    reasons = []

    print("=============== 开始执行第一轮：规则审核==============")

    for index, row in tqdm(df.iterrows(), total=total, desc="规则审核进度", ncols=100):
        try:
            business_passed, business_reason = validate_business(
                row['产品类型'],
                row['短信内容'],
                row['短信签名'],
                row.get('账户类型')
            )
            verdicts.append('放行' if business_passed else '失败')
            reasons.append(business_reason)
        except Exception as e:
            # Best effort: a row that cannot be audited is flagged, not fatal.
            print(f"处理行 {index} 时出错: {str(e)}")
            verdicts.append('处理错误')
            reasons.append(f"处理错误: {str(e)}")

    df['总体操作类型'] = pd.Series(verdicts, index=df.index, dtype=object)
    df['业务操作类型'] = pd.Series(reasons, index=df.index, dtype=object)

    pass_count = verdicts.count('放行')
    fail_count = verdicts.count('失败')
    error_count = verdicts.count('处理错误')

    # Summary of the rule audit
    print("\n规则审核完成:")
    print(f"- 总记录数: {total}")
    print(f"- 通过数量: {pass_count} ({_pct(pass_count, total):.2f}%)")
    print(f"- 失败数量: {fail_count} ({_pct(fail_count, total):.2f}%)")
    if error_count > 0:
        print(f"- 错误数量: {error_count} ({_pct(error_count, total):.2f}%)")

    # Optional: persist the annotated sheet
    if save_output:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"规则审核_{timestamp}.xlsx"
        df.to_excel(output_file, index=False)
        print(f"完整审核结果已保存至: {output_file}")

    # The comparison below needs the human verdict column; without it the
    # audit result is still returned instead of failing with a KeyError.
    if '操作类型' not in df.columns:
        print("\n未找到人工审核列 '操作类型'，跳过与人工结果的对比")
        return df

    code_verdict = df['总体操作类型']
    human_verdict = df['操作类型']
    code_pass_human_fail = int(((code_verdict == '放行') & (human_verdict == '失败')).sum())
    code_fail_human_pass = int(((code_verdict == '失败') & (human_verdict == '放行')).sum())
    matched = int((code_verdict == human_verdict).sum())
    match_rate = _pct(matched, total)

    # Agreement between the rule engine and the human reviewers
    print('\n审核结果统计:')
    print(f'总样本数: {total}')
    print(f'代码放行但人工失败数量: {code_pass_human_fail}')
    print(f'代码失败但人工放行数量: {code_fail_human_pass}')
    print(f'匹配数量: {matched}')
    print(f'匹配率: {match_rate:.2f}%')

    return df

# Usage example: keep the audited frame under `df_with_results`, the name the
# later batch-audit cell passes to test_batch_audit.
df_with_results = first_round_rule_audit("3月审核记录.xlsx")



读取文件: 3月审核记录.xlsx
成功读取文件，共 8133 条记录


规则审核进度:   2%|▋                                           | 133/8133 [00:00<00:06, 1325.21it/s]

处理行 187 时出错: 'score'


规则审核进度:   5%|██▏                                         | 396/8133 [00:00<00:06, 1277.80it/s]

处理行 319 时出错: 'score'


规则审核进度:  21%|████████▊                                  | 1671/8133 [00:01<00:05, 1214.16it/s]

处理行 1434 时出错: 'score'
处理行 1652 时出错: 'score'


规则审核进度:  27%|███████████▌                               | 2184/8133 [00:01<00:04, 1271.23it/s]

处理行 1927 时出错: 'score'
处理行 1948 时出错: 'score'


规则审核进度:  33%|██████████████▎                            | 2696/8133 [00:02<00:04, 1261.10it/s]

处理行 2495 时出错: 'score'


规则审核进度:  38%|████████████████▎                          | 3084/8133 [00:02<00:03, 1270.72it/s]

处理行 2959 时出错: 'score'
处理行 2960 时出错: 'score'
处理行 2962 时出错: 'score'
处理行 3121 时出错: 'score'
处理行 3122 时出错: 'score'
处理行 3126 时出错: 'score'
处理行 3130 时出错: 'score'


规则审核进度:  42%|██████████████████▎                        | 3453/8133 [00:02<00:04, 1157.34it/s]

处理行 3312 时出错: 'score'


规则审核进度:  45%|███████████████████▌                       | 3693/8133 [00:03<00:03, 1165.01it/s]

处理行 3560 时出错: 'score'


规则审核进度:  57%|████████████████████████▋                  | 4665/8133 [00:03<00:03, 1126.47it/s]

处理行 4502 时出错: 'score'
处理行 4503 时出错: 'score'
处理行 4504 时出错: 'score'


规则审核进度:  62%|██████████████████████████▍                | 5003/8133 [00:04<00:02, 1102.80it/s]

处理行 4879 时出错: 'score'
处理行 4880 时出错: 'score'
处理行 4883 时出错: 'score'
处理行 4921 时出错: 'score'


规则审核进度:  90%|██████████████████████████████████████▉    | 7357/8133 [00:06<00:00, 1140.06it/s]

处理行 7163 时出错: 'score'


规则审核进度: 100%|███████████████████████████████████████████| 8133/8133 [00:06<00:00, 1180.93it/s]



规则审核完成:
- 总记录数: 8133
- 通过数量: 6302 (77.49%)
- 失败数量: 1807 (22.22%)
- 错误数量: 24 (0.30%)
完整审核结果已保存至: 规则审核_20250408_152746.xlsx

审核结果统计:
总样本数: 8133
代码放行但人工失败数量: 362
代码失败但人工放行数量: 974
匹配数量: 6773
匹配率: 83.28%


In [None]:

import time
from business import validate_business
from typing import Tuple, Dict
import pandas as pd
import os
from datetime import datetime
import re
from collections import Counter
from ai_check import AIAuditor
import json

# 2. 准备AI审核数据
def prepare_ai_data(sample_df):
    """
    Turn rule-audited rows into the list of dicts handed to the AI auditor.

    The rule score is parsed from the ``业务操作类型`` text (``总分: <number>``);
    when no score can be parsed the default of 100.0 is kept. A failure while
    parsing is printed and the row is still emitted with the default score; a
    failure while building the record is printed and the row is skipped.
    """
    score_pattern = r'总分: (\d+\.?\d*)'
    records = []

    for _, sms in sample_df.iterrows():
        try:
            rule_score = 100.0  # default, also used for "直接通过" reasons
            try:
                found = re.search(score_pattern, sms['业务操作类型'])
                if found is not None:
                    rule_score = float(found.group(1))
            except Exception as e:
                print(f"提取分数时出错: {str(e)}")

            record = {
                "signature": sms['短信签名'],
                "content": sms['短信内容'],
                "business_type": sms['产品类型'],
                "rule_score": rule_score,
                "rule_reason": sms['业务操作类型']
            }
            records.append(record)
        except Exception as e:
            print(f"准备AI审核数据时出错: {str(e)}")

    return records


# 4. 测试小批量审核
def test_batch_audit(df):
    """测试小批量审核"""    
    # 然后在下一个单元格中可以随机抽取部分放行的短信进行AI二次审核
    sample_size = 5  # 设置样本大小
    passed_df = df[df['总体操作类型'] == '放行']
    sample_df = passed_df.sample(min(sample_size, len(passed_df)))
    print(f"已随机抽取 {len(sample_df)} 条通过规则审核的短信用于AI二次审核")
    
    # 准备AI审核数据
    ai_audit_list = prepare_ai_data(sample_df)

    # 批量审核
    print("============= 开始第二轮：AI审核 =============")
    auditor = AIAuditor()
    start_time = time.time()
    results = auditor.batch_audit(ai_audit_list)
    elapsed = time.time() - start_time
    
    # 统计结果
    pass_count = sum(1 for result in results if result['passed'])
    reject_count = len(results) - pass_count
    
    print(f"审核完成: 通过 {pass_count} 条, 驳回 {reject_count} 条")
    print(f"总耗时: {elapsed:.2f}秒, 平均每条: {elapsed/len(results):.2f}秒")
    
    # 显示详细结果
    print("\n详细结果:")
    for i, result in enumerate(results):
        sms = result['sms']
        print(f"\n短信 {i+1}:")
        print(f"签名: {sms['signature']}")
        print(f"内容: {sms['content'][:50]}..." if len(sms['content']) > 50 else f"内容: {sms['content']}")
        print(f"业务类型: {sms['business_type']}")
        print(f"审核结果: {'通过' if result['passed'] else '驳回'}")
        if not result['passed'] and 'reasons' in result['details']:
            print(f"驳回原因: {', '.join(result['details']['reasons'])}")
    
    return results

# 运行测试
# test_single_audit()
# test_batch_audit(df_with_results)

In [34]:
# 3. 测试单条短信审核
from business import validate_business
import re

def test_single_audit():
    """测试单条短信审核"""
    # 测试一条短信
    test_sms = {
        "signature": "缪偲科技",
        "content": "【缪偲科技】您的验证码为：午夜激晴，韩，欧高清无吗 118.145.211.233 ,有效期为 n 分钟,请确保是本人操作,不要把验证码泄露给其他人",
        "business_type": "行业-通知",
        "account_type": None  
    }
    print("=============== 短信信息 =============")
    print(f"签名: {test_sms['signature']}")
    print(f"内容: {test_sms['content']}")
    print(f"业务类型: {test_sms['business_type']}")
    if test_sms.get("account_type"):
        print(f"账户类型: {test_sms['account_type']}")
    print("\n")

    print("=============== 第一轮：规则审核 =============")
    # 执行规则审核
    business_passed, business_reason = validate_business(
        test_sms["business_type"],
        test_sms["content"],
        test_sms["signature"],
        test_sms.get("account_type")
    )
    
    # 提取规则审核分数
    score = 100.0  # 默认分数
    try:
        score_match = re.search(r'总分: (\d+\.?\d*)', business_reason)
        if score_match:
            score = float(score_match.group(1))
        elif "直接通过" in business_reason:
            score = 100.0
    except Exception as e:
        print(f"提取分数时出错: {str(e)}")
    
    print(f"规则审核结果: {'通过' if business_passed else '失败'}")
    print(f"规则审核原因: {business_reason}")
    print(f"规则审核分数: {score}")
    
    # 如果规则审核通过，进行AI审核
    if business_passed:
        print("\n=============== 第二轮：AI审核 =============")
        auditor = AIAuditor()
        start_time = time.time()
        passed, details = auditor.audit_sms(
            test_sms["signature"],
            test_sms["content"],
            test_sms["business_type"],
            score,
            business_reason
        )
        elapsed = time.time() - start_time
        
        print(f"AI审核结果: {'通过' if passed else '驳回'}")
        print(f"AI审核详情: {json.dumps(details, ensure_ascii=False, indent=2)}")
        print(f"AI审核耗时: {elapsed:.2f}秒")
    else:
        print("\n规则审核未通过，不进行AI审核")

    # 输出最终结果
    print("\n=============== 最终审核结果 =============")
    if not business_passed:
        print("审核结果: 规则审核未通过")
        print(f"失败原因: {business_reason}")
    else:
        print(f"审核结果: {'通过' if passed else 'AI审核未通过'}")
        if not passed:
            print(f"失败原因: {', '.join(details.get('reasons', []))}")

# Run the single-SMS audit
test_single_audit()

签名: 缪偲科技
内容: 【缪偲科技】您的验证码为：午夜激晴，韩，欧高清无吗 118.145.211.233 ,有效期为 n 分钟,请确保是本人操作,不要把验证码泄露给其他人
业务类型: 行业-通知


规则审核结果: 失败
规则审核原因: 审核不通过 (总分: 55.73)
规则审核分数: 55.73

规则审核未通过，不进行AI审核

审核结果: 规则审核未通过
失败原因: 审核不通过 (总分: 55.73)


In [3]:
from business import BusinessValidator 


# Ad-hoc check of the validator's link detection on a sample SMS whose text
# embeds "[::1]:8080" and "127.0.0.1".
validator = BusinessValidator()
content = "【中国游协】CAAPA游乐展即将举办，[::1]:8080 或 127.0.0.1 查看监控，。600+企业参展，覆盖主题公园、景区、度假区、水乐园等文旅项目装备及服务，助您一站式升级新体验！同期举办低空、文商旅、沉浸、城市公园等13+专项论坛，快捷登记 拒收请回复R"
# NOTE(review): `_contains_link` is a private helper of BusinessValidator; its
# result is unpacked here as a (flag, count) pair — confirm against business.py.
has_link, link_count = validator._contains_link(content)
print(f"Contains link: {has_link}, Link count: {link_count}")  # Should detect the IP address

Contains link: True, Link count: 2
