In [2]:
import pandas as pd
import csv
import io
import re
from typing import Dict, List

class SimpleTSVValidator:
    """Check the keys logged for each TSV row against the expected keys.

    Each input row carries an expected key list (``keys-정답``), the logged
    key list (``keys_combined``) and the logged values (``values_combined``),
    all comma separated.  ``load_tsv_file`` turns rows into test cases and
    ``validate_and_export`` writes one result row per checked key to Excel.

    Attributes:
        test_cases: Test-case dicts accumulated by ``load_tsv_file``.
        exclude_keys: Keys ignored on both the expected and the actual side.
    """

    # Keys reported as PASS regardless of whether they were expected/logged.
    ALWAYS_PASS_KEYS = ('channel', 'page_url')

    # Input columns copied unchanged into every result row.
    _CONTEXT_COLUMNS = ('기능', '경로', '활동', 'page_id', 'act_type', 'click_type')

    # Column order of the exported sheet (also used when there are no rows).
    _RESULT_COLUMNS = ('고유번호',) + _CONTEXT_COLUMNS + ('key', 'value', 'pass')

    def __init__(self, exclude_keys=None):
        """Create a validator.

        Args:
            exclude_keys: Keys to ignore during comparison.  Defaults to
                ``['click_type']`` when ``None``.
        """
        self.test_cases = []
        self.exclude_keys = exclude_keys if exclude_keys is not None else ['click_type']

    @staticmethod
    def _safe_str(value) -> str:
        """Return *value* as a stripped string; missing / 'nan' becomes ''."""
        if pd.isna(value) or str(value).lower() == 'nan':
            return ''
        return str(value).strip()

    @classmethod
    def _cell(cls, row, column: str) -> str:
        """Return the cleaned text of one cell (trailing commas removed)."""
        raw = row[column]
        if pd.isna(raw):
            return ''
        return cls._safe_str(str(raw).rstrip(',').strip())

    @staticmethod
    def _split_keys(keys_str: str) -> List[str]:
        """Split a comma separated key list, dropping blanks and 'nan'."""
        stripped = (part.strip() for part in keys_str.split(','))
        return [key for key in stripped if key and key.lower() != 'nan']

    def load_tsv_file(self, file_path: str):
        """Read a tab separated file and append one test case per row.

        Args:
            file_path: Path of the UTF-8 encoded TSV file.  Its header must
                contain the context columns plus ``keys-정답``,
                ``keys_combined`` and ``values_combined``.

        Raises:
            Exception: Any error raised while reading or parsing is printed
                and then re-raised unchanged (e.g. ``KeyError`` for a
                missing column).
        """
        try:
            # dtype=str keeps cell text exactly as written (no '001' -> 1 or
            # 1 -> '1.0' coercion when a column also has blanks).
            df = pd.read_csv(file_path, sep='\t', encoding='utf-8', header=0, dtype=str)
            df.columns = df.columns.str.rstrip(',').str.strip()

            for row_number, (_, row) in enumerate(df.iterrows(), start=1):
                expected_keys = self._split_keys(self._cell(row, 'keys-정답'))
                actual_keys = self._split_keys(self._cell(row, 'keys_combined'))

                # Drop the excluded keys from the logged side.
                actual_keys_filtered = [k for k in actual_keys if k not in self.exclude_keys]

                # Values are aligned with the unfiltered key list.
                values = self._parse_values_by_key_count(
                    self._cell(row, 'values_combined'), len(actual_keys)
                )

                test_case = {'unique_id': row_number}
                for column in self._CONTEXT_COLUMNS:
                    test_case[column] = self._cell(row, column)
                test_case['expected_keys'] = expected_keys
                test_case['actual_keys'] = actual_keys_filtered
                test_case['actual_keys_original'] = actual_keys
                test_case['values'] = values
                self.test_cases.append(test_case)

        except Exception as e:
            print(f"❌ 오류: {e}")
            raise

    def _parse_values_by_key_count(self, values_str: str, expected_count: int):
        """Split *values_str* into exactly *expected_count* values.

        Args:
            values_str: Comma separated values.
            expected_count: Number of keys the values must line up with.

        Returns:
            A list of ``expected_count`` strings, or ``[]`` when the input is
            empty or ``expected_count`` is not positive.
        """
        if not values_str or expected_count <= 0:
            return []

        raw_parts = [part.strip() for part in values_str.split(',')]

        # When the raw field count already matches, positions are
        # unambiguous: keep empty fields so later values stay on their key.
        if len(raw_parts) == expected_count:
            return raw_parts

        parts = [part for part in raw_parts if part]
        shortfall = expected_count - len(parts)
        if shortfall >= 0:
            # Too few values: pad the tail with empty strings.
            return parts + [''] * shortfall

        # Too many values: assume the first value contained commas and merge
        # the surplus into it (the trailing values are usually the safe ones).
        excess = -shortfall
        return [', '.join(parts[:excess + 1])] + parts[excess + 1:]

    def _result_row(self, test_case: Dict, key: str, value: str, status: str) -> Dict:
        """Build one result row for *key* with the test case's context."""
        row = {'고유번호': test_case['unique_id']}
        for column in self._CONTEXT_COLUMNS:
            row[column] = self._safe_str(test_case[column])
        row['key'] = key
        row['value'] = value
        row['pass'] = status
        return row

    def _collect_results(self) -> List[Dict]:
        """Compare expected and logged keys of every loaded test case."""
        results = []

        for test_case in self.test_cases:
            actual_keys = test_case['actual_keys']  # already filtered
            expected_keys = [
                k for k in test_case['expected_keys'] if k not in self.exclude_keys
            ]

            if not actual_keys:
                # Nothing was logged: a failure only if something was expected.
                if expected_keys:
                    results.append(
                        self._result_row(test_case, 'LOG_MISSING', '로그 누락', 'FAIL')
                    )
                continue

            # Map every original (unfiltered) key to the value at its index.
            values = test_case['values']
            key_value_map = {}
            for i, key in enumerate(test_case['actual_keys_original']):
                key_value_map[key] = self._safe_str(values[i]) if i < len(values) else ''

            actual_keys_set = set(actual_keys)
            expected_keys_set = set(expected_keys)

            for key in expected_keys:
                value = key_value_map.get(key, 'MISSING')
                found = key in actual_keys_set or key in self.ALWAYS_PASS_KEYS
                results.append(
                    self._result_row(
                        test_case, key, self._safe_str(value), 'PASS' if found else 'FAIL'
                    )
                )

            # Walk the logged keys in their original order (deduplicated) so
            # the exported rows are deterministic across runs.
            for key in dict.fromkeys(actual_keys):
                if key in expected_keys_set:
                    continue
                status = 'PASS' if key in self.ALWAYS_PASS_KEYS else 'UNEXPECTED'
                value = key_value_map.get(key, '')
                results.append(
                    self._result_row(test_case, key, self._safe_str(value), status)
                )

        return results

    def validate_and_export(self, output_file: str):
        """Validate all loaded test cases and write the result to Excel.

        Args:
            output_file: Destination ``.xlsx`` path.  Missing parent
                directories are created.

        Returns:
            The list of result-row dicts that was written.
        """
        import os

        results = self._collect_results()

        # Explicit columns keep the header even when there are no rows.
        df_results = pd.DataFrame(results, columns=list(self._RESULT_COLUMNS))

        if isinstance(output_file, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(output_file))
            if parent:
                os.makedirs(parent, exist_ok=True)
        df_results.to_excel(output_file, index=False, engine='openpyxl')

        total = len(results)
        passed = sum(1 for row in results if row['pass'] == 'PASS')
        # Guard against division by zero when nothing was validated.
        pass_rate = passed / total * 100 if total else 0.0
        print(f"검증 완료: 전체 {total}개, 통과 {passed}개 ({pass_rate:.1f}%)")
        print(f"제외된 키들: {self.exclude_keys}")

        return results

# 사용 예시
def run_validator(input_file="tester.tsv", output_file="./result/qa_result.xlsx",
                  exclude_keys=None):
    """Load a TSV sheet, validate it and export the result workbook.

    Args:
        input_file: Path of the TSV file to validate.
        output_file: Path of the Excel file to write.
        exclude_keys: Keys ignored during comparison.  Defaults to
            ``['os_name', 'page_id']`` when ``None``.
    """
    if exclude_keys is None:
        # Default exclusion list used by this script.
        exclude_keys = ['os_name', 'page_id']
    validator = SimpleTSVValidator(exclude_keys=exclude_keys)
    validator.load_tsv_file(input_file)
    validator.validate_and_export(output_file)

# Invoke run_validator() only when this module is executed directly rather
# than imported.
if __name__ == "__main__":
    run_validator()

검증 완료: 전체 207개, 통과 68개 (32.9%)
제외된 키들: ['os_name', 'page_id']
