In [1]:
from pathlib import Path

import pandas as pd
 
# Directory that holds the four input workbooks. Defaults to the current
# working directory (the original behaviour); point it elsewhere when the
# notebook is launched from a different folder instead of editing every path.
DATA_DIR = Path(".")

# Parsed responses, one workbook per product.
inboedel_parsed = pd.read_excel(DATA_DIR / "inboedel_responses_parsed.xlsx")
reis_parsed = pd.read_excel(DATA_DIR / "reis_responses_parsed.xlsx")

# Enriched ground truth, one workbook per product.
inboedel_gt = pd.read_excel(DATA_DIR / "groundtruth_inboedel_enriched.xlsx")
reis_gt = pd.read_excel(DATA_DIR / "groundtruth_reis_enriched.xlsx")

FileNotFoundError: [Errno 2] No such file or directory: 'inboedel_responses_parsed.xlsx'

In [None]:
# Preprocess files

# Map the raw source labels of the reis ground truth onto short names.
# Keys are written in lower case because the column is trimmed and
# lower-cased before the lookup.
source_aliases = {
    "a.s.r_vp_dr_2024": "asr_2024",
    "ik_kies_zelf_(dr_2018)": "asr_ikz_2018",
}
reis_gt["source"] = (
    reis_gt["source"]
    .str.strip()
    .str.lower()
    .replace(source_aliases)
)

# Give reis_parsed an empty polis_versie column when it has none.
if "polis_versie" not in reis_parsed:
    reis_parsed["polis_versie"] = ""

 

In [3]:
# Stack both products into one frame per side: parsed responses and ground truth.
parsed_all = pd.concat((inboedel_parsed, reis_parsed), ignore_index=True)
gt_all = pd.concat((inboedel_gt, reis_gt), ignore_index=True)

# Trim and lower-case the product label on both frames.
for frame in (parsed_all, gt_all):
    frame["product"] = frame["product"].str.strip().str.lower()

In [4]:
# Separate AEGON and ASR GT, based on the "source" label.
# na=False: a row with a missing source belongs to neither insurer. Without it
# str.contains yields NaN for that row and boolean indexing raises.
gt_source = gt_all["source"]
is_aegon = gt_source.str.contains("aegon", case=False, na=False)
# regex=False: match the literal text "a.s.r". Interpreted as a regular
# expression, each "." would match any character.
is_asr = (
    gt_source.str.contains("a.s.r", case=False, regex=False, na=False)
    | gt_source.str.contains("asr_", case=False, na=False)
)
aegon_gt = gt_all[is_aegon].copy()
asr_gt = gt_all[is_asr].copy()

# Drop duplicates. The subsets differ per insurer: polis_versie for AEGON,
# type_klant for ASR.
aegon_gt = aegon_gt.drop_duplicates(subset=["vraag", "product", "dekking", "polis_versie"])
asr_gt = asr_gt.drop_duplicates(subset=["vraag", "product", "dekking", "type_klant"])

# Split parsed by product (rows with a missing product go to neither frame).
inboedel_df = parsed_all[parsed_all["product"].str.contains("inboedel", na=False)].copy()
reis_df = parsed_all[parsed_all["product"].str.contains("reis", na=False)].copy()

 

In [5]:
# AEGON merge

# Names given to the ground-truth columns once they are attached.
_AEGON_RENAMES = {
    "gold_article_id": "aegon_gold_article_id",
    "gold_answer": "aegon_gold_answer",
    "source": "source_gt_aegon",
}


def _attach_aegon_gt(parsed, gt, key_columns):
    """Left-join AEGON ground truth onto parsed rows.

    Args:
        parsed: frame with a "question" column plus every column in key_columns.
        gt: ground-truth frame with "vraag", every column in key_columns,
            "gold_article_id", "gold_answer" and "source".
        key_columns: columns that carry the same name on both sides and are
            part of the join key, next to question/vraag.

    Returns:
        A new frame with every row of `parsed` (how="left") and the
        ground-truth columns renamed per _AEGON_RENAMES. The ground-truth
        "vraag" column is kept in the result.
    """
    gt_columns = ["vraag", *key_columns, "gold_article_id", "gold_answer", "source"]
    return parsed.merge(
        gt[gt_columns],
        how="left",
        left_on=["question", *key_columns],
        right_on=["vraag", *key_columns],
    ).rename(columns=_AEGON_RENAMES)


# Inboedel rows are matched on polis_versie as well; reis rows are not.
inboedel_merged = _attach_aegon_gt(
    inboedel_df, aegon_gt, ["product", "dekking", "polis_versie"]
)
# NOTE(review): this join leaves polis_versie out of the key. If the AEGON
# ground truth holds several reis rows per (vraag, product, dekking), each
# matching parsed row is repeated once per hit - confirm that cannot occur.
reis_merged = _attach_aegon_gt(reis_df, aegon_gt, ["product", "dekking"])

# Combine inboedel and reis back
combined = pd.concat([inboedel_merged, reis_merged], ignore_index=True)

In [6]:
# ASR merge (same logic for both products)
# Columns that carry the same name on both sides of the join.
asr_keys = ["product", "dekking", "type_klant"]
asr_lookup = asr_gt[["vraag", *asr_keys, "gold_article_id", "gold_answer", "source"]]

# Left join: every row of `combined` is kept. `combined` is overwritten with
# the result, so running this cell twice in a row joins a second time.
combined = pd.merge(
    combined,
    asr_lookup,
    how="left",
    left_on=["question", *asr_keys],
    right_on=["vraag", *asr_keys],
)
combined = combined.rename(
    columns={
        "gold_article_id": "asr_gold_article_id",
        "gold_answer": "asr_gold_answer",
        "source": "source_gt_asr",
    }
)

In [7]:
# Clean up: remove the suffixed copies of the ground-truth question column.
# The drop reassigns instead of mutating in place and tolerates columns that
# are already gone, so re-running this cell no longer raises KeyError.
combined = combined.drop(columns=["vraag_x", "vraag_y"], errors="ignore")

# Save
output_path = "merged_all.xlsx"
combined.to_excel(output_path, index=False)
output_path

'merged_all.xlsx'