In [None]:
import pandas as pd
from datetime import datetime

def match_and_format_data(part_a, part_b):
    """
    1、输入数据为 part_a、part_b，结构为：

    part_a={
        "date":["data1","data2","data3"],
        "numbers":[number1,number2,number3]
        }
    part_b={
        "date":["data1","data2","data3"],
        "numbers":[number2,number3，number4]
        }

    2、求两个数据的最佳匹配，匹配规则如下：
    - 输出为表格，有四列（part_a.date, part_a,numbers,part_b,date,part_b.numbers），各自按日期升序排列
    - 求 part_a、part_b 中 numbers 完全匹配的个数最多有多少，限制条件为 part_a.date >= part_b.date 且 part_a.date 和 part_b.date 为同一年
    - 匹配失败的按日期插入在匹配成功的前面或后面，要求即 part_a.date >= part_b.date 且 part_a.date 和 part_b.date 为同一年
    """
    # 处理part_a数据，转换为包含日期字符串、datetime对象、数字和是否使用标志的字典列表
    a_entries = []
    for date_str, num in zip(part_a["date"], part_a["numbers"]):
        dt = datetime.strptime(date_str, "%Y-%m-%d")
        a_entries.append({"date_str": date_str, "dt": dt, "num": num, "used": False})

    # 处理part_b数据
    b_entries = []
    for date_str, num in zip(part_b["date"], part_b["numbers"]):
        dt = datetime.strptime(date_str, "%Y-%m-%d")
        b_entries.append({"date_str": date_str, "dt": dt, "num": num, "used": False})

    used_pairs = []  # 存储匹配成功的对

    # 找出part_a和part_b中共有的数字
    a_nums = {entry["num"] for entry in a_entries}
    b_nums = {entry["num"] for entry in b_entries}
    common_nums = a_nums & b_nums

    # 对每个共有数字进行处理，寻找最佳匹配
    for num in common_nums:
        # 获取该数字在part_a中未使用的条目，按日期升序排列
        a_list = sorted([e for e in a_entries if e["num"] == num and not e["used"]], key=lambda x: x["dt"])
        # 获取该数字在part_b中未使用的条目，按日期升序排列
        b_list = sorted([e for e in b_entries if e["num"] == num and not e["used"]], key=lambda x: x["dt"])

        i = j = 0
        while i < len(a_list) and j < len(b_list):
            a_entry = a_list[i]
            b_entry = b_list[j]
            if a_entry["dt"] >= b_entry["dt"]:
                # 标记为已使用，并记录配对
                a_entry["used"] = True
                b_entry["used"] = True
                used_pairs.append((a_entry, b_entry))
                i += 1
                j += 1
            else:
                i += 1

    # 处理匹配的行，按part_a的日期排序
    matched_rows = []
    for a_entry, b_entry in used_pairs:
        matched_rows.append({"a_date_str": a_entry["date_str"], "a_dt": a_entry["dt"], "b_date_str": b_entry["date_str"], "b_dt": b_entry["dt"], "num": a_entry["num"]})
    # 按part_a的日期升序排序
    matched_rows_sorted = sorted(matched_rows, key=lambda x: x["a_dt"])
    matched_rows_final = [{"part_a.date": row["a_date_str"], "part_a.numbers": row["num"], "part_b.date": row["b_date_str"], "part_b.numbers": row["num"], "sort_date": row["a_dt"]} for row in matched_rows_sorted]  # 添加排序字段

    # 处理未匹配的part_a条目，按日期升序
    unmatched_a = [e for e in a_entries if not e["used"]]
    unmatched_a_sorted = sorted(unmatched_a, key=lambda x: x["dt"])
    unmatched_a_rows = [{"part_a.date": e["date_str"], "part_a.numbers": e["num"], "part_b.date": None, "part_b.numbers": None, "sort_date": e["dt"]} for e in unmatched_a_sorted]  # 添加排序字段

    # 处理未匹配的part_b条目，按日期升序
    unmatched_b = [e for e in b_entries if not e["used"]]
    unmatched_b_sorted = sorted(unmatched_b, key=lambda x: x["dt"])
    unmatched_b_rows = [{"part_a.date": None, "part_a.numbers": None, "part_b.date": e["date_str"], "part_b.numbers": e["num"], "sort_date": e["dt"]} for e in unmatched_b_sorted]  # 添加排序字段

    # 合并所有行并按排序字段升序排列
    final_rows = unmatched_a_rows + matched_rows_final + unmatched_b_rows
    final_rows_sorted = sorted(final_rows, key=lambda x: x["sort_date"])

    # 转换为DataFrame，并移除排序字段
    df = pd.DataFrame(final_rows_sorted, columns=["part_a.date", "part_a.numbers", "part_b.date", "part_b.numbers"])

    return df


# 示例数据
part_a={
    "date":["2023-11-21","2023-11-30","2023-12-11","2023-12-25","2024-01-06","2024-01-25","2024-03-06","2024-03-21","2024-04-25","2024-04-30","2024-05-07","2024-05-16","2024-05-28","2024-06-05","2024-06-08","2024-07-12","2024-07-23","2024-07-31","2024-08-12","2024-08-15","2024-08-17","2024-08-20","2024-08-31","2024-09-08","2024-09-10","2024-09-25","2024-09-26","2024-09-30","2024-10-07","2024-10-09","2024-10-11","2024-10-30","2024-11-07","2024-11-14","2024-11-19","2024-12-06","2024-12-14","2024-12-29","2024-12-31"],
    "numbers":[250,150,100,90,390,156,80,174,70,100,100,65,115,100,160,70,50,55,35,30,40,50,25,65,100,45,55,20,20,100,80,70,50,60,150,70,76,50,76]
    }
part_b={
    "date":["2023-11-18","2023-11-28","2023-11-29","2023-11-30","2023-12-5","2023-12-9","2023-12-13","2023-12-23","2023-12-27","2024-1-4","2024-1-5","2024-1-8","2024-1-9","2024-1-10","2024-1-12","2024-1-17","2024-1-20","2024-3-5","2024-3-6","2024-3-8","2024-3-19","2024-3-21","2024-3-22","2024-4-2","2024-4-22","2024-4-23","2024-4-28","2024-5-7","2024-5-18","2024-5-27","2024-6-1","2024-6-5","2024-6-6","2024-6-12","2024-7-6","2024-7-10","2024-7-20","2024-8-1","2024-8-10","2024-8-15","2024-8-16","2024-8-30","2024-9-9","2024-9-11","2024-9-25","2024-9-29","2024-10-6","2024-10-9","2024-10-28","2024-11-6","2024-11-13","2024-11-18","2024-12-5","2024-12-12","2024-12-28","2024-12-29"],
    "numbers":[1350,300,150,200,150,100,100,90,100,200,994,200,100,100,100,240,306,280,140,136,150,24,65,214,145,70,310,295,65,115,90,100,180,160,20,70,50,55,35,40,80,25,65,100,100,20,120,80,70,50,60,150,70,76,50,76]
    }

result = match_and_format_data(part_a, part_b)
print(result)

   part_a.date  part_a.numbers part_b.date  part_b.numbers
0         None             NaN  2023-11-18          1350.0
1   2023-11-21           250.0        None             NaN
2         None             NaN  2023-11-28           300.0
3   2023-11-30           150.0  2023-11-29           150.0
4         None             NaN  2023-11-30           200.0
..         ...             ...         ...             ...
61  2024-11-19           150.0   2023-12-5           150.0
62  2024-12-06            70.0   2024-12-5            70.0
63  2024-12-14            76.0  2024-12-12            76.0
64  2024-12-29            50.0  2024-12-28            50.0
65  2024-12-31            76.0  2024-12-29            76.0

[66 rows x 4 columns]


In [None]:
part_a={
    "date":["2024-11-13","2024-11-22","2024-12-06","2024-12-07","2024-12-20","2024-12-22","2024-12-29","2024-12-31"],
    "numbers":[797,453,150,250,210,130,60,120]
    }
part_b={
    "date":["2024-10-31","2024-11-20","2024-11-21","2024-12-4","2024-12-6","2024-12-19","2024-12-21","2024-12-28","2024-12-29",],
    "numbers":[800,450,3,150,250,210,130,60,120]
    }

result = match_and_format_data(part_a, part_b)
print(result)

In [None]:
part_a={
    "date":["2024-01-27","2024-11-13","2024-11-29","2024-12-06","2024-12-07","2024-12-21","2024-12-22","2024-12-26"],
    "numbers":[300,650,70,100,100,20,250,119]
    }
part_b={
    "date":["2024-1-23","2024-11-11","2024-11-28","2024-12-4","2024-12-20","2024-12-24","2024-12-29"],
    "numbers":[1600,650,70,200,270,120,1]
    }

result = match_and_format_data(part_a, part_b)
print(result)

In [None]:
part_a={
    "date":["2024-01-27","2024-11-13","2024-12-01","2024-12-04","2024-12-10","2024-12-26"],
    "numbers":[300,650,70,60,310,90]
    }
part_b={
    "date":["2024-1-23","2024-11-11","2024-11-29","2024-12-9","2024-12-25"],
    "numbers":[1600,650,130,310,90]
    }

result = match_and_format_data(part_a, part_b)
print(result)

In [None]:
part_a={
    "date":["2024-04-04","2024-04-21","2024-04-22","2024-05-16","2024-06-06","2024-07-12","2024-07-26","2024-08-02","2024-08-03","2024-08-08","2024-08-17","2024-08-20","2024-09-04","2024-09-06","2024-09-21","2024-09-26","2024-10-12","2024-10-20","2024-10-27","2024-10-30","2024-11-08","2024-11-12","2024-11-19"],
    "numbers":[60,15,15,40,50,25,20,8,20,12,10,20,20,20,40,10,9,21,20,20,30,20,30]
    }
part_b={
    "date":["2024-4-2","2024-4-17","2024-4-19","2024-4-23","2024-4-28","2024-5-14","2024-5-16","2024-5-17","2024-5-20","2024-6-6","2024-6-11","2024-7-6","2024-7-10","2024-7-25","2024-8-1","2024-8-2","2024-8-3","2024-8-6","2024-9-21","2024-9-25"],
    "numbers":[280,90,30,20,27,60,20,3,60,120,50,10,25,20,10,8,10,12,40,10]
    }

result = match_and_format_data(part_a, part_b)
print(result)

In [None]:
part_a={
    "date":["2023-11-21","2023-11-30","2023-12-11","2023-12-25","2024-01-06","2024-01-25","2024-03-06","2024-03-21","2024-04-25","2024-04-30","2024-05-07","2024-05-16","2024-05-28","2024-06-05","2024-06-08","2024-07-12","2024-07-23","2024-07-31","2024-08-12","2024-08-15","2024-08-17","2024-08-20","2024-08-31","2024-09-08","2024-09-10","2024-09-25","2024-09-26","2024-09-30","2024-10-07","2024-10-09","2024-10-11","2024-10-30","2024-11-07","2024-11-14","2024-11-19","2024-12-06","2024-12-14","2024-12-29","2024-12-31"],
    "numbers":[250,150,100,90,390,156,80,174,70,100,100,65,115,100,160,70,50,55,35,30,40,50,25,65,100,45,55,20,20,100,80,70,50,60,150,70,76,50,76]
    }
part_b={
    "date":["2023-11-18","2023-11-28","2023-11-29","2023-11-30","2023-12-5","2023-12-9","2023-12-13","2023-12-23","2023-12-27","2024-1-4","2024-1-5","2024-1-8","2024-1-9","2024-1-10","2024-1-12","2024-1-17","2024-1-20","2024-3-5","2024-3-6","2024-3-8","2024-3-19","2024-3-21","2024-3-22","2024-4-2","2024-4-22","2024-4-23","2024-4-28","2024-5-7","2024-5-18","2024-5-27","2024-6-1","2024-6-5","2024-6-6","2024-6-12","2024-7-6","2024-7-10","2024-7-20","2024-8-1","2024-8-10","2024-8-15","2024-8-16","2024-8-30","2024-9-9","2024-9-11","2024-9-25","2024-9-29","2024-10-6","2024-10-9","2024-10-28","2024-11-6","2024-11-13","2024-11-18","2024-12-5","2024-12-12","2024-12-28","2024-12-29"],
    "numbers":[1350,300,150,200,150,100,100,90,100,200,994,200,100,100,100,240,306,280,140,136,150,24,65,214,145,70,310,295,65,115,90,100,180,160,20,70,50,55,35,40,80,25,65,100,100,20,120,80,70,50,60,150,70,76,50,76]
    }

result = match_and_format_data(part_a, part_b)
print(result)

In [2]:
from IPython.display import display, HTML

display(HTML(result.to_html()))

Unnamed: 0,part_a.date,part_a.numbers,part_b.date,part_b.numbers
0,,,2023-11-18,1350.0
1,2023-11-21,250.0,,
2,,,2023-11-28,300.0
3,2023-11-30,150.0,2023-11-29,150.0
4,,,2023-11-30,200.0
5,2023-12-11,100.0,2023-12-9,100.0
6,2023-12-25,90.0,2023-12-23,90.0
7,,,2024-1-4,200.0
8,,,2024-1-5,994.0
9,2024-01-06,390.0,,
