# 本部分代码用于处理完整的数据集的数据以提取出符合要求的数据

In [26]:
import pandas as pd
import json
import os
from itertools import product
import numpy as np
import random
# Seed both random number generators used in this notebook so that sampling
# and random image pairing are reproducible.
random_seed = 43
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(random_seed)

### 加载数据

In [None]:
# Input locations (placeholders — point these at the local MIMIC-CXR files).
split_data = r'./your_path/mimic-cxr-2.0.0-split.csv'
meta = r'./your_path/mimic-cxr-2.0.0-metadata.csv'
txt = r'./your_path/secction_csv/mimic_cxr_all_sectioned.csv'
# Load the CSV files
split_df = pd.read_csv(split_data)  # provides dicom_id, study_id, subject_id, split
sectioned_df = pd.read_csv(txt)  # provides study plus the report sections (findings, indication, ...)
metadata_df = pd.read_csv(meta)  # provides dicom_id, subject_id, study_id, ViewPosition, StudyDate, StudyTime, ...
# Drop the first character of each study value (the leading 's') and cast to int
# so the column can be joined against the integer study_id of the metadata table.
sectioned_df['study'] = sectioned_df['study'].str[1:].astype(int)  # strip the leading 's'
# sectioned_df.head()

### 合并metadata和sectioned数据

In [23]:
# Left-join the report sections onto the per-image metadata (study_id == study),
# so every image row carries the text of its study; images without a sectioned
# report keep NaN in the text columns.
merged_df = pd.merge(metadata_df, sectioned_df[['study', 'impression','findings', 'indication', 'technique','comparison' ]], how='left', left_on='study_id', right_on='study')
# merged_df.head()

### 合并split数据

In [24]:
# Left-join the official split assignment onto each image row, matching on
# the (dicom_id, study_id, subject_id) triple.
final_df = pd.merge(merged_df, split_df[['dicom_id', 'study_id', 'subject_id', 'split']], on=['dicom_id', 'study_id', 'subject_id'], how='left')
# final_df.head()

### 对数据进行进一步的处理，排序，格式变换，选取图像等操作

In [25]:
# Order the image rows by patient, then chronologically by study date and time.
final_df = final_df.sort_values(["subject_id", "StudyDate","StudyTime"])
# For every row, store the study_id of the preceding row of the same patient.
# NOTE: the shift is row-wise (one row per image), not study-wise. process_study
# only reads the value on the first row of each study group, which is the
# study_id of the row sorted immediately before that study (NaN for a patient's
# first row). NOTE(review): this presumably assumes that all images of a study
# sort contiguously — confirm against the StudyTime consistency checks below.
final_df["prior_study_id"] = final_df.groupby("subject_id")["study_id"].shift(1)

In [26]:
# final_df[final_df["subject_id"] == 10002428]

In [None]:
# Keep only the columns needed downstream. The explicit .copy() yields an
# independent frame, so the column assignment below does not write into a
# slice of the merged table (avoids pandas' SettingWithCopy behaviour).
final_df = final_df[['dicom_id', 'subject_id', 'study_id', "StudyDate", 'StudyTime', 'split','ViewPosition','findings','indication','comparison','technique','prior_study_id', 'ViewCodeSequence_CodeMeaning']].copy()
# Replace dicom_id by the relative image path
#   p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
# Built column-wise with zip instead of a row-wise DataFrame.apply; the
# resulting strings are the same.
final_df['dicom_id'] = [
    os.path.join(f"p{str(subject)[:2]}", f"p{subject}", f"s{study}", f"{dicom}.jpg")
    for subject, study, dicom in zip(final_df['subject_id'], final_df['study_id'], final_df['dicom_id'])
]
# Normalise missing values: None -> NaN, then any cell that consists solely of
# underscores, dots, commas and spaces -> NaN.
final_df = final_df.replace({None: np.nan})
final_df = final_df.replace(r'^[_., ]+$', np.nan, regex=True)
# The result of this replacement is stored in a separate variable that is not
# used afterwards, i.e. cells containing only the word "None" are deliberately
# left unchanged in final_df.
final_final_df = final_df.replace(r'^[_.\s]*(None)[_.\s]*$', np.nan, regex=True)
# final_df.head()

In [12]:
# Values that mark an image as a frontal or a lateral view. An image counts as
# frontal/lateral when either its ViewCodeSequence_CodeMeaning is in the
# *_codes set or its ViewPosition is in the *_positions set (see process_study).
frontal_codes = {"antero-posterior", "postero-anterior"}
frontal_positions = {"AP", "PA", "AP AXIAL", "PA LLD", "PA RLD","AP LLD"}

lateral_codes = {"lateral", "left lateral"}
lateral_positions = {"LATERAL", "LL", "XTABLE LATERAL"}

In [None]:
def _select_view_images(frame, codes, positions):
    """Return the dicom_id values of rows whose view code or view position matches."""
    matches = frame["ViewCodeSequence_CodeMeaning"].isin(codes) | frame["ViewPosition"].isin(positions)
    return frame.loc[matches, "dicom_id"].tolist()


def _compose_prior_report(prior_study):
    """Join the non-missing report sections of the prior study into one string (or None)."""
    sections = []
    for field in ["indication", "technique", "comparison", "findings"]:
        value = prior_study[field].iloc[0]
        # Test for a missing value BEFORE concatenating: NaN + " " raises
        # TypeError, and after a successful concatenation the value could
        # never be null any more.
        if pd.notna(value):
            # The trailing space keeps the sections separated once newlines
            # are stripped from the report later on.
            sections.append(f"{field.upper()}: {value} ")
    return "\n\n".join(sections) if sections else None


def process_study(group):
    """Build the candidate sample rows for one study.

    Intended to be used with ``groupby("study_id").apply``: ``group`` holds the
    image rows of a single study and ``group.name`` is its study_id.

    Relies on module-level state: ``final_df`` (the full per-image table, used
    to look up the prior study), ``frontal_codes`` / ``frontal_positions`` and
    ``lateral_codes`` / ``lateral_positions``.

    For every frontal image of the study one lateral image of the same study is
    drawn with ``random.choice`` (``None`` when the study has no lateral image).
    Each frontal image is then combined with every frontal image of the prior
    study; when there is no prior study, or it has no frontal image, a single
    row with ``Prior_frontal_dicom_id`` and ``prior_report`` set to ``None`` is
    produced instead.

    Returns:
        A DataFrame with the columns Current_frontal_dicom_id,
        Current_lateral_dicom_id, Prior_frontal_dicom_id, subject_id, study_id,
        prior_report, findings, indication, comparison, technique and split;
        an empty DataFrame when the study has no frontal image.
    """
    current_study_id = group.name
    subject_id = group["subject_id"].iloc[0]

    # Frontal / lateral images of the current study
    current_frontal = _select_view_images(group, frontal_codes, frontal_positions)
    current_lateral = _select_view_images(group, lateral_codes, lateral_positions)

    # A frontal image is mandatory
    if not current_frontal:
        return pd.DataFrame()  # skip studies without one

    # Randomly assign one lateral image to each frontal image
    lateral_mapping = {
        frontal: random.choice(current_lateral) if current_lateral else None
        for frontal in current_frontal
    }

    # Prior study: frontal images plus the combined report text.
    # Only the first row's prior_study_id is consulted.
    prior_study_id = group["prior_study_id"].iloc[0]
    prior_frontals = []
    prior_report = None
    if pd.notna(prior_study_id):
        # Full-table scan per study; final_df here is the module-level
        # per-image table, not the result being built.
        prior_study = final_df[final_df["study_id"] == prior_study_id]
        if not prior_study.empty:
            prior_frontals = _select_view_images(prior_study, frontal_codes, frontal_positions)
            prior_report = _compose_prior_report(prior_study)
            if prior_report:
                print("报告非空")
            else:
                print("报告为空")

    # The prior report is only attached to rows that also have a prior frontal
    # image; without one the row carries None for both fields.
    prior_pairs = [(image, prior_report) for image in prior_frontals] or [(None, None)]

    # ======================== build the combinations ========================
    rows = []
    for frontal in current_frontal:
        for prior_image, report in prior_pairs:
            # Key order defines the column order of the resulting frame.
            rows.append({
                "Current_frontal_dicom_id": frontal,
                "Current_lateral_dicom_id": lateral_mapping[frontal],
                "Prior_frontal_dicom_id": prior_image,
                "subject_id": subject_id,
                "study_id": current_study_id,
                "prior_report": report,
                "findings": group["findings"].iloc[0],
                "indication": group["indication"].iloc[0],
                "comparison": group["comparison"].iloc[0],
                "technique": group["technique"].iloc[0],
                "split": group["split"].iloc[0]
            })

    return pd.DataFrame(rows)

# Build the sample table study by study: process_study is applied to the image
# rows of each study_id (sort=False keeps the groups in order of first
# appearance; group_keys=False keeps study_id out of the result index).
final_df = final_df.groupby("study_id",sort=False, group_keys=False).apply(process_study)
# Keep only samples that have a findings text and a current frontal image.
final_df = final_df[
    final_df["findings"].notna() & 
    final_df["Current_frontal_dicom_id"].notna()
]

### 修改格式

In [17]:
# Normalise whitespace in every free-text column: remove line feeds and
# carriage returns, then replace each pair of spaces by a single space.
# Both removals must go through the .str accessor: a plain Series.replace
# only matches cells whose ENTIRE value equals '\r', so it would leave
# carriage returns inside the text untouched.
for text_col in ['findings', 'indication', 'prior_report', 'comparison', 'technique']:
    cleaned = final_df[text_col].str.replace('\n', '', regex=False)
    cleaned = cleaned.str.replace('\r', '', regex=False)
    final_df[text_col] = cleaned.str.replace("  ", " ", regex=False)

In [None]:
print(len(final_df))
final_df.head(20)

### 保证Prior_report以及prior_frontal有效

### 将时间信息重新添加

In [None]:
print(len(metadata_df))

In [20]:
# Rewrite dicom_id in the metadata table to the relative image path
#   p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
# so it can be joined against the *_dicom_id path columns of final_df.
# Running this cell twice would wrap the already-converted paths again.
metadata_df['dicom_id'] = [
    os.path.join(f"p{str(subject)[:2]}", f"p{subject}", f"s{study}", f"{dicom}.jpg")
    for subject, study, dicom in zip(
        metadata_df['subject_id'], metadata_df['study_id'], metadata_df['dicom_id']
    )
]
# metadata_df.head()

In [22]:
# Look up the acquisition date/time of the current frontal image and expose
# them as C_Date / C_Time; the helper dicom_id column from the join is dropped
# by the final column selection.
final_df = (
    final_df.merge(
        metadata_df[['StudyDate', 'StudyTime','dicom_id']],
        how='left',
        left_on='Current_frontal_dicom_id',
        right_on='dicom_id',
    )
    .rename(columns={'StudyDate': 'C_Date', 'StudyTime': 'C_Time'})[
        ['Current_frontal_dicom_id','Current_lateral_dicom_id','Prior_frontal_dicom_id', 'subject_id', 'study_id', 'prior_report' ,'findings','indication','comparison','technique','C_Date','C_Time','split']
    ]
)

In [None]:
final_df.head()
len(final_df)

In [None]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)

In [24]:
# Look up the acquisition date/time of the prior frontal image and expose them
# as P_Date / P_Time (left join, so rows without a matching prior image get
# NaN); the helper dicom_id column is dropped by the final column selection.
final_df = (
    final_df.merge(
        metadata_df[['StudyDate', 'StudyTime','dicom_id']],
        how='left',
        left_on='Prior_frontal_dicom_id',
        right_on='dicom_id',
    )
    .rename(columns={'StudyDate': 'P_Date', 'StudyTime': 'P_Time'})[
        ['Current_frontal_dicom_id','Current_lateral_dicom_id','Prior_frontal_dicom_id', 'subject_id', 'study_id', 'prior_report' ,'findings','indication','comparison','technique','C_Date','C_Time','P_Date','P_Time','split']
    ]
)

In [28]:
# Prefix every study id with 's'. The guard makes the cell idempotent:
# re-running it no longer stacks additional 's' characters onto ids that
# were already prefixed.
final_df['study_id'] = final_df['study_id'].apply(
    lambda x: str(x) if str(x).startswith('s') else 's' + str(x)
)

# Partition by the split column. Only the train and validate subsets are
# materialised here; rows with any other split value are not exported.
train_df = final_df[final_df['split'] == 'train']
val_df = final_df[final_df['split'] == 'validate']


# 输出数据框的前几行
# print(train_df.head())
# print(test_df.head())
# print(val_df.head())

In [None]:
# Write the train and validation subsets to CSV (without the index column).
train_df.to_csv('./your_path/train_data.csv', index=False)
val_df.to_csv('./your_path/val_data.csv', index=False)


print("Data processing complete. The datasets have been saved.")

### 检验时间顺序

In [None]:
# Ad-hoc inspection of study ids and date/time values.
# len(val_df)
df = pd.read_csv('./datasets/MIMIC-complete/processed_data_MARIA1/test_data.csv', encoding='utf-8')
df["study_id"].nunique()
# NOTE(review): if the cells are executed top to bottom, final_df has already
# been reduced to the C_Date / C_Time / P_Date / P_Time columns at this point,
# so the two lookups below would raise KeyError — confirm which frame (and
# which columns) were intended here.
styD = list(final_df["StudyDate"].unique())
styT = list(final_df["StudyTime"].unique())
# for i in range(final_df["StudyDate"].nunique()):
#     print(f"studyDate:{styD[i]},studyTime:{styT[i]}")
metadata_df["StudyTime"].unique()

In [None]:
# Count the distinct StudyTime values per (subject_id, study_id, StudyDate)
# and list the keys that have more than one, i.e. studies whose images do not
# all share the same StudyTime.
study_time_counts = metadata_df.groupby(['subject_id','study_id','StudyDate'])['StudyTime'].nunique()
inconsistent_dates = study_time_counts[study_time_counts > 1].index
inconsistent_dates

In [None]:
import pandas as pd

# metadata_df is expected to provide the 'subject_id', 'StudyDate' and 'StudyTime' columns.
# Number of distinct StudyTime values per (subject_id, StudyDate).
grouped = metadata_df.groupby(['subject_id', 'StudyDate'])['StudyTime'].nunique().reset_index()

# Keep the (subject_id, StudyDate) pairs that have more than one StudyTime
inconsistent = grouped[grouped['StudyTime'] > 1]

# Inner-join back onto the metadata to see the individual records of those pairs
inconsistent_records = pd.merge(metadata_df, inconsistent[['subject_id', 'StudyDate']], on=['subject_id', 'StudyDate'], how='inner')

print(inconsistent_records)


In [None]:
# Distinct StudyTime values per (subject_id, StudyDate); keys with more than
# one value are patient-days on which several acquisition times occur.
study_time_counts = metadata_df.groupby(['subject_id','StudyDate'])['StudyTime'].nunique()
inconsistent_dates = study_time_counts[study_time_counts > 1].index
# inconsistent_dates is a MultiIndex of (subject_id, StudyDate) pairs, so the
# affected records have to be selected on BOTH keys; comparing the StudyDate
# column alone against these tuples never matches / fails on length.
inconsistent_samples = metadata_df.merge(
    inconsistent_dates.to_frame(index=False),
    on=['subject_id', 'StudyDate'],
    how='inner',
)
# One pass over the affected records instead of one table scan per key.
times_by_key = inconsistent_samples.groupby(['subject_id', 'StudyDate'])['StudyTime'].unique()
for (subject, date), times in times_by_key.items():
    print(f"subject_id: {subject}, StudyDate: {date}, Unique StudyTimes: {times}")


# # 假设您的 DataFrame 名为 metadata_df，包含 'subject_id'、'StudyDate' 和 'StudyTime' 列
# grouped = metadata_df.groupby(['subject_id', 'StudyDate'])['StudyTime'].nunique().reset_index()
# inconsistent = grouped[grouped['StudyTime'] > 1]
# print(inconsistent)
# metadata_df['time_count'] = metadata_df.groupby(['subject_id', 'StudyDate'])['StudyTime'].transform('nunique')

# # 筛选出 StudyTime 数量大于1的记录
# inconsistent_records = metadata_df[metadata_df['time_count'] > 1]

# # 删除辅助列
# inconsistent_records = inconsistent_records.drop(columns='time_count')
# inconsistent_records = inconsistent_records[['subject_id',  'study_id', 'StudyDate','StudyTime']]
# # print(f"{inconsistent_records["subject_id"]},{inconsistent_records["StudyDate"]},{inconsistent_records["StudyTime"]}")

# print(inconsistent_records)


### 抽取数据集

In [None]:
from typing import final
import pandas as pd
df = pd.read_csv('./your_path/train_data.csv', encoding='utf-8')

# Partition the samples by which optional inputs are present.
no_prior = df['Prior_frontal_dicom_id'].isnull()
no_lateral = df['Current_lateral_dicom_id'].isnull()
class1 = df[no_prior & no_lateral]      # (1) neither prior frontal nor current lateral
class2 = df[~no_prior & no_lateral]     # (2) prior frontal only
class3 = df[no_prior & ~no_lateral]     # (3) current lateral only
class4 = df[~no_prior & ~no_lateral]    # (4) both

# Share of each class in the full data set
total = len(df)
prop1, prop2, prop3, prop4 = (
    len(part) / total for part in (class1, class2, class3, class4)
)

print(f"数据总量:{total}")
print("类别 1 (Prior为空, Current为空): 数据量为{}占比为{:.2%}".format(len(class1),prop1))
print("类别 2 (Prior非空, Current为空): 数据量为{}占比为{:.2%}".format(len(class2),prop2))
print("类别 3 (Prior为空, Current非空): 数据量为{}占比为{:.2%}".format(len(class3),prop3))
print("类别 4 (Prior非空, Current非空): 数据量为{}占比为{:.2%}".format(len(class4),prop4))

# Target size of the sampled data set
new_total = 30000

# Per-class sample sizes; int() truncates, so the four sizes may add up to
# slightly less than new_total.
n_class1, n_class2, n_class3, n_class4 = (
    int(share * new_total) for share in (prop1, prop2, prop3, prop4)
)
print(n_class1,n_class2,n_class3,n_class4,sep='\n')

# Draw each class with the shared seed so the selection is reproducible.
new_class1, new_class2, new_class3, new_class4 = (
    part.sample(n=size, random_state=random_seed)
    for part, size in zip(
        (class1, class2, class3, class4),
        (n_class1, n_class2, n_class3, n_class4),
    )
)

# Combine, shuffle, and keep a single row per study_id (this can reduce the
# row count below the sum of the per-class sizes).
new_dataset = pd.concat([new_class1, new_class2, new_class3, new_class4]).sample(frac=1, random_state=random_seed)
new_dataset = new_dataset.drop_duplicates(subset='study_id')
# NOTE(review): df1 is not defined anywhere in the visible notebook — presumably
# a previously sampled data set that the new rows are appended to; confirm
# where it is supposed to come from.
final_dataset = pd.concat([df1, new_dataset], ignore_index=True)
print("新数据集样本数:", len(final_dataset))
final_dataset.to_csv('./your_path', index=False, encoding='utf-8')


### 去除prior_report中除Findings以外的其余部分

In [None]:
import pandas as pd
import re

def extract_findings_from_prior_report(prior_report_text):
    """Return only the text of the 'FINDINGS:' section of a prior report.

    Args:
        prior_report_text (str): the combined prior-report text. Any
            non-string value (e.g. NaN or None) is treated as "no report".

    Returns:
        str: the findings text with surrounding whitespace and every '___'
        placeholder removed, or an empty string when the input is not a
        string, contains no 'FINDINGS:' header, or the section is empty.
    """
    if isinstance(prior_report_text, str):
        # Pattern, piece by piece:
        #   FINDINGS:\s*        the literal header plus any following whitespace
        #   (.*?)               the section body, matched lazily
        #   (?=\b[A-Z_]+:|\Z)   stop right before the next header-like token
        #                       (a word boundary, then upper-case letters or
        #                       underscores followed by a colon) or at the end
        #                       of the string
        # re.DOTALL lets '.' cross line breaks, so multi-line sections are
        # captured in full.
        found = re.search(r'FINDINGS:\s*(.*?)(?=\b[A-Z_]+:|\Z)', prior_report_text, re.DOTALL)
        if found is not None:
            section = found.group(1).strip()
            # Drop the '___' placeholders and trim again.
            return section.replace('___', '').strip()
    # Not a string, or no 'FINDINGS:' header present.
    return ""

# --- 如何在你的数据加载和处理流程中应用 ---

# 1. 假设你已经加载了 CSV 文件到 DataFrame (通常在你的主脚本开始时)
# 示例：
# df_train = pd.read_csv('train_data_5000_avg_43.csv')

# 2. 在你使用 Hugging Face `datasets` 库处理数据之前，对DataFrame进行预处理
# 这是在 `Hugging Face Dataset` 对象被创建或在你传递给 `map` 函数之前完成
# 因为 map 函数通常期望接收原始列名
#
# 注意：你需要将这一步放在你的数据加载流程中，例如：
# df_data = pd.read_csv('your_train_data.csv')
# df_data['prior_report'] = df_data['prior_report'].apply(extract_findings_from_prior_report)
# 然后再从 df_data 创建 Hugging Face Dataset 对象

# --- 演示如何使用此函数 ---
# 模拟一些 prior_report 文本
sample_prior_reports = [
    "INDICATION: Shortness of breath.TECHNIQUE: X-ray.COMPARISON: None.FINDINGS: Clear lungs. No effusions. Heart size normal.",
    "INDICATION: Chest pain.FINDINGS: Small right pleural effusion.IMPRESSION: Effusion noted.",
    "No Findings section here. Just some random text.",
    "FINDINGS: Patient is stable. No acute findings.",
    "INDICATION: Fever.FINDINGS: ", # FINDINGS 后没有内容
    None, # None值
    "Just findings at the start: This is the findings content.",
    "FINDINGS: A nodule in the left upper lobe. TECHNIQUE: CT chest." # 后面跟着另一个标题
]

# 创建一个临时的DataFrame来演示
# df_temp = pd.DataFrame({'prior_report': sample_prior_reports})

# print("--- 原始 prior_report ---")
# for i, text in enumerate(df_temp['prior_report']):
#     print(f"Sample {i}: {text}")
# print("\n" + "="*50 + "\n")

# # 应用函数来修改 prior_report 列
# df_temp['prior_report_processed'] = df_temp['prior_report'].apply(extract_findings_from_prior_report)

# print("--- 处理后的 prior_report (仅 Findings 部分) ---")
# for i, text in enumerate(df_temp['prior_report_processed']):
#     print(f"Sample {i}: {text if text else '[EMPTY]'}") # 打印空字符串时显示[EMPTY]方便查看

# print("\n--- 原始与处理后的对照 ---")
# print(df_temp[['prior_report', 'prior_report_processed']].to_string())

# Add a cleaned_prior_report column (findings-only version of prior_report)
# and write the table back. NOTE: the output path is the same file that was
# read, so the CSV is overwritten in place.
csv_file_path = "./datasets/MIMIC-complete/processed_data_MARIA2_new/train_data_60000_avg_43.csv" # make sure the path is correct
df_data = pd.read_csv(csv_file_path)
print(df_data["prior_report"].head(10))
df_data['cleaned_prior_report'] = df_data['prior_report'].apply(extract_findings_from_prior_report)
print(df_data["cleaned_prior_report"].head(10))
df_data.to_csv("./datasets/MIMIC-complete/processed_data_MARIA2_new/train_data_60000_avg_43.csv", index=False, encoding='utf-8')

###   加入type

In [None]:
csv_file_path = "./your_path" # 确保路径正确
df_data = pd.read_csv(csv_file_path)
def determine_type(row):
    """Classify a sample by which optional images it has.

    ``row`` must support ``.get`` (e.g. a DataFrame row or a dict). Returns
    'all' when both a current lateral and a prior frontal image are present,
    'CFCL' for lateral only, 'CFPF' for prior frontal only, and 'CF' when
    neither is present.
    """
    lateral_present = bool(pd.notna(row.get("Current_lateral_dicom_id")))
    prior_present = bool(pd.notna(row.get('Prior_frontal_dicom_id')))
    # (has current lateral, has prior frontal) -> type tag
    tags = {
        (True, True): 'all',
        (True, False): 'CFCL',
        (False, True): 'CFPF',
        (False, False): 'CF',
    }
    return tags[(lateral_present, prior_present)]
df_data['type'] = df_data.apply(determine_type, axis=1)
df_data.to_csv("./your_Path", index=False, encoding='utf-8')

### 划分Rare和Common标签

In [None]:
import pandas as pd

def build_subgroups(csv_path, output_path, rare_q=0.25, common_q=0.75):
    """Tag every report as Rare / Common / Other based on its positive labels.

    The input CSV is read, de-duplicated on the 'Report Impression' column and
    missing values are filled with 0. Columns 1..14 (0-based positions 1-14)
    are taken as the label columns — presumably the 14 CheXbert labels; verify
    against the input file. A label is positive in a row when its value == 1.

    A label is "rare" when its positive rate is <= the ``rare_q`` quantile of
    all label rates, and "common" when it is >= the ``common_q`` quantile.
    Each row gets a ``subgroup`` value:
        "Rare"   - at least one rare label positive, no common label positive
        "Common" - at least one common label positive, no rare label positive
        "Other"  - both kinds positive, or neither

    Args:
        csv_path: path of the input CSV file.
        output_path: path the CSV with the added ``subgroup`` column is written to.
        rare_q: quantile threshold for rare labels (default 0.25).
        common_q: quantile threshold for common labels (default 0.75).
    """
    # 1. Load and prepare the data
    df = pd.read_csv(csv_path)
    df = df.drop_duplicates(subset='Report Impression')
    print(len(df))
    df = df.fillna(0)
    label_cols = df.columns[1:15]

    # 2. Positive rate of every label
    is_positive = df[label_cols] == 1
    prevalence = is_positive.sum() / len(df)
    print(prevalence)

    # 3. Rare / common labels from the quantile thresholds
    rare_threshold = prevalence.quantile(rare_q)
    common_threshold = prevalence.quantile(common_q)

    print(rare_threshold,common_threshold,sep='\n')
    rare_labels = prevalence[prevalence <= rare_threshold].index.tolist()
    common_labels = prevalence[prevalence >= common_threshold].index.tolist()

    print("Rare labels:", rare_labels)
    print("Common labels:", common_labels)

    # 4. Assign the subgroup with boolean masks over whole columns instead of
    #    a Python-level function call per row.
    has_rare = is_positive[rare_labels].any(axis=1)
    has_common = is_positive[common_labels].any(axis=1)
    subgroup = pd.Series("Other", index=df.index, dtype=object)  # mixed or no labels
    subgroup = subgroup.mask(has_rare & ~has_common, "Rare")
    subgroup = subgroup.mask(has_common & ~has_rare, "Common")
    df["subgroup"] = subgroup

    # 5. Save the result
    df.to_csv(output_path, index=False)
    print(f"处理完成！结果已保存到 {output_path}")


build_subgroups("./your_path", "output_with_subgroups.csv")


### 定义Rare-subgroup


In [None]:
import pandas as pd

rare_labels= ['Consolidation', 'Pneumonia', 'Pneumothorax', 'Pleural Other']
common_labels= ['Cardiomegaly', 'Lung Opacity', 'Atelectasis', 'Support Devices']

df_test = pd.read_csv('./your_path.csv')
df_labeled = pd.read_csv('./your_path.csv')
# print(df_test['study_id'].head())
# print(df_labeled['study_id'].head())
# print(df_labeled['study_id']==df_test['study_id'])
# 定义一个函数来判断每行属于哪个亚组
def assign_subgroup(row):
    """Map a labelled row to a subgroup tag from its rare/common positives.

    A label counts as positive when ``row[label] == 1``; the label names come
    from the module-level ``rare_labels`` and ``common_labels`` lists.

    Returns:
        'CF'   - rare positive only
        'CFCL' - common positive only
        'CFPF' - both rare and common positive
        'all'  - neither

    NOTE(review): these tags are the same strings that determine_type uses for
    image-availability types — confirm that reusing them for the rare/common
    subgroups is intended.
    """
    has_rare = any(row[name] == 1 for name in rare_labels)
    has_common = any(row[name] == 1 for name in common_labels)

    if has_rare:
        return 'CFPF' if has_common else 'CF'
    return 'CFCL' if has_common else 'all'

# Add the subgroup column to the labelled table
df_labeled['Subgroup'] = df_labeled.apply(assign_subgroup, axis=1)

# Number of samples per subgroup
subgroup_counts = df_labeled['Subgroup'].value_counts()
print(subgroup_counts)
# Copy the subgroup over to the test table. The assignment aligns on the row
# index, not on study_id — NOTE(review): this presumably relies on both CSVs
# listing the same samples in the same order; confirm (see the commented-out
# study_id comparison above).
df_test['Subgroup'] = df_labeled['Subgroup']
# Optionally save the result to a new CSV
df_test.to_csv("./your_path.csv", index=False)