### 得到每个图片所构成的列表

In [None]:
import os
import json

# Root directory of the MIMIC-CXR-JPG 2.0.0 `files` tree that is walked for
# images (absolute path on the author's storage mount).
raw_path = "/mnt/pfs-guan-ssai/cv/panxuhao/misc/playground/physionet.org/files/mimic-cxr-jpg/2.0.0/files"

def get_files(path, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')):
    """Recursively collect the image files found under ``path``.

    Args:
        path: Directory to walk. A directory that does not exist yields an
            empty list, because ``os.walk`` ignores listing errors by default.
        extensions: File-name suffixes to accept. They are compared
            case-insensitively, so ``scan.JPG`` matches ``'.jpg'``.

    Returns:
        A list of paths (each walked directory joined with the file name).
        Sub-directories are visited in sorted order and the files of every
        directory are emitted in sorted order, so repeated runs over the same
        tree produce the same list.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    files = []
    for root, dirs, fs in os.walk(path):
        # os.walk is top-down by default, so sorting `dirs` in place fixes the
        # order in which the sub-directories are visited next.
        dirs.sort()
        files.extend(
            os.path.join(root, f)
            for f in sorted(fs)
            if f.lower().endswith(suffixes)
        )
    return files

# Enumerate every image under the dataset root and report how many were found.
files = get_files(raw_path)
print(len(files))

# Save the path list as JSON. The encoding is pinned to UTF-8 because
# ensure_ascii=False writes non-ASCII characters verbatim; without it the
# platform's locale encoding would decide whether the write succeeds.
with open("/mnt/pfs-mc0p4k/cv/team/panxuhao/playground/miccai24_cxr_lt/code/dataprocesser/anno.json", "w", encoding="utf-8") as f:
    json.dump(files, f, ensure_ascii=False, indent=4)


### 读取数据

In [1]:
import os
import json
import pandas as pd

# Load the labelled training table and the list of class names.
raw_path = "/mnt/pfs-guan-ssai/cv/panxuhao/misc/playground/miccai24-cxr-lt/task1_development_starting_kit/train_labeled.csv"
class_path = "/mnt/pfs-guan-ssai/cv/panxuhao/misc/playground/miccai24-cxr-lt/task1_development_starting_kit/CLASSES.txt"

data = pd.read_csv(raw_path)

# One class name per line, in file order. Surrounding whitespace is stripped
# and blank lines are skipped so that no empty class name enters the list
# (an empty name would fail later when it is used as a column key).
with open(class_path, "r", encoding="utf-8") as f:
    classes = [name for name in (line.strip() for line in f) if name]


#### 得到所有图片的类别以及保存 meta 信息

In [19]:
# Directory that the relative `fpath` values of the CSV are joined onto.
image_root = "/mnt/pfs-guan-ssai/cv/panxuhao/misc/playground/physionet.org/files/mimic-cxr-jpg/2.0.0"

# Select the label columns by name rather than by position, so that every
# index stored in `gt_label` is a position in `classes` regardless of how the
# CSV orders its columns. A class missing from the table raises KeyError
# instead of silently shifting the labels.
label_matrix = data[classes].to_numpy() == 1

# One entry per row: the absolute image path plus the indices (plain Python
# ints, hence JSON-serialisable) of the classes marked 1 for that row.
data_list = [
    {"img_path": os.path.join(image_root, fpath), 'gt_label': row.nonzero()[0].tolist()}
    for fpath, row in zip(data['fpath'], label_matrix)
]

# Final structure: class names under "metainfo", samples under "data_list".
meta_info = {}
meta_info['metainfo'] = {"classes": classes}
meta_info['data_list'] = data_list

In [20]:
# Write the meta information next to the notebook. UTF-8 is requested
# explicitly because ensure_ascii=False emits non-ASCII characters verbatim,
# which the platform's default encoding may not be able to represent.
with open("data_unbalanced_metainfo.json", "w", encoding="utf-8") as f:
    json.dump(meta_info, f, ensure_ascii=False, indent=4)

#### 统计每个类别不均衡占比

In [14]:
# For every class, count the rows labelled 0 and 1 and compute the share of
# 1s among those rows.
rows = []
for c in classes:
    cl = data[c].value_counts()
    count_0 = cl.get(0, 0)
    count_1 = cl.get(1, 0)
    total = count_0 + count_1
    # Guard against a column that contains neither 0 nor 1.
    ratio_1 = count_1 / total if total > 0 else 0
    rows.append({'Class': c, '0_count': count_0, '1_count': count_1, '1_ratio': ratio_1})

# Build the frame once; naming the columns here keeps the layout stable even
# when `classes` is empty and `rows` therefore has no entries.
summary_df = pd.DataFrame(rows, columns=['Class', '0_count', '1_count', '1_ratio'])

# Show the summary table.
print(summary_df)

                         Class  0_count  1_count   1_ratio
0                   Adenopathy   255462     3409  0.013169
1                  Atelectasis   193495    65376  0.252543
2                  Azygos Lobe   258672      199  0.000769
3   Calcification of the Aorta   254632     4239  0.016375
4                 Cardiomegaly   184133    74738  0.288708
5            Clavicle Fracture   258703      168  0.000649
6                Consolidation   243500    15371  0.059377
7                        Edema   221615    37256  0.143917
8                    Emphysema   255210     3661  0.014142
9   Enlarged Cardiomediastinum   229243    29628  0.114451
10                    Fibrosis   257702     1169  0.004516
11                     Fissure   256068     2803  0.010828
12                    Fracture   247303    11568  0.044686
13                   Granuloma   255906     2965  0.011454
14                      Hernia   254885     3986  0.015398
15           Hydropneumothorax   258225      646  0.0024

### 平衡每个类别的数据占比

In [15]:
import pandas as pd
from sklearn.utils import resample

# Reload the labelled table and the class names from disk.
data = pd.read_csv(raw_path)
# Blank lines are skipped so that no empty class name is used as a column key.
with open(class_path, "r", encoding="utf-8") as f:
    classes = [name for name in (line.strip() for line in f) if name]

# Bounds on the number of positive rows kept per class.
upper_limit = 5000
lower_limit = 2000

# Draw the rows of each class separately and concatenate once at the end;
# concatenating inside the loop would copy the growing frame on every pass.
# NOTE(review): a row that is positive for several classes can be drawn once
# per class, so rows repeat across the parts and the per-class totals of the
# final frame can exceed `upper_limit`.
sampled_parts = []
for c in classes:
    class_data = data[data[c] == 1]
    count = class_data.shape[0]

    if count == 0:
        # There is nothing to sample from, and resampling an empty frame
        # fails, so a class without positive rows contributes no rows.
        continue

    if count > upper_limit:
        # Down-sample without replacement to exactly `upper_limit` rows.
        class_data = resample(class_data, replace=False, n_samples=upper_limit, random_state=42)
    elif count < lower_limit:
        # Up-sample with replacement to exactly `lower_limit` rows.
        class_data = resample(class_data, replace=True, n_samples=lower_limit, random_state=42)

    sampled_parts.append(class_data)

# With no parts at all, fall back to an empty frame that keeps the columns.
balanced_data = pd.concat(sampled_parts) if sampled_parts else data.iloc[0:0]

# Per-class number of positive rows after balancing.
balanced_summary_df = balanced_data[classes].sum().reset_index()
balanced_summary_df.columns = ['Class', 'Count']

# Share of positive rows per class relative to the size of the balanced frame.
balanced_summary_df['1_ratio'] = balanced_summary_df['Count'] / len(balanced_data)

# Show the balanced distribution.
print(balanced_summary_df)


                         Class  Count   1_ratio
0                   Adenopathy   5935  0.042353
1                  Atelectasis  44384  0.316728
2                  Azygos Lobe   2139  0.015264
3   Calcification of the Aorta   6696  0.047783
4                 Cardiomegaly  45111  0.321916
5            Clavicle Fracture   2190  0.015628
6                Consolidation  14820  0.105757
7                        Edema  24477  0.174670
8                    Emphysema   6896  0.049210
9   Enlarged Cardiomediastinum  23515  0.167805
10                    Fibrosis   2849  0.020331
11                     Fissure   5258  0.037521
12                    Fracture  16751  0.119536
13                   Granuloma   5048  0.036023
14                      Hernia   6566  0.046855
15           Hydropneumothorax   2711  0.019346
16                  Infarction   2617  0.018675
17                Infiltration   9889  0.070569
18                    Kyphosis   2559  0.018261
19           Lobar Atelectasis   2096  0

In [16]:
# Persist the balanced table as CSV, leaving the DataFrame index out of the file.
balanced_data.to_csv(
    path_or_buf="/mnt/pfs-mc0p4k/cv/team/panxuhao/playground/miccai24_cxr_lt/code/dataprocesser/balanced_5000_2000.csv",
    index=False,
)

In [17]:
# Directory that the relative `fpath` values of the table are joined onto.
image_root = "/mnt/pfs-guan-ssai/cv/panxuhao/misc/playground/physionet.org/files/mimic-cxr-jpg/2.0.0"

# Select the label columns by name rather than by position, so that every
# index stored in `gt_label` is a position in `classes` regardless of how the
# table orders its columns. A class missing from the table raises KeyError
# instead of silently shifting the labels.
label_matrix = balanced_data[classes].to_numpy() == 1

# One entry per row of the balanced table: the absolute image path plus the
# indices (plain Python ints, hence JSON-serialisable) of the classes marked 1.
data_list = [
    {"img_path": os.path.join(image_root, fpath), 'gt_label': row.nonzero()[0].tolist()}
    for fpath, row in zip(balanced_data['fpath'], label_matrix)
]

# Final structure: class names under "metainfo", samples under "data_list".
meta_info = {}
meta_info['metainfo'] = {"classes": classes}
meta_info['data_list'] = data_list

In [18]:
# Write the balanced meta information next to the notebook. UTF-8 is requested
# explicitly because ensure_ascii=False emits non-ASCII characters verbatim,
# which the platform's default encoding may not be able to represent.
with open("balanced_5000_2000_metainfo.json", "w", encoding="utf-8") as f:
    json.dump(meta_info, f, ensure_ascii=False, indent=4)