In [None]:
import json
import pandas as pd

# Root folder that contains the MMLongBench-Doc checkout. The value below is
# a placeholder and must be replaced with a real location before running.
base_path = "PATH_TO_DATASET"

# All other locations are derived from ``base_path``.
data_path = "/".join((base_path, "MMLongBench-Doc", "data", "samples.json"))  # QA samples
pdf_dir = "/".join((base_path, "MMLongBench-Doc", "data", "documents"))       # source PDFs
save_dir = "/".join((base_path, "MMLongBench-Doc", "dataset"))                # processed output

In [2]:
# Load the raw samples. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale encoding.
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Tabular view of the loaded records for the filtering steps below.
data_df = pd.DataFrame(data)
print(data_df.shape)
print(data_df.head(2))
data_df.head(3)

(1082, 7)
                            doc_id                        doc_type  \
0  PH_2016.06.08_Economy-Final.pdf  Research report / Introduction   
1  PH_2016.06.08_Economy-Final.pdf  Research report / Introduction   

                                            question  \
0  According to the report, how do 5% of the Lati...   
1  According to the report, which one is greater ...   

                             answer evidence_pages evidence_sources  \
0                     Less well-off            [5]        ['Chart']   
1  Latinos interviewed by cellphone       [19, 20]        ['Table']   

  answer_format  
0           Str  
1           Str  


Unnamed: 0,doc_id,doc_type,question,answer,evidence_pages,evidence_sources,answer_format
0,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, how do 5% of the Lati...",Less well-off,[5],['Chart'],Str
1,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, which one is greater ...",Latinos interviewed by cellphone,"[19, 20]",['Table'],Str
2,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"From this report, which subgroup among Hispani...",Some college or more,[14],['Chart'],Str


In [8]:
import os
import fitz  # PyMuPDF库
from tqdm import tqdm

def _looks_like_slides(doc, aspect_ratio_threshold: float, avg_words_threshold: float) -> bool:
    """Apply the slide heuristics to an already opened, non-empty PDF document."""
    # 1. A landscape first page is treated as conclusive evidence of slides.
    first_page = doc.load_page(0)
    width, height = first_page.rect.width, first_page.rect.height
    if height > 0 and (width / height) > aspect_ratio_threshold:
        return True

    # 2. Fallback for portrait documents: very little text per page may mean a
    #    portrait slide deck. Documents of 2 pages or fewer never qualify, so
    #    that a lone cover page is not misclassified; skip counting for them.
    num_pages = len(doc)
    if num_pages <= 2:
        return False
    total_words = sum(len(page.get_text("words")) for page in doc)
    return (total_words / num_pages) < avg_words_threshold


def get_slide_pdf_filenames(
    directory_path: str,
    aspect_ratio_threshold: float = 1.1,
    avg_words_threshold: float = 200,
) -> list:
    """
    Scan a directory and return the file names of PDFs that look like slides.

    A PDF is classified as slides when either heuristic holds:
    1. Its first page is landscape (width / height > ``aspect_ratio_threshold``).
    2. Otherwise, it has more than 2 pages and its average number of words
       per page is below ``avg_words_threshold``.

    Files that cannot be processed are reported on stdout and skipped.

    Args:
        directory_path (str): Directory containing the PDF files.
        aspect_ratio_threshold (float): Minimum first-page width/height ratio
            (exclusive) for a PDF to count as slides. Defaults to 1.1.
        avg_words_threshold (float): Average words per page below which a
            PDF with more than 2 pages counts as slides. Defaults to 200.

    Returns:
        list: File names (not full paths) of the PDFs identified as slides,
        in sorted order. Empty when the directory does not exist.
    """
    slide_filenames = []

    if not os.path.isdir(directory_path):
        print(f"错误: 目录 '{directory_path}' 不存在。")
        return slide_filenames

    print(f"正在扫描目录: {directory_path}...")

    # os.listdir() order depends on the filesystem; sorting keeps the scan
    # order and the returned list reproducible between runs and machines.
    for filename in tqdm(sorted(os.listdir(directory_path))):
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            with fitz.open(file_path) as doc:
                if not doc:  # skip documents that evaluate as empty
                    continue
                is_slide = _looks_like_slides(
                    doc, aspect_ratio_threshold, avg_words_threshold
                )

            if is_slide:
                slide_filenames.append(filename)

        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"处理文件 '{filename}' 时出错: {e}")

    print(f"扫描完成！共发现 {len(slide_filenames)} 个幻灯片(slides)PDF文件。")
    return slide_filenames

# File names of every PDF under ``pdf_dir`` that the heuristic flags as slides.
slides_pdf_list = get_slide_pdf_filenames(directory_path=pdf_dir)

正在扫描目录: /mnt/data/wangshu/mmrag/MMLongBench-Doc/data/documents...


 93%|█████████▎| 126/135 [00:22<00:02,  3.79it/s]

MuPDF error: syntax error: could not parse color space (301 0 R)

MuPDF error: syntax error: could not parse color space (557 0 R)

MuPDF error: syntax error: could not parse color space (617 0 R)

MuPDF error: syntax error: could not parse color space (659 0 R)

MuPDF error: syntax error: could not parse color space (692 0 R)

MuPDF error: syntax error: could not parse color space (718 0 R)

MuPDF error: syntax error: could not parse color space (749 0 R)

MuPDF error: syntax error: could not parse color space (795 0 R)

MuPDF error: syntax error: could not parse color space (840 0 R)

MuPDF error: syntax error: could not parse color space (868 0 R)

MuPDF error: syntax error: could not parse color space (889 0 R)

MuPDF error: syntax error: could not parse color space (1355 0 R)

MuPDF error: syntax error: could not parse color space (1394 0 R)

MuPDF error: syntax error: could not parse color space (1458 0 R)

MuPDF error: syntax error: could not parse color space (1508 0 R)

MuPDF 

100%|██████████| 135/135 [00:24<00:00,  5.62it/s]

扫描完成！共发现 49 个幻灯片(slides)PDF文件。





In [9]:
# Slide-style PDFs (e.g. tutorials) are excluded: their document structure
# (i.e., organizational structure) cannot be recognized.
is_slide_doc = data_df["doc_id"].isin(slides_pdf_list)
document_only_df = data_df.loc[~is_slide_doc]

# Remaining number of samples and of distinct documents.
print(document_only_df.shape)
print(document_only_df.doc_id.nunique())

document_only_df.head(n=2)

(687, 9)
86


Unnamed: 0,doc_id,doc_type,question,answer,evidence_pages,evidence_sources,answer_format,doc_uuid,doc_path
0,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, how do 5% of the Lati...",Less well-off,[5],['Chart'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...
1,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, which one is greater ...",Latinos interviewed by cellphone,"[19, 20]",['Table'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...


## Select dataset

In [10]:
import uuid
import os

# Work on a copy so the new columns do not touch ``document_only_df``.
sel_df = document_only_df.copy()

# Assign a UUID to each unique PDF file in the new column doc_uuid. uuid5 is
# a name-based hash, so the same doc_id always yields the same doc_uuid.
sel_df["doc_uuid"] = sel_df["doc_id"].apply(
    lambda x: str(uuid.uuid5(uuid.NAMESPACE_DNS, x))
)

# Assign the full PDF path in the new column doc_path; backslashes inside
# doc_id are normalized to forward slashes before joining.
sel_df["doc_path"] = sel_df["doc_id"].apply(
    lambda x: os.path.join(pdf_dir, x.replace('\\', '/'))
)
print(sel_df.shape)

# Save the processed dataframe to a JSON file. Create the output folder
# first so the write does not fail on a missing directory.
os.makedirs(save_dir, exist_ok=True)
output_path = os.path.join(save_dir, "MMLongBench.json")
sel_df.to_json(output_path, orient="records", indent=2)

(687, 9)


In [None]:
import pandas as pd
# Read the saved dataset back from disk as a sanity check on the export.
output_path = os.path.join(save_dir, "MMLongBench.json")
data_df = pd.read_json(path_or_buf=output_path, orient="records")

n_rows, n_cols = data_df.shape
print((n_rows, n_cols))
data_df.head(n=3)

(687, 9)


Unnamed: 0,doc_id,doc_type,question,answer,evidence_pages,evidence_sources,answer_format,doc_uuid,doc_path
0,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, how do 5% of the Lati...",Less well-off,[5],['Chart'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...
1,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, which one is greater ...",Latinos interviewed by cellphone,"[19, 20]",['Table'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...
2,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"From this report, which subgroup among Hispani...",Some college or more,[14],['Chart'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...


In [1]:
import pandas as pd

# Locate the dataset through the configurable ``save_dir`` instead of a
# machine-specific absolute path, so the notebook reads the same file the
# "Select dataset" step writes and stays runnable on other machines.
data_set_path = os.path.join(save_dir, "MMLongBench.json")

data_df = pd.read_json(data_set_path, orient="records")
print(data_df.shape)

(682, 9)


In [2]:

# Keep only the samples whose answer is not the "Not answerable" marker.
# A boolean mask replaces the former row-by-row iterrows() loop: it keeps the
# column dtypes, keeps the original index, and still returns a frame with all
# columns when no row matches, which the later de-duplication step needs.
answerable_df = data_df[data_df["answer"] != "Not answerable"]
print(answerable_df.shape)


(546, 9)


In [3]:
# De-duplicate on the question text, keeping the first occurrence of each.
rows_before = len(answerable_df)
print(f"before: {answerable_df.shape}")

# duplicated(keep='first') marks every repeat after the first occurrence;
# selecting the unmarked rows drops those repeats.
is_repeat = answerable_df.duplicated(subset=['question'], keep='first')
deduped_df = answerable_df[~is_repeat]

print(f"after: {deduped_df.shape}")
print(f"deleted: {rows_before - len(deduped_df)}")

deduped_df.head(n=2)

before: (546, 9)
after: (546, 9)
deleted: 0


Unnamed: 0,doc_id,doc_type,question,answer,evidence_pages,evidence_sources,answer_format,doc_uuid,doc_path
0,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, how do 5% of the Lati...",Less well-off,[5],['Chart'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...
1,PH_2016.06.08_Economy-Final.pdf,Research report / Introduction,"According to the report, which one is greater ...",Latinos interviewed by cellphone,"[19, 20]",['Table'],Str,a69d66dc-1f32-5a4c-85d7-25a4aa756138,/mnt/data/wangshu/mmrag/MMLongBench-Doc/data/d...


In [4]:
# Write the filtered, de-duplicated samples back over the dataset file.
# The location comes from the configurable ``save_dir`` rather than a
# machine-specific absolute path, so the notebook is portable.
save_path = os.path.join(save_dir, "MMLongBench.json")
deduped_df.to_json(save_path, orient="records", indent=2)

## Dataset info 

In [3]:
# Summary statistics by document type.
doc_type_col = data_df["doc_type"]

# Distinct document types, in order of first appearance.
doc_types = doc_type_col.unique()
print("\nUnique Document Types:", doc_types, sep="\n")

# Number of samples (rows) per document type.
doc_type_cnt = doc_type_col.value_counts()
print("Document Type Counts:", doc_type_cnt, sep="\n")

# Number of distinct documents (doc_id) per document type.
doc_id_cnt = data_df["doc_id"].groupby(doc_type_col).nunique()
print("\nUnique Document ID Counts per Document Type:", doc_id_cnt, sep="\n")


Unique Document Types:
['Research report / Introduction' 'Tutorial/Workshop' 'Academic paper'
 'Guidebook' 'Brochure' 'Administration/Industry file' 'Financial report']
Document Type Counts:
doc_type
Research report / Introduction    292
Academic paper                    199
Guidebook                         155
Tutorial/Workshop                 138
Financial report                  117
Brochure                          100
Administration/Industry file       81
Name: count, dtype: int64

Unique Document ID Counts per Document Type:
doc_type
Academic paper                    26
Administration/Industry file      10
Brochure                          15
Financial report                  11
Guidebook                         22
Research report / Introduction    34
Tutorial/Workshop                 17
Name: doc_id, dtype: int64
