In [None]:
import pandas as pd
import uuid
import os

# Root directory of the dataset; replace the placeholder before running.
base_path = "PATH_TO_DATASET"

# Question file, PDF directory and output file all live under base_path.
data_file, doc_path, save_path = (
    os.path.join(base_path, relative)
    for relative in (
        "m3docvqa/multimodalqa/MMQA_dev.jsonl",
        "data/documents",
        "data/m3docrag.json",
    )
)

# The question file is JSON Lines: one record per line.
data_df = pd.read_json(path_or_buf=data_file, lines=True)

# Show the shape and the column index, then preview the first two records.
print(data_df.shape, data_df.columns, sep="\n")
data_df.head(n=2)

(2441, 5)
Index(['qid', 'question', 'answers', 'metadata', 'supporting_context'], dtype='object')


Unnamed: 0,qid,question,answers,metadata,supporting_context
0,a33985b1e8b2502fc18cc8147dc27db8,For which film did Ben Piazza play the role of...,"[{'answer': 'Mask', 'type': 'string', 'modalit...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': '8513db80c11ea439ab11eba406ec00d9'...
1,710a6d2254076ea58756c6c7cc211f1e,When was the movie that had Ben Piazza in the ...,"[{'answer': '1976', 'type': 'string', 'modalit...","{'type': 'Compose(TextQ,TableQ)', 'modalities'...",[{'doc_id': '8513db80c11ea439ab11eba406ec00d9'...


## Select dataset

In [2]:
# Keep only questions that have exactly one supporting document and exactly
# one reference answer. Both masks are built on the raw frame; combining them
# selects the same rows as filtering in two successive steps.
has_single_context = data_df["supporting_context"].apply(len) == 1
has_single_answer = data_df["answers"].apply(len) == 1

# Copy AFTER filtering (not before): the result is then an independent frame,
# so adding columns to it later does not operate on a slice of another frame
# (the situation pandas reports as SettingWithCopyWarning).
sel_data_df = data_df[has_single_context & has_single_answer].copy()


def extract_answer(row):
    """Return the answer value of the first entry in ``row["answers"]``.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``"answers"`` entry is a sequence of dicts with an ``"answer"`` key.

    Returns:
        The ``"answer"`` value of the first entry. When the list is empty, the
        key is missing, or the value is ``None`` or the empty string, the
        placeholder ``"No answer found"`` is returned and the offending row is
        printed for inspection.
    """
    answers = row["answers"]
    # Guard the empty list so that one malformed record cannot abort a whole
    # DataFrame.apply with an IndexError.
    ans = answers[0].get("answer", "") if len(answers) > 0 else ""
    # A null answer is just as much "no answer" as a missing key or an empty
    # string; falsy-but-valid values such as 0 are deliberately kept.
    if ans is None or ans == "":
        ans = "No answer found"
        print(row)
    return ans


# Run extract_answer on every row and store the result in its own column.
sel_data_df["answer"] = sel_data_df.apply(
    func=extract_answer,
    axis="columns",
)


def extract_doc_id(row):
    """Return the ``"doc_id"`` of the first entry in ``row["supporting_context"]``.

    When the first entry has no ``"doc_id"`` key (or it is the empty string),
    the row is printed for inspection and the empty string is returned.
    """
    first_context = row["supporting_context"][0]
    doc_id = first_context.get("doc_id", "")
    if doc_id != "":
        return doc_id
    # Missing id: surface the offending record, then fall through with "".
    print(row)
    return doc_id


# Run extract_doc_id on every row and store the result in its own column.
sel_data_df["doc_id"] = sel_data_df.apply(func=extract_doc_id, axis="columns")

# Derive a name-based (version 5) UUID from each document id; the same id
# always yields the same UUID.
sel_data_df["doc_uuid"] = sel_data_df["doc_id"].map(
    lambda doc_id: str(uuid.uuid5(uuid.NAMESPACE_DNS, doc_id))
)

# Location of each document's PDF: <doc_path>/<doc_id>.pdf
sel_data_df["doc_path"] = sel_data_df.apply(
    lambda record: os.path.join(doc_path, record["doc_id"] + ".pdf"),
    axis="columns",
)

# Move the most frequently inspected columns to the front and keep the
# remaining ones in their existing order.
first_cols = ["qid", "question", "answer", "doc_id", "doc_uuid", "doc_path"]
cols = list(first_cols)
cols.extend(name for name in sel_data_df.columns if name not in first_cols)
sel_data_df = sel_data_df[cols]

print(sel_data_df.columns, sel_data_df.shape, sep="\n")

sel_data_df.head(n=2)

Index(['qid', 'question', 'answer', 'doc_id', 'doc_uuid', 'doc_path',
       'answers', 'metadata', 'supporting_context'],
      dtype='object')
(836, 9)


Unnamed: 0,qid,question,answer,doc_id,doc_uuid,doc_path,answers,metadata,supporting_context
0,a33985b1e8b2502fc18cc8147dc27db8,For which film did Ben Piazza play the role of...,Mask,8513db80c11ea439ab11eba406ec00d9,63a6b3f4-ebee-5024-b87b-84a9bcc26a63,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'Mask', 'type': 'string', 'modalit...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': '8513db80c11ea439ab11eba406ec00d9'...
4,18ecd2ac6c0ac69993b92dc4b30137e8,"Which Title(s), in Filmography of Ben Piazza, ...","Tell Me That You Love Me, Junie Moon",ddf8b52a8400deaf05940c5cad8169cd,f13a6b03-b065-51b5-95d8-8f45efacb1ed,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'Tell Me That You Love Me, Junie M...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': 'ddf8b52a8400deaf05940c5cad8169cd'...


In [15]:
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError
from tqdm import tqdm
# 1. Collect every distinct PDF path so that each file is opened only once.
unique_pdf_paths = sel_data_df['doc_path'].unique()

# 2. Read the page count of each distinct PDF.
error_files = []  # paths whose page count could not be determined
error_reasons = {}  # path -> "ExceptionType: message" for each failed path
page_map = {}  # path -> number of pages

print(len(unique_pdf_paths))

# tqdm renders a progress bar while the PDFs are scanned.
for path in tqdm(unique_pdf_paths):
    try:
        # PDFs have to be opened in binary mode.
        with open(path, 'rb') as f:
            reader = PdfReader(f)
            page_map[path] = len(reader.pages)
    except Exception as exc:
        # Best effort: a missing or unreadable PDF must not abort the scan.
        # Missing files, PdfReadError and any other failure were all handled
        # identically, so a single handler that records the reason suffices.
        error_files.append(path)
        error_reasons[path] = f"{type(exc).__name__}: {exc}"

# Report failures instead of dropping them silently: the affected documents
# end up with a missing page_count below.
if error_files:
    print(f"Could not read {len(error_files)} of {len(unique_pdf_paths)} PDF files:")
    for failed_path in error_files[:5]:
        print(f"  {failed_path} -> {error_reasons[failed_path]}")

# assign() returns a new frame instead of adding the column in place, so the
# frame produced by earlier slicing is not mutated. Paths absent from
# page_map receive a missing value.
sel_data_df = sel_data_df.assign(page_count=sel_data_df["doc_path"].map(page_map))
sel_data_df.head(3)


826


100%|██████████| 826/826 [00:07<00:00, 110.26it/s]


Unnamed: 0,qid,question,answer,doc_id,doc_uuid,doc_path,answers,metadata,supporting_context,page_count
0,a33985b1e8b2502fc18cc8147dc27db8,For which film did Ben Piazza play the role of...,Mask,8513db80c11ea439ab11eba406ec00d9,63a6b3f4-ebee-5024-b87b-84a9bcc26a63,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'Mask', 'type': 'string', 'modalit...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': '8513db80c11ea439ab11eba406ec00d9'...,4
4,18ecd2ac6c0ac69993b92dc4b30137e8,"Which Title(s), in Filmography of Ben Piazza, ...","Tell Me That You Love Me, Junie Moon",ddf8b52a8400deaf05940c5cad8169cd,f13a6b03-b065-51b5-95d8-8f45efacb1ed,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'Tell Me That You Love Me, Junie M...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': 'ddf8b52a8400deaf05940c5cad8169cd'...,4
6,e1e6ed53f9ad11813845088f4cf2f6b1,"Which Club(s), in Career statistics | Club of ...",FK Tuzla City,52830c5f8b6e8b19add8756c8f56576d,913e5538-001d-5201-91ea-8614a1129602,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'FK Tuzla City', 'type': 'string',...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': '52830c5f8b6e8b19add8756c8f56576d'...,5


In [16]:
import random
# Documents must have strictly fewer pages than this limit to be kept.
PAGE_COUNT_LIMIT = 20

filter_data_df = sel_data_df.copy(deep=True)
# Rows with a missing page_count compare False here and are dropped as well.
filter_data_df = filter_data_df[filter_data_df["page_count"] < PAGE_COUNT_LIMIT]
print(filter_data_df.shape)
print(filter_data_df["doc_uuid"].nunique())

# Sampling is done per document (not per question), so that every question
# belonging to a sampled document is kept.
unique_uuids = filter_data_df['doc_uuid'].unique()

num_to_sample = 500
RANDOM_SEED = 42

# Fixed seed so that the same documents are drawn on every run.
random.seed(RANDOM_SEED)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of available documents.
# Whenever enough documents exist the call is unchanged.
sample_size = min(num_to_sample, len(unique_uuids))
sampled_uuids = random.sample(list(unique_uuids), sample_size)
print(f"成功从 {len(unique_uuids)} 个独立uuid中随机抽取了 {len(sampled_uuids)} 个。")


# Keep every question whose document was sampled; copy to detach from
# filter_data_df.
final_data_df = filter_data_df[filter_data_df['doc_uuid'].isin(sampled_uuids)].copy()

print("-" * 30)
print("最终抽样结果:")
print(f"最终数据集行数: {final_data_df.shape[0]}")
print(f"最终数据集中独立doc_uuid数量: {final_data_df['doc_uuid'].nunique()}")

final_data_df.head(2)

(661, 10)
653
成功从 653 个独立uuid中随机抽取了 500 个。
------------------------------
最终抽样结果:
最终数据集行数: 508
最终数据集中独立doc_uuid数量: 500


Unnamed: 0,qid,question,answer,doc_id,doc_uuid,doc_path,answers,metadata,supporting_context,page_count
0,a33985b1e8b2502fc18cc8147dc27db8,For which film did Ben Piazza play the role of...,Mask,8513db80c11ea439ab11eba406ec00d9,63a6b3f4-ebee-5024-b87b-84a9bcc26a63,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'Mask', 'type': 'string', 'modalit...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': '8513db80c11ea439ab11eba406ec00d9'...,4
4,18ecd2ac6c0ac69993b92dc4b30137e8,"Which Title(s), in Filmography of Ben Piazza, ...","Tell Me That You Love Me, Junie Moon",ddf8b52a8400deaf05940c5cad8169cd,f13a6b03-b065-51b5-95d8-8f45efacb1ed,/mnt/data/wangshu/mmrag/m3docrag/data/document...,"[{'answer': 'Tell Me That You Love Me, Junie M...","{'wiki_entities_in_question': [], 'wiki_entiti...",[{'doc_id': 'ddf8b52a8400deaf05940c5cad8169cd'...,4


In [17]:
# Sanity check: report every row whose PDF file is missing on disk.
for tmp_doc_path in final_data_df["doc_path"]:
    if os.path.exists(tmp_doc_path):
        continue
    print("Not exist")
    print(tmp_doc_path)

In [18]:
# Write the sampled questions as a pretty-printed JSON array with one object
# per row.
final_data_df.to_json(
    path_or_buf=save_path,
    orient="records",
    indent=2,
)
print(f"Saved to {save_path}")

Saved to /mnt/data/wangshu/mmrag/m3docrag/data/m3docrag.json


## Dataset info

In [6]:
# Size of the selected subset: number of questions and of distinct documents.
doc_num = len(sel_data_df["doc_id"].dropna().unique())
print(f"Total {len(sel_data_df)} samples, {doc_num} unique documents.")

Total 836 samples, 826 unique documents.


In [6]:
# How many questions in the raw data have 1, 2, 3, ... entries in "answers".
data_df["answers"].map(len).value_counts()

answers
1     2247
2      106
3       41
4       15
6        9
7        7
9        5
5        4
8        2
11       2
17       1
12       1
10       1
Name: count, dtype: int64

In [4]:
# Store the number of entries in "supporting_context" as a new column.
data_df["supporting_context_len"] = data_df["supporting_context"].map(len)

# Shape of the subset of rows with exactly one supporting_context entry.
print(data_df.loc[data_df["supporting_context_len"].eq(1)].shape)


(858, 6)
