In [None]:
# Install openreview-py and ensure urllib3 stays compatible
!pip install openreview-py "urllib3<2.0"

In [None]:
import openreview
import pandas as pd
import tqdm
from collections import defaultdict

def _is_review_note(note):
    """Return True when one of the note's invitations marks it as a review."""
    # Accepts any invitation id containing "Official_Review" (e.g. ICLR) or
    # ending in "/Review" (some venues, e.g. COLM).
    # NOTE(review): the substring match also accepts invitation ids that merely
    # contain "Official_Review" somewhere in their path — confirm against the
    # invitation ids the target venue actually uses.
    return any(
        'Official_Review' in inv or inv.endswith('/Review')
        for inv in (note.invitations or [])
    )


def _posting_time(note):
    """Return the timestamp used to order replies chronologically.

    Prefers the creation timestamps (``tcdate``, then ``cdate``) so that a
    reply edited later keeps its place in the conversation; ``tmdate`` (the
    modification time) is only a last resort.
    """
    for field in ('tcdate', 'cdate', 'tmdate'):
        stamp = getattr(note, field, None)
        if stamp is not None:
            return stamp
    return 0


def _speaker_label(signatures):
    """Map a note's signatures to a short speaker label for the transcript."""
    signatures = signatures or []
    if any('Authors' in sig for sig in signatures):
        return "Authors"
    # Use the signature that actually matched, e.g. ".../Reviewer_2" -> "Reviewer_2".
    reviewer_sig = next((sig for sig in signatures if 'Reviewer' in sig), None)
    if reviewer_sig is not None:
        return reviewer_sig.split('/')[-1]
    if any('Area_Chair' in sig for sig in signatures):
        return "Area Chair"
    return "Unknown"


def _note_text(content):
    """Return the reply body: the 'comment' field, falling back to 'review'."""
    text = content.get('comment', {}).get('value', '')
    if not text:
        # Some replies carry their body in the 'review' field instead.
        text = content.get('review', {}).get('value', '')
    return text


def get_full_conversation(client, venue_id, submission_invitation):
    """Generic collector: reviews + all discussion responses for a venue.

    Args:
        client: object exposing ``get_all_notes(invitation=...)`` and
            ``get_all_notes(forum=...)`` (the callers in this file pass an
            ``openreview.api.OpenReviewClient``).
        venue_id: venue identifier; accepted for interface compatibility but
            not used inside this function.
        submission_invitation: invitation id used to list the submissions.

    Returns:
        pandas.DataFrame with one row per review and the columns
        ``paper_number``, ``paper_title``, ``decision``, ``review_id``,
        ``rating``, ``confidence``, ``review_text`` and
        ``discussion_transcript``. The transcript holds every reply below the
        review, oldest first, or the string "None" when there are no replies.
    """
    print(f"1. Getting submission list ({submission_invitation})...")
    submissions = client.get_all_notes(invitation=submission_invitation)
    print(f"2. Found {len(submissions)} submissions.")

    all_reviews_data = []

    for note in tqdm.tqdm(submissions):
        paper_id = note.id
        paper_title = note.content.get('title', {}).get('value')
        paper_number = note.number

        # 1. Fetch every note in this paper's forum (reviews, rebuttals,
        #    meta-reviews, decision, ...).
        forum_notes = client.get_all_notes(forum=paper_id)

        # 2. Build the parent -> replies adjacency list and pick out the
        #    review notes and the decision note in the same pass.
        reply_map = defaultdict(list)
        reviews = []
        decision_note = None

        for n in forum_notes:
            if n.replyto:
                reply_map[n.replyto].append(n)

            if any('Decision' in inv for inv in (n.invitations or [])):
                decision_note = n

            if _is_review_note(n):
                reviews.append(n)

        # Decision text, or the string "None" when no decision note exists.
        decision_value = "None"
        if decision_note:
            decision_value = decision_note.content.get('decision', {}).get('value', 'None')

        # 3. For every review, collect the whole reply tree underneath it.
        for review in reviews:
            review_id = review.id
            review_content = review.content

            discussion_nodes = get_all_descendants(review_id, reply_map)

            # Order by creation time (not modification time) so that edited
            # replies do not jump ahead of or behind their neighbours.
            discussion_nodes.sort(key=_posting_time)

            if discussion_nodes:
                discussion_transcript = "\n\n".join(
                    f"[{_speaker_label(node.signatures)}]: {_note_text(node.content)}"
                    for node in discussion_nodes
                )
            else:
                discussion_transcript = "None"

            all_reviews_data.append({
                'paper_number': paper_number,
                'paper_title': paper_title,
                'decision': decision_value,
                'review_id': review_id,
                'rating': review_content.get('rating', {}).get('value'),
                'confidence': review_content.get('confidence', {}).get('value'),
                'review_text': review_content.get('review', {}).get('value'),
                # Full multi-turn conversation below the review.
                'discussion_transcript': discussion_transcript
            })

    return pd.DataFrame(all_reviews_data)


def get_iclr_2024_full_conversation():
    """Collect review threads for ICLR 2024.

    Builds a client pointed at https://api2.openreview.net and hands it to
    ``get_full_conversation`` together with the venue id and the venue's
    ``/-/Submission`` invitation; returns that call's result unchanged.
    """
    iclr_venue = "ICLR.cc/2024/Conference"
    api_client = openreview.api.OpenReviewClient(baseurl="https://api2.openreview.net")
    submission_invitation = f"{iclr_venue}/-/Submission"
    return get_full_conversation(api_client, iclr_venue, submission_invitation)


def get_colm_2024_full_conversation():
    """Collect review threads for COLM 2024.

    Builds a client pointed at https://api2.openreview.net and hands it to
    ``get_full_conversation`` together with the venue id and the venue's
    ``/-/Submission`` invitation; returns that call's result unchanged.
    """
    colm_venue = "colmweb.org/COLM/2024/Conference"
    api_client = openreview.api.OpenReviewClient(baseurl="https://api2.openreview.net")
    submission_invitation = f"{colm_venue}/-/Submission"
    return get_full_conversation(api_client, colm_venue, submission_invitation)


# --- Helper: recursively gather every descendant of a note ---
def get_all_descendants(root_id, reply_map):
    """
    Return every note below ``root_id`` in the reply tree (root excluded).

    ``reply_map`` maps a note id to the list of notes replying directly to it.
    The result is in depth-first pre-order: each reply is immediately followed
    by its own descendants before the next sibling appears.
    """
    def _walk(parent_id):
        # .get() keeps a defaultdict from growing new empty entries on lookup.
        for reply in reply_map.get(parent_id, []):
            yield reply
            yield from _walk(reply.id)

    return list(_walk(root_id))

if __name__ == "__main__":
    from pathlib import Path

    # Prepare the output location first, then fetch the COLM 2024 threads.
    out_dir = Path("COLM/2024")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "colm2024_full_transcript.csv"

    df = get_colm_2024_full_conversation()

    print("\n---------------- RESULTS ----------------")
    if df.empty:
        print("No data found.")
    else:
        print(f"Collected {len(df)} reviews with full conversation threads.")
        # Preview the first transcript that mentions a reviewer, cut to 500 characters.
        mask = df["discussion_transcript"].str.contains("Reviewer", na=False)
        if mask.any():
            print("\n--- Sample Conversation ---")
            print(f'{df.loc[mask, "discussion_transcript"].iloc[0][:500]}...\n')
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")

## Run COLM 2024 collection

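First run the install and function-definition cells above; otherwise the cell below fails with a `NameError`. Then run the cell below to fetch the reviews and all discussion replies for COLM 2024 and save them to `COLM/2024/colm2024_full_transcript.csv`. Make sure your working directory is the repo root (DatasetsPaper).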

In [1]:
from pathlib import Path

# Create the output folder up front, then collect the COLM 2024 review threads.
OUTPUT_DIR = Path("COLM/2024")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
out_path = OUTPUT_DIR / "colm2024_full_transcript.csv"

df_colm = get_colm_2024_full_conversation()

print("\n---------------- COLM 2024 RESULTS ----------------")
if df_colm.empty:
    print("No data found.")
else:
    print(f"Collected {len(df_colm)} reviews with full conversation threads.")
    # Show the first transcript that mentions a reviewer, cut to 500 characters.
    mask = df_colm["discussion_transcript"].str.contains("Reviewer", na=False)
    if mask.any():
        print("\n--- Sample conversation ---")
        print(f'{df_colm.loc[mask, "discussion_transcript"].iloc[0][:500]}...\n')
    df_colm.to_csv(out_path, index=False)
    print(f"Saved to {out_path}")

NameError: name 'get_colm_2024_full_conversation' is not defined