In [1]:
from bs4 import BeautifulSoup #Beautiful Soup is a library that makes it easy to scrape information from web pages
import os
import pandas as pd
from unidecode import unidecode
import urllib.request #Functions for opening URLs
from pathlib import Path
import math
import json

In [2]:
# Dictionary mapping review categories to their corresponding prompts
list_review_text = {
    "contribution": "Please describe the contribution of the paper",
    "strengths": "Please list the main strengths of the paper",
    "weakness": "Please list the main weaknesses of the paper",
    "clarity": "Please rate the clarity and organization of this paper",
    "reproducibility": "Please comment on the reproducibility of the paper",
    "detailed": "Please provide detailed and constructive comments for the authors",
    "rate": "Rate the paper on a scale of 1-8, 8 being the strongest",
    "justification": "Please justify your recommendation.",
    "number of paper": "Number of papers in your stack",
    "ranking": "What is the ranking of this paper in your review stack?",
    "confidence": "Reviewer confidence",
    "rate rebuttal": "[Post rebuttal] After reading the author’s rebuttal, state your overall opinion of the paper if it has been changed",
    "justification rebuttal": "[Post rebuttal] Please justify your decision",
}

# List of string categories for reviews
list_categories_str = ["contribution", "strengths", "weakness", "reproducibility", "detailed", "justification"]

# List of categories for reviews with scores
list_categories_scores = ["clarity", "rate", "confidence", "rate rebuttal"]

# Columns for the reviews data frame
columns_reviews = ["id", "category", "title", "review 1", "review 2", "review 3"]

# Columns for the statistics data frame
columns_statistics = ["id", "category", "title", "words1", "words2", "words3"]


In [3]:
def get_accepted_paper_list(year: str = "2023"):
    """
    Get the list of all html files from the chosen year
    """

    # Base URL of the MICCAI conference website
    miccai_website_path = "https://conferences.miccai.org"

    # Constructing the URL to access the list of papers for the specified year
    path_online_list = miccai_website_path + f"/{year}/papers/"

    # Opening the URL and reading its contents
    reponse = urllib.request.urlopen(path_online_list)
    contenu_web = reponse.read().decode('UTF-8')

    # Parsing the HTML content using BeautifulSoup
    soup = BeautifulSoup(contenu_web, "html.parser")

    # Finding all anchor ('a') tags in the HTML
    all = soup.find_all('a')

    # Creating a list of URLs for papers. It filters for links that end with 'html'
    paper_list = [(miccai_website_path + link.get('href')) for link in all if link.get('href').endswith('html')]

    # Returning the list of paper URLs
    return paper_list


In [4]:
def extract_reviews(paper:str, df_reviews:pd.DataFrame, df_stats:pd.DataFrame, df_scores:pd.DataFrame, year:str="2023", list_categories:list=["reproducibility"]):
    """
    Extract the 3 first reviews from a given paper webpage.

    Parameters
    ----------
    paper: str
        URL of the paper webpage.
    df_reviews: pd.DataFrame 
        DataFrame to store extracted reviews and code links.
    df_stats: pd.DataFrame
        DataFrame to store statistics related to reviews (e.g., word count).
    df_scores: pd.DataFrame
        DataFrame to store reviews containing scores.
    year: str, optional
        Year of the conference. Defaults to "2023".
    list_categories: list[str], optional
        Categories to extract reviews for. Defaults to "reproducibility".
    """

    # Open HTML webpage and extract with BeautifulSoup
    response = urllib.request.urlopen(paper)
    content_web = response.read().decode('UTF-8')
    soup = BeautifulSoup(content_web, "html.parser")

    # Extract paper title and ID from the webpage
    if year == "2023":
        paper_title = soup.find("title").get_text().rstrip("MICCAI 2023 - Accepted Papers, Reviews, Author Feedback").rstrip(' |')
    elif year == "2022":
        paper_title = soup.find("title").get_text().rstrip("MICCAI 2022 - Accepted Papers and Reviews").rstrip(' |')
    paper_id = Path(paper).name[:13]

    # Extract code link from the webpage
    code_link = soup.find(id="code-id")
    code_link = code_link.find_next("p").get_text()
    code_link = code_link.replace("\n", " ")

    # Store code link information in the DataFrame
    df_reviews.loc[(paper_id, unidecode(paper_title)), ("code link", "code link")] = code_link

    # Loop through specified categories to extract reviews
    for category in list_categories:
        text = list_review_text[category]

        # Find paragraphs containing reviews for the specified category
        repro_reviews_paragraph = soup.find_all(lambda tag: tag.name == "li" and text in tag.text)
        repro_exact_text = soup.find(lambda tag: tag.name == "strong" and text in tag.text).get_text()

        i = 0
        # Loop through the first 3 reviews for the specified category
        for review in repro_reviews_paragraph[:3]:
            i += 1
            tmp_review = unidecode(review.get_text().strip(repro_exact_text))
            tmp_review = tmp_review.strip("\n")
            tmp_review = tmp_review.replace("\n          \n", " ")
            tmp_review = tmp_review.replace("\t", " ")
            tmp_review = tmp_review.replace("\n\n\n\n", " ")
            tmp_review = tmp_review.replace("\n\n\n", " ")
            tmp_review = tmp_review.replace("\n\n", " ")
            tmp_review = tmp_review.replace("\n", " ")
            tmp_review = tmp_review.replace("\t", " ")

            # Store review information in the appropriate DataFrame
            if category in list_categories_str:
                df_reviews.loc[(paper_id, unidecode(paper_title)), (category, f"review {i}")] = tmp_review
                df_stats.loc[(paper_id, unidecode(paper_title)), (category, f"review {i}")] = len(str(tmp_review).split())
            elif category in list_categories_scores:
                df_scores.loc[(paper_id, unidecode(paper_title)), (category, f"review {i}")] = tmp_review


In [5]:

def extract_paragraph(paper_list:list, year="2023"):
    """
    Extract the 3 first reviews of all paper from the given list.

    Return 3 DataFrames:
        - df_reviews: DataFrame containing the 3 first reviews of all papers
        - df_stats: DataFrame containing the number of words for each review of df_reviews
        - df_scores: DataFrame containing the scores given by the 3 first reviewers of all papers.
    """
    iterables_str = [list_categories_str, ["review 1", "review 2", "review 3"]]
    iterables_score = [list_categories_scores, ["review 1", "review 2", "review 3"]]

    index_line_str = pd.MultiIndex.from_product(iterables_str, names=["category", "review"])
    index_line_score = pd.MultiIndex.from_product(iterables_score, names=["category", "review"])
    index_column = pd.MultiIndex.from_product([[], []], names=["id", "title"])

    df_reviews = pd.DataFrame(index=index_column, columns=index_line_str)
    df_stats = pd.DataFrame(index=index_column, columns=index_line_str)
    df_scores = pd.DataFrame(index=index_column, columns=index_line_score)

    for i, paper in enumerate(paper_list):
        print(f"Processing paper {i + 1}/{len(paper_list)}", end='\r', flush=True)
        list_categories = list_categories_str + list_categories_scores
        extract_reviews(paper, df_reviews, df_stats, df_scores, year, list_categories)

    return df_reviews, df_stats, df_scores


In [6]:
def count_total_words(df_words, year_="2023", output_directory="results"):
    """
    Count total number of words of each review
    by summing the number of words of each category

    Parameters
    ----------
    df_words: pd.DataFrame
        DataFrame containing reviews and their word counts
    year_: str 
        Year for which the function is being applied (default: "2023")
    output_directory: str 
        Directory to save the results CSV file (default: "results")
    """

    # Define the path for the output CSV file
    path_words_count = output_directory / 'count_words.csv'

    # Iterate through unique IDs in the DataFrame
    for id, id_df in df_words.groupby(level=0):
        # Iterate through titles associated with the current ID
        for _, title in id_df.index.values:
            # Initialize word count for each of the three reviews
            for i in range(1, 4):
                df_words.loc[(id, title), ("total", f"review {i}")] = 0

            # Iterate through categories in the list_categories_str
            for category in list_categories_str:
                # Sum the words from each category for each review
                for i in range(1, 4):
                    if math.isnan(df_words.loc[(id, title), (category, f"review {i}")]):
                        # Handle NaN values by replacing them with 0
                        df_words.loc[(id, title), (category, f"review {i}")] = 0
                    df_words.loc[(id, title), ("total", f"review {i}")] += df_words.loc[(id, title), (category, f"review {i}")]

    # Sort the resulting DataFrame and save it to a CSV file
    df_words.sort_index(axis=1, ascending=True, inplace=True)
    df_words.to_csv(path_words_count, index=True, sep="\t", encoding='utf-8')


In [7]:
def get_repro_copy_paste(df_all_reviews, output_directory, threshold: int = 10):
    """
    Find reviewers that have copy/paste the reproducibility review in another category.

    Parameters
    ----------
    df_all_reviews: pd.DataFrame
        DataFrame containing reviews for each paper
    output_directory: str 
        Directory to save the results CSV file
    threshold: int 
        Minimum word count for a review to be considered (default: 10)
    """
    from copy import copy

    # Create a deep copy of the input DataFrame to preserve the original data
    df_all_reviews_wo_copy_paste = copy(df_all_reviews)

    # Initialize a new DataFrame to store reviews with copy/paste
    df_bad_reviews = pd.DataFrame(columns=columns_reviews)
    df_bad_reviews.set_index(["id", "category"], inplace=True)

    # Iterate through unique IDs in the DataFrame
    for id, id_df in df_all_reviews.groupby(level=0):
        # Iterate through titles associated with the current ID
        for _, title in id_df.index.values:
            # Iterate through categories in list_categories_str
            for category in list_categories_str:
                # Check if the category is not "reproducibility"
                if category != "reproducibility":
                    # Iterate through review indices (1 and 2)
                    for i in range(1, 3):
                        # Extract reproducibility and category reviews
                        repro = id_df.loc[(id, title), ("reproducibility", f"review {i}")]
                        cate = id_df.loc[(id, title), (category, f"review {i}")]

                        # Check if the reproducibility review is long enough and is a substring of the category review
                        if len(str(repro).split()) >= threshold and str(repro) in str(cate):
                            # Store the problematic reviews in the df_bad_reviews DataFrame
                            df_bad_reviews.loc[(id, category), "title"] = title
                            df_bad_reviews.loc[(id, category), f"review {i}"] = id_df.loc[(id, title), (category, f"review {i}")]
                            df_bad_reviews.loc[(id, category), "reproducibility"] = id_df.loc[(id, title), ("reproducibility", f"review {i}")]

                            # Drop the row from the df_all_reviews_wo_copy_paste DataFrame (not used currently)
                            try:
                                df_all_reviews_wo_copy_paste.drop((id, title), inplace=True)
                            except:
                                pass

    # Save the results to CSV files
    df_bad_reviews.to_csv(os.path.join(output_directory, f'reviews_copy_paste_{threshold}.csv'), index=True, sep="\t", encoding='utf-8')


In [8]:
def count_checklist(df_all_reviews, output_directory, category="reproducibility"):
    """
    Find reviewers that have mentioned the word 'checklist' in the reproducibility review.

    Parameters
    ----------
    df_all_reviews: pd.DataFrame
        DataFrame containing reviews for each paper
    output_directory: str 
        Directory to save the results CSV file
    category: str 
        The review category to analyze (default: "reproducibility")
    """
    # Initialize a new DataFrame to store reviews mentioning 'checklist'
    df_checklist = pd.DataFrame(columns=columns_reviews)
    df_checklist.set_index(["id", "title"], inplace=True)

    # Iterate through unique IDs in the DataFrame
    for id, id_df in df_all_reviews.groupby(level=0):
        # Iterate through titles associated with the current ID
        for _, title in id_df.index.values:
            # Iterate through review indices (1 and 2)
            for i in range(1, 3):
                # Extract the review text from the specified category
                review = str(df_all_reviews.loc[(id, title), (category, f"review {i}")])

                # Check if the review mentions 'checklist' in various formats
                if ("check-list" in review) or ("checklist" in review) or ("check list" in review):
                    # Store the problematic reviews in the df_checklist DataFrame
                    df_checklist.loc[(id, title), f"review {i}"] = df_all_reviews.loc[(id, title), (category, f"review {i}")]
                    df_checklist.loc[(id, title), "category"] = category

    # Save the results to a CSV file
    df_checklist.to_csv(os.path.join(output_directory, f'{category}_checklist_reviews.csv'), index=True, sep="\t", encoding='utf-8')


In [9]:
def create_rating_excel(df_all_reviews, output_directory,year: str = "2023"):
    """
    Save the excel file with the reproducibility reviews to create the rating file
    """
    df_repro_excel = df_all_reviews.loc[:, ("reproducibility")]
    df_repro_excel.to_excel(os.path.join(output_directory ,f'reviews_reproducibility_{year}.xlsx'))

In [10]:
# Note: Extraction may take 5-10 minutes

# Default year is 2023
# To change to another year, modify the variable below
year_ = "2023"

# Get the list of HTML pages for the different papers
paper_list = get_accepted_paper_list(year=year_)

# Set output directories and files for the extraction
output_directory = Path(f"../miccai{year_}")
output_directory = Path("../miccaitest")
if not output_directory.is_dir():
    os.mkdir(output_directory)

csv_directory = output_directory / "extract-csv"
if not csv_directory.is_dir():
    os.mkdir(csv_directory)

path_all_reviews = csv_directory / 'reviews.csv'
path_all_stats = csv_directory / 'count_words.csv'
path_all_scores = csv_directory / 'scores.csv'

# Skip the extraction if the files already exist
# To rerun the extraction, remove the directory "miccaiYYYY" where YYYY is the year
if (not path_all_reviews.is_file()) or (not path_all_stats.is_file()) or (not path_all_scores.is_file()):
    print(f"Extract reviews and count words for year {year_}")
    
    # Extract reviews, stats, and scores
    df_all_reviews, df_all_stats, df_all_scores = extract_paragraph(paper_list, year_)
    
    # Save the results to CSV files
    df_all_scores.to_csv(path_all_scores, index=True, sep="\t", encoding='utf-8')
    df_all_reviews.to_csv(path_all_reviews, index=True, sep="\t", encoding='utf-8')
    df_all_stats.to_csv(path_all_stats, index=True, sep="\t", encoding='utf-8')

else:
    print(f"Files already exist")
    print(f"Skipping extraction and importing existing CSV from {output_directory}")
    print(f"If you want to rerun the extraction, delete the directory miccaiYYYY where YYYY is the year")
    
    # Load existing CSV files
    df_all_reviews = pd.read_csv(path_all_reviews, sep="\t", header=[0, 1], index_col=[0, 1], skip_blank_lines=True)
    df_all_stats = pd.read_csv(path_all_stats, sep="\t", header=[0, 1], index_col=[0, 1], skip_blank_lines=True)


Extract reviews and count words for year 2023
Processing paper 730/730

In [11]:
print(f"Processing year {year_} and outputting result in {output_directory}")
# Count the total number of words in each review
# Extracted results are saved in 'count_words.csv'
count_total_words(df_words=df_all_stats, year_=2023, output_directory=csv_directory)

print("Identify reproducibility reviews which contain copy/paste from other parts (need to have more than 10 consecutive words in common)")
# Find reviewers that have copy/pasted the reproducibility review in another category
# Results are saved in 'reviews_copy_paste_{threshold}.csv'
get_repro_copy_paste(df_all_reviews=df_all_reviews, output_directory=csv_directory)

print("Count the number of reviews that mention the checklist")
# Find reviews that mention the word 'checklist' in the reproducibility review
# Results are saved in 'reproducibility_checklist_reviews.csv'
count_checklist(df_all_reviews=df_all_reviews, output_directory=csv_directory, category="reproducibility")

human_rating_dir = '../human_rating/'
print("Create excel files for human rating")
# Create Excel files for human rating
create_rating_excel(df_all_reviews=df_all_reviews, output_directory=human_rating_dir, year=year_)


Processing year 2023 and outputting result in ../miccaitest
Identify reproducibility reviews which contain copy/paste from other parts (need to have more than 10 consecutive words in common)
Count the number of reviews that mention the checklist
Create excel files for human rating
