# Import, load and preprocess the data

In [1]:
import pandas as pd
import numpy as np
import json, os, time

from llm import LLM, Envs
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


from hc import name_clusters_with_topics, get_representative_docs, get_cluster_keywords, hdbscan_kmeans_refine, robust_json_parse
from hc import build_cluster_hierarchy, print_clusters, iterative_higher_level_clustering


llm = LLM (
    # model="gpt-4.5-preview",
    model = "openai/gpt-4.1",
    llm_env = Envs.OPENROUTER.value
)





In [2]:
df = pd.read_csv('tp_2020conference.csv')
df.columns

Index(['title', 'publish_time', 'abstract', 'keyword', 'tL_DL', 'titlelength',
       'paper_decision_time', 'paper_decision', 'paper_decision_comment',
       'paper_decision_commentlength', 'review_publish_time', 'reviewer_level',
       'paper_score', 'rate', 'review_score_one', 'review_score_two',
       'review_score_three', 'review', 'review_contentlength'],
      dtype='object')

In [3]:
df.columns #= ['id', 'title', 'abstract', 'keywords', 'authors', 'venue', 'year

Index(['title', 'publish_time', 'abstract', 'keyword', 'tL_DL', 'titlelength',
       'paper_decision_time', 'paper_decision', 'paper_decision_comment',
       'paper_decision_commentlength', 'review_publish_time', 'reviewer_level',
       'paper_score', 'rate', 'review_score_one', 'review_score_two',
       'review_score_three', 'review', 'review_contentlength'],
      dtype='object')

In [3]:
df['paper_decision'].value_counts()

paper_decision
Decision:###Reject                4638
Decision:###Accept (Poster)       1588
Decision:###Accept (Spotlight)     320
Decision:###Accept (Talk)          145
Name: count, dtype: int64

In [4]:
decision_to_score = {
    'Decision:###Reject': 0,
    'Decision:###Accept (Poster)': 1,
    'Decision:###Accept (Spotlight)': 2,
    'Decision:###Accept (Talk)': 3
}
# Map decisions to scores
df['paper_decision_numeric'] = df['paper_decision'].map(decision_to_score)

In [5]:
# number of distinct titles
df['paper_decision_numeric'].value_counts()


paper_decision_numeric
0.0    4638
1.0    1588
2.0     320
3.0     145
Name: count, dtype: int64

In [6]:
# df with only accpetance and review
df_accept = df[df['paper_decision_numeric'] > 0][['title', 'review', 'paper_decision_comment','paper_decision_numeric' ]]

# df with only rejection and review
df_reject = df[df['paper_decision_numeric'] == 0][['title', 'review', 'paper_decision_comment','paper_decision_numeric' ]]

In [7]:
df[:7500]['paper_decision_numeric'].value_counts()

paper_decision_numeric
0.0    4638
1.0    1588
2.0     199
Name: count, dtype: int64

# LLM only

In [42]:
from jinja2 import Template
import pandas as pd



# Jinja2 template with your column names
template_str = """
Analyze the following paper reviews and their decision scores (0 = reject, higher numbers = stronger acceptance). Provide the main reasons for acceptance or rejection for each paper in a concise summary.

{% for index, row in reviews.iterrows() %}
Paper {{ index + 1 }}:
Review: {{ row.review }}
Decision Score: {{ row.paper_decision_numeric }}
{% endfor %}
"""
template = Template(template_str)


# Render the prompt with dataframe data
prompt = template.render(reviews=df[:500])


response = llm._call(prompt)
print("LLM Response:\n", response)

2025-04-15 23:28:24,953 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response:
 Below is a concise summary of the main reasons for acceptance or rejection for each paper, based on the reviews and decision scores:

---

**Paper 1 (Score: 0.0, Reject)**
- *Main reasons for rejection:*  
  - Lack of rigorous experimentation: Does not compare to some relevant previous works.
  - Limited scope: Evaluated only on image classification; no evidence for applicability to other domains.
  - Results are only marginally better than competitors and inconsistently analyzed.
  - Missing ablation studies and deeper/theoretical analysis.

---

**Paper 2 (Score: 0.0, Reject)**
- *Main reasons for rejection:*  
  - Experiments do not show state-of-the-art results.
  - Evaluated only on small standard datasets, lacking larger or more difficult examples.
  - Algorithm design and convergence properties are unclear and poorly justified.
  - Missing theoretical guarantees and inadequate explanation of key procedures.

---

**Paper 3 (Score: 0.0, Reject)**
- *Main reasons fo

In [43]:
from jinja2 import Template

# Simulated previous LLM response (replace with your actual response)

# Jinja2 template for the new prompt
template_str = """
You are an expert in analyzing academic paper reviews. Below is a list of reviews for multiple papers, each with main reasons for acceptance or rejection. Your task is to:

1. Identify the top ten reasons for acceptance and the top ten reasons for rejection across all papers.
2. For each reason, provide a brief desription of its importance in the determination of acceptance or rejection.

Do not base your analysis solely on frequency. Try to determine how important each reason is in the context of the paper reviews.
Base your decision solely on the reviews provided. Do not include any external information or personal opinions or prior knowledge you have on how papers are evaluated.

**Input Response:**
{{ previous_response }}

**Output Format:**
Provide your answer in this exact format:

**Top 10 Reasons for Acceptance:**
1. [Reason] (Frequency: [N])
   - Importance: [Brief explanation]
2. ...

**Top 10 Reasons for Rejection:**
1. [Reason] (Frequency: [N])
   - Importance: [Brief explanation]
2. ...

Ensure your analysis is accurate, groups similar reasons (e.g., "lacks clarity" and "poor presentation" if related), and prioritizes clarity in ranking.
Describe how you arrived at determining the reasons and their importance.
"""
template = Template(template_str)

# Render the prompt
prompt = template.render(previous_response=response)



# Make the LLM call
reasons_oneshot = llm._call(prompt)
print("LLM Response:\n", reasons_oneshot)

2025-04-15 23:29:10,994 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response:
 Certainly! Below is an analytical summary, as requested, of the **top 10 reasons for acceptance** (inferred, since no acceptances are present, these are reverse-engineered from the rejections as their opposites) and the **top 10 reasons for rejection** (based directly on the provided reviews). 

Reasons are grouped and ranked by assessing their centrality to the decision (i.e., whether papers lacking a feature are almost always rejected), with frequency and explicit reviewer emphasis both taken into account. Synonymous or closely related reasons are combined for clarity and focus.

---

**Top 10 Reasons for Acceptance:**  
*(Inferred from the most common and important grounds for rejection, as acceptances were not given in the summaries)*

1. **Strong Technical Novelty and Contribution**
   - Importance: A paper presenting genuinely new ideas or techniques, rather than incremental or trivial extensions, is often seen as necessary for publication since it advances the fie

### Formatted output
See output at : https://docs.google.com/document/d/122jHdVx9C_YkVQx6_Fqm2NNKev2YOgLN1Jb6JeeNa9E/edit?usp=sharing

## with a balanced sample of papers with different acceptance scores

In [43]:
from jinja2 import Template
import pandas as pd

# get a sample of df into a sampled_df stratified by the field paper_decision_numeric
from sklearn.model_selection import train_test_split
#pick only where paper_decision_numeric is not null
df = df[df['paper_decision_numeric'].notnull()]
balanced_df, ydf = train_test_split(df, test_size=0.9, stratify=df['paper_decision_numeric'], random_state=42)
balanced_df = balanced_df.sample(n=400, random_state=42)

# Jinja2 template with your column names
template_str = """
Analyze the following paper reviews and their decision scores (0 = reject, higher numbers = stronger acceptance). Provide the main reasons for acceptance or rejection for each paper in a concise summary.

{% for index, row in reviews.iterrows() %}
Paper {{ index + 1 }}:
Review: {{ row.review }}
Decision Score: {{ row.paper_decision_numeric }}
{% endfor %}
"""
template = Template(template_str)


# Render the prompt with dataframe data
prompt = template.render(reviews=balanced_df)


response = llm._call(prompt)
print("LLM Response:\n", response)

2025-04-16 10:21:01,309 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response:
 Here is a concise summary of the main reasons for acceptance or rejection for each paper, based on the provided reviews and decision scores:

---

**Paper 1483**  
Decision: 0.0 (Reject)  
**Main reason:**  
- Insufficient justification of technical design choices (function forms, hyperparameters), unconvincing arguments for the “global score” concept, lack of theoretical explanation for fast runtime, and evaluation limited to intrinsic measures; needs extrinsic benchmarks and more principled analysis.

---

**Paper 1091**  
Decision: 0.0 (Reject)  
**Main reason:**  
- Marginal improvements on standard benchmarks; motivation tenuous; unclear where gains originate (possible confounds from other modifications); insufficient ablation studies and qualitative analysis; in need of broader application tests and clearer motivation.

---

**Paper 2108**  
Decision: 0.0 (Reject)  
**Main reason:**  
- Assumptions (especially unbiased stochastic subgradients) possibly do not hold,

In [None]:
from jinja2 import Template


# Jinja2 template for the new prompt
template_str = """
You are an expert in analyzing academic paper reviews. Below is a list of reviews for multiple papers, each with main reasons for acceptance or rejection. Your task is to:

1. Identify the top ten reasons for acceptance and the top ten reasons for rejection across all papers.
2. For each reason, provide a brief desription of its importance in the determination of acceptance or rejection.

Do not base your analysis solely on frequency. Try to determine how important each reason is in the context of the paper reviews.
Base your decision solely on the reviews provided. Do not include any external information or personal opinions or prior knowledge you have on how papers are evaluated.

**Input Response:**
{{ previous_response }}

**Output Format:**
Provide your answer in this exact format:

**Top 10 Reasons for Acceptance:**
1. [Reason] (Frequency: [N])
   - Importance: [Brief explanation]
2. ...

**Top 10 Reasons for Rejection:**
1. [Reason] (Frequency: [N])
   - Importance: [Brief explanation]
2. ...

Ensure your analysis is accurate, groups similar reasons (e.g., "lacks clarity" and "poor presentation" if related), and prioritizes clarity in ranking.
Describe how you arrived at determining the reasons and their importance.
"""
template = Template(template_str)

# Render the prompt
prompt = template.render(previous_response=response)



# Make the LLM call
reasons_oneshot = llm._call(prompt)
print("LLM Response:\n", reasons_oneshot)

2025-04-16 10:24:46,333 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response:
 **Top 10 Reasons for Acceptance:**

1. **Novelty and Originality of Contribution** (Frequency: High)
   - Importance: Introducing genuinely new concepts, methods, or frameworks significantly influences acceptance, as it advances the field or enables previously impossible approaches. Reviewers value creative ideas that go beyond incremental modifications.

2. **Strong Empirical or Theoretical Results/Analysis** (Frequency: High)
   - Importance: Solid, rigorous experimental or theoretical backing demonstrates the effectiveness and correctness of a proposed method, contributing to the credibility and utility of the research.

3. **Clear Motivation and Relevance of Problem** (Frequency: Medium)
   - Importance: Papers that tackle important or under-explored questions—especially those with real-world applications or theoretical significance—add value and are seen as deserving of acceptance.

4. **Thorough and Convincing Analysis (Including Ablation and Comparisons)** (Freque

2025-04-16 10:24:46,333 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
LLM Response:

### Top 10 Reasons for Acceptance

1. **Novelty and Originality of Contribution** (Frequency: High)
   - Importance: Introducing genuinely new concepts, methods, or frameworks significantly influences acceptance, as it advances the field or enables previously impossible approaches. Reviewers value creative ideas that go beyond incremental modifications.

2. **Strong Empirical or Theoretical Results/Analysis** (Frequency: High)
   - Importance: Solid, rigorous experimental or theoretical backing demonstrates the effectiveness and correctness of a proposed method, contributing to the credibility and utility of the research.

3. **Clear Motivation and Relevance of Problem** (Frequency: Medium)
   - Importance: Papers that tackle important or under-explored questions—especially those with real-world applications or theoretical significance—add value and are seen as deserving of acceptance.

4. **Thorough and Convincing Analysis (Including Ablation and Comparisons)** (Frequency: Medium)
   - Importance: Careful experimental design, comprehensive ablations, and meaningful comparisons to baselines show that improvements are attributable to the proposed work rather than confounding factors.

5. **Methodological Soundness and Justification** (Frequency: Medium)
   - Importance: Strong justification of technical choices and robust methodological design ensure that the proposed work is credible and not based on heuristics or weak assumptions.

6. **Significant Empirical Contribution (Datasets, Benchmarks, Evaluation)** (Frequency: Medium)
   - Importance: Efforts to provide new resources (datasets, benchmarks, open-source code) or especially thorough empirical studies are highly valued for their utility to the community and reproducibility.

7. **Successful Application to Challenging or Realistic Scenarios** (Frequency: Low-Medium)
   - Importance: Demonstrated success on challenging, diverse, or real-world benchmarks increases the perceived impact and usefulness of the proposed approach.

8. **Resolution of Reviewers’ Concerns Post-Rebuttal** (Frequency: Low)
   - Importance: Authors who adequately address critical issues raised during peer-review (e.g., clarify writing, add missing experiments) can tip borderline papers toward acceptance.

9. **Clarity and Quality of Presentation** (Frequency: Low)
   - Importance: Clear writing, well-structured arguments, and transparency in reporting results make contributions more accessible and trustworthy, although this tends to be secondary to content.

10. **Well-Motivated Incremental Improvements with Practical Impact** (Frequency: Low)
    - Importance: Even if the technical novelty is limited, presenting well-executed incremental improvements with clear practical value can still merit acceptance, especially if performance gains are significant and well-demonstrated.

---

### Top 10 Reasons for Rejection

1. **Insufficient Novelty / Incremental Contribution** (Frequency: Very High)
   - Importance: Papers that only offer marginal improvements or trivial modifications fail to move the field forward and are a primary reason for rejection.

2. **Weak or Unconvincing Empirical Results (Limited Benchmarks, Weak Baselines, Marginal Gains)** (Frequency: Very High)
   - Importance: Without strong evidence that a method works, or if improvements are not substantial or well-demonstrated, reviewers cannot endorse publication.

3. **Poor Comparison to State-of-the-art and Incomplete Baselines** (Frequency: High)
   - Importance: Failing to compare with existing leading methods makes it impossible to judge true value, suggesting either ignorance of the literature or lack of competitiveness.

4. **Lack of Theoretical Justification or Methodological Soundness** (Frequency: High)
   - Importance: Methods based on unjustified heuristics, dubious assumptions, or poorly explained choices are not convincing as scientific contributions.

5. **Insufficient or Poorly Described Experimental Analysis** (Frequency: High)
   - Importance: Limited, poorly documented, or unclear empirical evaluation (omitting essential details or critical experiments) undermines result validity and reproducibility.

6. **Unclear, Confusing, or Poor Presentation (Notation, Writing, Motivation)** (Frequency: High)
   - Importance: If key concepts, methods, or results are not clearly communicated, reviewers cannot properly assess the work, leading to rejection on the grounds of inaccessibility.

7. **Failure to Address Real-world Applicability or Limitations** (Frequency: Medium)
   - Importance: Overfocus on contrived, synthetic, or simplistic cases, or ignoring practical constraints, suggests the work has little impact beyond toy examples.

8. **Lack of Comparative or Qualitative Analysis (No Ablations or Analysis of Where Gains Come From)** (Frequency: Medium)
   - Importance: Not analyzing why or when a method works raises doubts as to its scientific merit and prevents drawing useful conclusions.

9. **Technical or Methodological Flaws (Unproven Claims, Incorrect Formalism, Flawed Assumptions)** (Frequency: Medium)
   - Importance: Foundational issues such as incorrect mathematical statements, unjustified assumptions, or errors in derivation mean the core contribution is unsound.

10. **Limited Scope, Impact, or Generality (Narrow Applicability, Simple Datasets, Lack of Breadth)** (Frequency: Medium)
    - Importance: If a method works only in very specific or unchallenging settings, or has unclear broader benefits, the overall contribution is limited.

---

### How These Were Determined

I synthesized themes across the provided summaries, clustering similar issues (e.g., "incremental improvement," "limited novelty," "marginal gains") under broader reasons. I prioritized reasons based on their criticality in peer review context: strong novelty, rigorous and meaningful results, soundness, and competitive evaluation are vital, while presentation, though important, is usually only a decisive factor when content is borderline. Many rejections cite multiple deficiencies; I emphasized those deemed “main reasons” or those directly phrased as decisive. For acceptance, the focus was on key strengths explicitly mentioned as justifying the positive score.

Frequency counts are secondary to the actual impact of each reason as signaled in the decisions. For instance, even if "presentation" is cited often, it rarely overrides fundamental flaws in novelty or results. Each reason is briefly annotated to explain why it tends to sway decisions.

# Map, then summarize

In [44]:
REJECTED_FEATURE_EXTRACTION_PROMPT = """
You are a data analyst. Your task is to extract the main negative comment from the review and any additional comments.
our input is a review of a paper that was rejected.
Here is the review text:
<REVIEW>
{{content}}
</REVIEW>
Please extract the main limitation as perceived by the reviewer, and any additional limitations and negative aspects.
Begin by returing your reasoning, then return the main limitation and any additional limitations.
Keep the reasoning and the answers brief, below 20 words.
Return the result in JSON format:
{   "reasoning": "<reasoning>",
    "main_limitation": "<main negative comment>",
    "additional_limitations": "<additional comments>"
}

"""

In [45]:
# for each review, extract the main negative comment and any additional comments

import time
from jinja2 import Template
template = Template(REJECTED_FEATURE_EXTRACTION_PROMPT)
answers = []
i = 0
for index, row in df_reject[:10].iterrows():
    review = row['review']
    prompt = template.render(content=review)
    try:
        raw_response = llm._call(prompt)
        response = json.loads(raw_response)

    except Exception as e:
        print(f"Error: {e}")
        print (f"Response: {response}")

    main_limitation = response['main_limitation']
    additional_limitations = response['additional_limitations']
    df_reject.at[index, 'main_limitation'] = main_limitation
    df_reject.at[index, 'additional_limitations'] = additional_limitations
    answers.append(response)
    if i % 10 == 0:
        print(f"Processed {i} reviews")
    i += 1


2025-04-15 23:29:29,915 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processed 0 reviews


2025-04-15 23:29:32,970 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:34,809 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:36,788 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:39,994 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:41,877 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:43,716 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:46,226 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:48,944 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-15 23:29:51,595 - INFO - HTTP Request: POST https://openrouter.a

In [None]:
# save df_reviews
# df_reject.to_csv('reject_reviews.csv', index=False)
# df_reject
# 
# df = pd.read_csv('reject_reviews.csv')
# df['main_limitation'].value_counts()  

main_limitation
None mentioned.                                                                                                                        2
None explicitly stated.                                                                                                                2
Lack of originality and significance; most results are known and not sufficiently distinguished from prior work.                       1
The title and abstract are misleading; paper doesn't introduce a new sampling strategy, only theoretical bounds for existing ones.     1
The bound has exponential dependence on the number of layers, making it less practical for real-world applications.                    1
                                                                                                                                      ..
Experiments are not convincing and the modification doesn’t provide much insight into representation learning/anomaly detection.       1
The experimental evaluati

## Path 1: Cluster and name, then build a hierarchy

In [52]:
df_reject = pd.read_csv('reject_reviews.csv')

In [76]:

start = time.time()
df_reject['text'] = df_reject['main_limitation'].astype(str)
dfr = df_reject[:1000]
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2))
X = vectorizer.fit_transform(dfr['text']).toarray()

print(f'Vectorization took {time.time() - start} seconds')
start = time.time()



cluster_labels = hdbscan_kmeans_refine(
    X,
    hdbscan_min_cluster_size=10,
    max_cluster_size=50,
    split_k=5,
    max_total_clusters=1000,
    random_state=42,
    minimum_cluster_size=5)
print(f'Clustering took {time.time() - start} seconds')
dfr['cluster_main_limitation'] = cluster_labels

Vectorization took 0.03429985046386719 seconds
Clustering took 3.276634931564331 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['cluster_main_limitation'] = cluster_labels


### Name clusters

In [77]:


cluster_keywords, c_tfidf, count_vectorizer = get_cluster_keywords(
    dfr,'main_limitation',
    'cluster_main_limitation')

count_vectorizer = CountVectorizer(
    stop_words='english',max_features=1000,
    ngram_range=(1, 2))
count_vectorizer.fit(dfr['main_limitation'])
docs = get_representative_docs(dfr, 'main_limitation', 'cluster_main_limitation', cluster_keywords, count_vectorizer)




In [78]:
NAMING_PROMPT_TEMPLATE = """
You are a data analyst. Your task is to name the clusters based on the keywords and the representative documents.
Here are the keywords for each cluster:
<KEYWORDS>
{{keywords}}
</KEYWORDS>
Here are the representative documents for each cluster:
<REPRESENTATIVE_DOCS>
{{representative_docs}}
</REPRESENTATIVE_DOCS>
Output a JSON object with only one key called 'cluster_name', to denote the title of the group, with no additional information or commentary.
Do not leave the JSON empty. Always return a value for 'cluster_name' with your best guess at a descriptive title for the topic.
Be as specific as possible and mimic the style of the representative documents.
The JSON format is as follows:
    {   
        "cluster_name": <NAME>
    } 
    
"""

cmap = name_clusters_with_topics(dfr, llm, 'cluster_main_limitation', cluster_keywords, docs,  prompt_template=NAMING_PROMPT_TEMPLATE)



  0%|          | 0/55 [00:00<?, ?it/s]2025-04-16 11:02:30,301 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  2%|▏         | 1/55 [00:03<02:46,  3.08s/it]2025-04-16 11:02:33,048 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  4%|▎         | 2/55 [00:04<02:02,  2.32s/it]2025-04-16 11:02:34,885 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  5%|▌         | 3/55 [00:06<01:51,  2.14s/it]2025-04-16 11:02:36,709 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  7%|▋         | 4/55 [00:08<01:37,  1.91s/it]2025-04-16 11:02:38,334 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  9%|▉         | 5/55 [00:11<01:50,  2.21s/it]2025-04-16 11:02:40,983 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
 11%|█         | 6/55 [00:14<02:07, 

In [81]:
dfr['_level_0'] = dfr['cluster_main_limitation'].map(cmap).fillna('NO_NAME')
dfr['_level_0'] = dfr['_level_0'].apply(robust_json_parse)
def get_dict_item(x):
    try:
        value = list(x.values())[0]
        return value
    except Exception as e:
        print(f"Error: {e}")
        print (f"Response: {x}")
        return x
dfr['_level_0'] = dfr['_level_0'].apply(get_dict_item)
dfr['_level_0']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['_level_0'] = dfr['cluster_main_limitation'].map(cmap).fillna('NO_NAME')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['_level_0'] = dfr['_level_0'].apply(robust_json_parse)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['_level_0'] = dfr['_level_0'].apply(get_dict_item)


0      Assessment of Novelty and Rigor through Compar...
1      Robustness Analysis of Deep Neural Networks ag...
2      Critiques of Theoretical Paper Clarity and Exp...
3      Novel Approaches and Incremental Advances in C...
4      Assessment of Novelty and Significance in Rela...
                             ...                        
995    Assessment of Methodological Novelty and Techn...
996    Evaluation and Benchmarking of Clustering Adap...
997    Assessment of Novel Model Integration Techniqu...
998    Assessment of Novelty and Rigor through Compar...
999    Critiques of Incremental Optimization Methods ...
Name: _level_0, Length: 1000, dtype: object

In [80]:
dfr.to_csv('reject_reviews_w_base_cluster.csv', index=False)

### Raw reasons

In [82]:
for reason in dfr['main_limitation'].unique():
    print(reason)

Lack of rigorous experimentation and insufficient evaluation of generalizability across datasets and comparison with prior work.
Experiments do not show state-of-the-art results, especially on standard datasets like MNIST and CIFAR.
Paper gives very little theoretical justification or analysis of the results.
The contribution of this paper is very incremental.
Concern about the novelty of the ideas, describing them as straightforward after considering recent related work.
The paper misses very relevant prior papers with similar ideas, failing to properly position its contribution.
Maximal safe set is only illustrated for one CIFAR-100 case, which is not convincing.
It is not shown that the PE score can improve results of image generation tasks, such as super-resolution.
Correlation between PE score and human judgement is unsurprising because CSF already encodes human perceptual characteristics.
Experiments are anecdotal, incomplete, and not very exhaustive.
Unclear connection between p

### Clusters, Level zero

In [83]:
for reason in dfr['_level_0'].unique():
    print(reason)

Assessment of Novelty and Rigor through Comparison with Prior Work
Robustness Analysis of Deep Neural Networks against Adversarial Attacks
Critiques of Theoretical Paper Clarity and Experimental Design
Novel Approaches and Incremental Advances in Community Detection Methods
Assessment of Novelty and Significance in Relation to Prior Work
Validated Mathematical Results and Proof-Based Safety Analyses
Assumptions and Rationales in Model Correctness and Recommendations
Continuous Space Modeling and Policy Correlation Analysis
Empirical Evaluation of Tensor Completion Methods
Model Comparisons and Clarity Issues in MNIST Classification
Critique of Insufficient Experimental Validation
Evaluation and Comparison of Hyperparameter Methods: Clarity, Explanations, and Batch Results
Technical Contributions and Comparative Analysis in Academic Art Research
Assessment of Methodological Novelty and Technical Extensions
Evaluation Limitations and Weak Baseline Analysis
Experimental Evaluation of Mode

In [84]:

# dfr.to_csv('reject_reviews_clustered.csv', index=False)
# dfr = pd.read_csv('reject_reviews_clustered.csv')

SYNTHESIS_NAMING_PROMPT_TEMPLATE = '''
you are an program committee chair of a scientific conference.
Your task is to identify the main issues in a list of reviews, and to group them into higher level groups.
This is an ENUMERATED list of reviews, in the format NUMBER: REVIEW. 
Your job is to reduce the list to a smaller number of ISSUEs raised by the reviewers. 
Be as specific as possible. 
The output is a list of GROUP_NAME that are at a higher level of abstaction and that group together at least two underlying issues.
Try to identify groups that put together a minimum of two, but ideally four or five similar issues, and give them a name that is descriptive of the group and of the differences between
items in that group and items in other groups. 
along with a mapping of the original ISSUE_NUMBERs to the new GROUP_NAME.
Only return the map if it contains more than one ISSUE_NUMBERs, that is, if we can actually group some issues into a higher level pattern.
Make sure every ISSUE_NUMBER is assigned to a group, and make an effort to group every issue into a higher level group.

These is the enumerated list of issues:
<ENUMERATED_ISSUE_LIST>
{{text_to_synthesize}}
</ENUMERATED_ISSUE_LIST>

First, briefly describe your reasoning, and then return a JSON dict object. The answer should have the following structure:
REASONING: <YOUR REASONING>
JSON GROUPS:
{
  "<GROUP_NAME_1>": ["<ISSUE_NUMBER>", .., "<ISSUE_NUMBER>"],
  "<GROUP_NAME_2>": ["<ISSUE_NUMBER>" .., "<ISSUE_NUMBER>"],
}
Please adhere to the format described in the JSON part of the answer: each json key must contain a list of numbers corresponding to the ISSUE_IDs to be grouped under that key identifying the group name.
'''

In [None]:

dfr_h = build_cluster_hierarchy(dfr, llm, '_level_0', prompt_template = SYNTHESIS_NAMING_PROMPT_TEMPLATE, MAX_TOP_LEVEL_CLUSTERS=10)
print_clusters(dfr_h, 'main_limitation', 'hierarchy.csv') 

- ### Clarity of Presentation, Explanation, and Motivation (L1, 60)
  - #### Ambiguities and Misuses in Probabilistic Modeling Papers (L0, 7)
    * Examples:
      - The paper misuses the reparameterization trick; Eq.(2) lacks required KL-divergence term, making the approach unjustified.
      - The section on the bernoulli splitting trick was the most confusing part of the paper.
      - Confusing motivation, especially regarding reward ambiguity, and lack of clear explanation in the paper.
  - #### Challenges in Understanding and Interpreting Complex Papers (L0, 14)
    * Examples:
      - Writing is sloppy and difficult to read; main claim about i.i.d. hypothesis invalidating GP interpretation is not convincing.
      - Paper is tedious with heavy notations, making it difficult to read.
      - It is difficult to understand the paper; it is not clearly written, especially the algorithm's presentation.
  - #### Clarifying Writing and Explanation Confusion in Variable Scanning Approaches (L0, 7)
    * Examples:
      - Motivation and explanation are insufficient; rationale for the approach is unclear.
      - The definition and explanation of Equation 2 and variable z are unclear and confusing.
      - The writing can be significantly improved; technical parts are confusing and unclear.
  - #### Clarity and Motivation in Mathematical Decision-Making (L0, 8)
    * Examples:
      - Unclear motivation for using tropical geometry and misleading use of 'adversarial attacks' terminology.

In [86]:
# dfr.to_csv

## Path 2: cluster and synthesize

Basically we build hierarchy from the main level, without the initial clustering and naming step to go to L0. All clustering is done by the LLM


In [None]:
# from hc import build_cluster_hierarchy, print_clusters

SYNTHESIS_NAMING_FROMWK_PROMPT_TEMPLATE = '''
you are an program committee chair of a scientific conference.
Your task is to group  a list of weaknesses reviewers found in papers into higher level groups.
This is an ENUMERATED list of weaknesses, in the format WEAKNESS_NUMBER: WEAKNESS. 
Your job is to reduce the list to a smaller number of WEAKNESSs raised by the reviewers. 
Be as specific as possible. 
The output is a list of GROUP_NAME that are at a higher level of abstaction and that group together at least two underlying weaknesses.
Try to identify groups that put together a minimum of two, but ideally many similar weaknesses, and give them a name that is descriptive of the group and of the differences between
items in that group and items in other groups, along with a mapping of the original WEAKNESS_NUMBER to the new GROUP_NAME.
Only return the map if it contains more than one WEAKNESS_NUMBER, that is, if we can actually group some issues into a higher level pattern.
Make sure every WEAKNESS_NUMBER is assigned to a group, and make an effort to group every issue into a higher level group.

These is the enumerated list of issues:
<ENUMERATED_ISSUE_LIST>
{{text_to_synthesize}}
</ENUMERATED_ISSUE_LIST>

First, briefly describe your reasoning, and then return a JSON dict object. The answer should have the following structure:
REASONING: <YOUR REASONING>
JSON GROUPS:
{
  "<GROUP_NAME_1>": ["<WEAKNESS_NUMBER>", .., "<WEAKNESS_NUMBER>"],
  "<GROUP_NAME_2>": ["<WEAKNESS_NUMBER>" .., "<WEAKNESS_NUMBER>"],
}
Please adhere to the format described in the JSON part of the answer: each json key must contain a list of numbers corresponding to the ISSUE_IDs to be grouped under that key identifying the group name.
Do not include special characters in the answer besides the JSON format.
'''

dfr_cs = pd.read_csv('reject_reviews.csv')
print (len(dfr_cs))
# dfr_cs = dfr.copy()
dfr_cs['_level_0'] = dfr_cs['main_limitation'].astype(str)

dfr_cs_h = build_cluster_hierarchy(dfr_cs, llm, '_level_0', prompt_template = SYNTHESIS_NAMING_FROMWK_PROMPT_TEMPLATE, MAX_TOP_LEVEL_CLUSTERS=10)
print_clusters(dfr_cs_h, 'main_limitation', 'hierarchy_cs.csv')

In [14]:
print_clusters(dfr_cs_h, 'main_limitation', 'hierarchy_cs.csv')

Building Tree...
Detected and using levels (highest to lowest): ['_level_4', '_level_3', '_level_2', '_level_1', '_level_0']

Printing Tree Structure:
- Correlation between PE score and human judgement is unsurprising because CSF already encodes human perceptual characteristics. (L4, 1)
  * Examples:
    - Correlation between PE score and human judgement is unsurprising because CSF already encodes human perceptual characteristics.
- Ethical and Policy Violations (L4, 5)
  - Blind Review or Policy Violations (L3, 2)
    - Violates blind-review policy by exposing author identity in acknowledgements. (L2, 1)
      * Examples:
        - Violates blind-review policy by exposing author identity in acknowledgements.
    - Violating double blind rule. (L2, 1)
      * Examples:
        - Violating double blind rule.
  - Ethical Issues and Scientific Integrity (Plagiarism, Double-Blind Violations) (L3, 2)
    - Theorem 2 and its proof are plagiarized and presented as the authors' own work. (L2, 

Building Tree...
Detected and using levels (highest to lowest): ['_level_4', '_level_3', '_level_2', '_level_1', '_level_0']

Printing Tree Structure:
Warning: LEVELS_TO_PRINT (6) is greater than total levels available (5). Printing all levels.
- Correlation between PE score and human judgement is unsurprising because CSF already encodes human perceptual characteristics. (L4, 1)
  * Examples:
    - Correlation between PE score and human judgement is unsurprising because CSF already encodes human perceptual characteristics.
- Ethical and Policy Violations (L4, 5)
  - Blind Review or Policy Violations (L3, 2)
    - Violates blind-review policy by exposing author identity in acknowledgements. (L2, 1)
      * Examples:
        - Violates blind-review policy by exposing author identity in acknowledgements.
    - Violating double blind rule. (L2, 1)
      * Examples:
        - Violating double blind rule.
  - Ethical Issues and Scientific Integrity (Plagiarism, Double-Blind Violations) (L3, 2)
    - Theorem 2 and its proof are plagiarized and presented as the authors' own work. (L2, 1)
      * Examples:
        - Theorem 2 and its proof are plagiarized and presented as the authors' own work.
    - Violation of the double blind review rule and similarity to previously published work. (L2, 1)
      * Examples:
        - Violation of the double blind review rule and similarity to previously published work.
  - The paper was already accepted at a Neuro AI workshop, violating double-blind review rules. (L3, 1)
    * Examples:
      - The paper was already accepted at a Neuro AI workshop, violating double-blind review rules.
- Methodology and Scientific Rigor (L4, 745)
  - Experimental Validation and Evaluation Design Issues (L3, 378)
    - Comparison, Evaluation, and Experimental Design Weaknesses (L2, 7)
      - ARAC seeks diversity in fitness space, which is weaker than behavior/outcome space used in QD and NS. (L1, 1)
        * Examples:
          - ARAC seeks diversity in fitness space, which is weaker than behavior/outcome space used in QD and NS.
      - Authors should compare using QAP for assignment with GSE to better assess their method. (L1, 1)
        * Examples:
          - Authors should compare using QAP for assignment with GSE to better assess their method.
      - Performance drops sharply after 50% pruning across all datasets (Figure 4). (L1, 1)
        * Examples:
          - Performance drops sharply after 50% pruning across all datasets (Figure 4).
      - TVmax achieves only a little improvement over the baseline (softmax) on automated metrics. (L1, 1)
        * Examples:
          - TVmax achieves only a little improvement over the baseline (softmax) on automated metrics.
      - The method was not tested on other NLP benchmarks like text classification or NER. (L1, 1)
        * Examples:
          - The method was not tested on other NLP benchmarks like text classification or NER.
      - Weak Integration between Theory and Experiments (L1, 2)
        - The theoretical and empirical parts are not well-connected; variance of the estimator is not quantified. (L0, 1)
          * Examples:
            - The theoretical and empirical parts are not well-connected; variance of the estimator is not quantified.
        - Theoretical and empirical analyses are not sufficiently novel, deep, or informative. (L0, 1)
          * Examples:
            - Theoretical and empirical analyses are not sufficiently novel, deep, or informative.
    - Computational Inefficiency and Resource Overhead (L2, 3)
      - The method introduces more memory overhead compared to GPipe. (L1, 1)
        * Examples:
          - The method introduces more memory overhead compared to GPipe.
      - The method is computationally costly and sacrifices network flexibility, being a brute-force application without practical remedies. (L1, 1)
        * Examples:
          - The method is computationally costly and sacrifices network flexibility, being a brute-force application without practical remedies.
      - The method requires storing an increasing number of models, leading to huge storage costs, making it economically infeasible. (L1, 1)
        * Examples:
          - The method requires storing an increasing number of models, leading to huge storage costs, making it economically infeasible.
    - Evaluation Metrics, Ablation Studies, and Analysis Limitations (L2, 6)
      - Missing Evaluation Metrics/Uncertainty/Ablation Studies (L1, 4)
        - Experimental validation is weak: no proper baselines, limited datasets, and unclear evaluation metrics. (L0, 1)
          * Examples:
            - Experimental validation is weak: no proper baselines, limited datasets, and unclear evaluation metrics.
        - Lack of ablation experiments for FALCON's components and need for experimental justification of convolution order. (L0, 1)
          * Examples:
            - Lack of ablation experiments for FALCON's components and need for experimental justification of convolution order.
        - Lack of uncertainty (e.g., error bars) in experimental plots. (L0, 1)
          * Examples:
            - Lack of uncertainty (e.g., error bars) in experimental plots.
        - The experimental evaluation is insufficient, lacking details about TensorFlow comparison and missing ablation studies. (L0, 1)
          * Examples:
            - The experimental evaluation is insufficient, lacking details about TensorFlow comparison and missing ablation studies.
      - Unvalidated or Questionable Evaluation Metrics and Analysis (L1, 2)
        - No standard deviation reported for results; possible high variance in outcomes when using low bit quantization. (L0, 1)
          * Examples:
            - No standard deviation reported for results; possible high variance in outcomes when using low bit quantization.
        - The new ‘concept accuracy’ metric is unvalidated and poorly correlated with interpretability, making the results unconvincing. (L0, 1)
          * Examples:
            - The new ‘concept accuracy’ metric is unvalidated and poorly correlated with interpretability, making the results unconvincing.
    - Evaluation, Empirics, and Implementation Issues (L2, 19)
      - Dataset and Evaluation Limitations (L1, 4)
        - None explicitly stated. (L0, 2)
          * Examples:
            - None explicitly stated.
            - None explicitly stated.
        - None stated. (L0, 1)
          * Examples:
            - None stated.
        - PUGAN was tested only on a simple dataset and quality is only comparable to WaveGAN, not advancing fidelity. (L0, 1)
          * Examples:
            - PUGAN was tested only on a simple dataset and quality is only comparable to WaveGAN, not advancing fidelity.
      - Insufficient discussion on why the model underperforms for Russian in word-translation retrieval. (L1, 1)
        * Examples:
          - Insufficient discussion on why the model underperforms for Russian in word-translation retrieval.
      - Model Efficiency and Hardware Realism Issues (L1, 4)
        - ESN is not suitable for existing hardware; it relies on unrealistic assumptions. (L0, 1)
          * Examples:
            - ESN is not suitable for existing hardware; it relies on unrealistic assumptions.
        - It is unclear how speed-up is calculated and whether MAC reduction correlates to actual hardware speed-up. (L0, 1)
          * Examples:
            - It is unclear how speed-up is calculated and whether MAC reduction correlates to actual hardware speed-up.
        - The computational efficiency of conventional NLP models is well known to researchers. (L0, 1)
          * Examples:
            - The computational efficiency of conventional NLP models is well known to researchers.
        - Using total number of bits as equivalent measure between models doesn't account for actual inference latency or efficiency on hardware. (L0, 1)
          * Examples:
            - Using total number of bits as equivalent measure between models doesn't account for actual inference latency or efficiency on hardware.
      - Poor Generalization or Overfitting to Experimental Setting (L1, 2)
        - Accelerated diffusion causes over-smoothing quickly, limiting the feasible number of layers and deep network non-linearity. (L0, 1)
          * Examples:
            - Accelerated diffusion causes over-smoothing quickly, limiting the feasible number of layers and deep network non-linearity.
        - Generalization is unconvincing because both tasks use the same environments and features. (L0, 1)
          * Examples:
            - Generalization is unconvincing because both tasks use the same environments and features.
      - Reproducibility and Supporting Materials Issues (L1, 3)
        - Lacks many details in key sections, preventing full appreciation and making reproducibility impossible. (L0, 1)
          * Examples:
            - Lacks many details in key sections, preventing full appreciation and making reproducibility impossible.
        - Reproducibility is difficult as source code and some datasets are not provided or are not public. (L0, 1)
          * Examples:
            - Reproducibility is difficult as source code and some datasets are not provided or are not public.
        - The paper suffers from an incomplete evaluation, so acceptance is not recommended. (L0, 1)
          * Examples:
            - The paper suffers from an incomplete evaluation, so acceptance is not recommended.
      - Superficiality or Simplistic Experimental Setting (L1, 5)
        - Exposition is difficult to follow and experiments are not very compelling. (L0, 1)
          * Examples:
            - Exposition is difficult to follow and experiments are not very compelling


### add a refinement step

In [30]:
import pandas as pd
from jinja2 import Template

# Sample data representing the cluster hierarchy tree.

# Define a jinja2 template to create the prompt.
prompt_template = """
You are given a cluster hierarchy tree where each row represents a series of cluster names from the bottom level (leaf, _level_0) to higher levels (_level_1, _level_2, etc.). The current hierarchy is as follows:

{% for index, row in df.iterrows() %}
Row {{ index + 1 }}:
{% for level in levels %}
  {{ level }}: "{{ row[level] }}"
{% endfor %}
{% endfor %}

Please refine this cluster hierarchy. In your refined hierarchy, modify names if necessary and merge similar clusters (i.e. merge nodes in the tree that represent similar themes) while preserving the hierarchical structure.

Return the refined hierarchy in a clear, structured format.
"""
df_remap = dfr_cs_h[[c for c in dfr_cs_h.columns if 'level_' in c]] 
# List the hierarchy levels in order (assuming the DataFrame columns are ordered as the hierarchy).
levels = list(df_remap.columns)

# Render the prompt using the jinja2 template.
template = Template(prompt_template)
prompt = template.render(df=df_remap, levels=levels)

# # Log the prompt (for debugging purposes) before sending it to the LLM.
# print("Prompt sent to the LLM:")
# print(prompt)

# Call the LLM with the prompt.
# Assume that 'llm' is an instance of your LLM class with _call implemented.
refined_hierarchy = llm._call(prompt)

print("\nLLM Response:")
print(refined_hierarchy)


2025-04-16 12:20:04,897 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



LLM Response:
Certainly! Refining such a large and complex cluster hierarchy involves combining similar nodes and standardizing naming conventions to improve clarity and reduce redundancy. Below is a refined, hierarchical structure covering the main themes evident in the original clusters. Each level is presented using standardized, non-overlapping names, and represents a specific granularity of critique, going from fine-grained (bottom) to broad (top).

**Refined Cluster Hierarchy**

---

## Level 4 (Top-level Themes)

1. **Methodological Soundness & Novelty**
2. **Experimental and Empirical Validation**
3. **Clarity, Presentation, & Reproducibility**
4. **Positioning & Related Work**
5. **Practical Impact & Generalizability**
6. **Ethical and Policy Compliance**
7. **Review Process / No Substantive Weakness**

---

## Level 3 (Mid-level Themes)

### (1) Methodological Soundness & Novelty
- Theoretical Rigor & Mathematical Foundations
- Methodological or Analytical Issues
- Algorithm

2025-04-16 12:20:04,897 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"

**LLM Response:**

Certainly! Refining such a large and complex cluster hierarchy involves combining similar nodes and standardizing naming conventions to improve clarity and reduce redundancy. Below is a refined, hierarchical structure covering the main themes evident in the original clusters. Each level is presented using standardized, non-overlapping names, and represents a specific granularity of critique, going from fine-grained (bottom) to broad (top).

**Refined Cluster Hierarchy**

---

## Level 4 (Top-level Themes)

1. **Methodological Soundness & Novelty**
2. **Experimental and Empirical Validation**
3. **Clarity, Presentation, & Reproducibility**
4. **Positioning & Related Work**
5. **Practical Impact & Generalizability**
6. **Ethical and Policy Compliance**
7. **Review Process / No Substantive Weakness**

---

## Level 3 (Mid-level Themes)

### (1) Methodological Soundness & Novelty
- Theoretical Rigor & Mathematical Foundations
- Methodological or Analytical Issues
- Novelty Evaluation

### (2) Experimental and Empirical Validation
- Experimental Design and Setup
- Data, Baselines, and Metrics Comparison
- Reproducibility of Results

### (3) Clarity, Presentation, & Reproducibility
- Organization, Writing, and Visualizations
- Documentation and Code Availability
- Interpretability and Usability

### (4) Positioning & Related Work
- Literature Review and Contextualization
- Comparison with Prior Work

### (5) Practical Impact & Generalizability
- Applicability to Real-world Problems
- Scalability and Performance in Practice
- Generalization Across Domains

### (6) Ethical and Policy Compliance
- Adherence to Ethical Guidelines
- Blind Review and Anonymity Issues

### (7) Review Process / No Substantive Weakness
- Reviewer Feedback on Process and Expertise
- General Comments on Absence of Major Issues

---

Let me know if you'd like the hierarchy in a particular data structure (JSON, YAML, indented text, etc.), or a visualization!

*Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...*


## Path 3: map-reduce

In [None]:

from jinja2 import Template
MR_PROMPT_TEMPLATE = """
Given the following cluster names:

{{ current_names }}

and the previously identified higher-level concepts:

{{ previously_identified_names }}

Identify and return a new set of higher-level cluster names. Provide names clearly representing broader categories.
"""


In [None]:
from hc import iterative_higher_level_clustering

initial_names = dfr_cs['main_limitation'].unique().tolist()
final_names = iterative_higher_level_clustering(llm, initial_names, prompt_template=MR_PROMPT_TEMPLATE, batch_size=30)
print("Final high-level clusters:", final_names)


In [None]:
for f in final_names:
    print(f)

Certainly! Here is a clear, synthesized set of **higher-level cluster names** that broadly capture the criticisms present in your cluster names:

---

### 1. Limited Novelty and Contribution
*Captures: Lack of innovation, incremental advances, over-reliance on existing methods, and unclear advancement over prior work.*

---

### 2. Methodological and Technical Weaknesses
*Includes: Questionable assumptions, insufficient methodological justification, unconvincing technical claims, and theoretical or experimental design flaws.*

---

### 3. Insufficient Experimental Validation
*Encompasses: Inadequate empirical comparison, unfair or misleading experimental design, lack of comparative analysis with relevant methods, and poor result reporting.*

---

### 4. Poor Clarity and Justification
*Represents: Unclear explanations, lack of justification for methodological choices, and insufficient discussion of the method’s necessity or motivation.*

---

These four categories should succinctly and comprehensively cover the broader areas reflected in your original cluster names and feedback.

## Variations on map-reduce prompt and batch
Larger batch, and asking for a specific number of high level clusters

In [None]:

from jinja2 import Template
MR_PROMPT_TEMPLATE = """
Given the following cluster names:

{{ current_names }}

and the previously identified higher-level concepts:

{{ previously_identified_names }}

Identify and return a new set of higher-level cluster names. Provide names clearly representing broader categories.
Limit to about 10 maximum higher level concepts
"""
dfr_cs = pd.read_csv('reject_reviews_clustered.csv')

initial_names = dfr_cs['main_limitation'].unique().tolist()
final_names_v2 = iterative_higher_level_clustering(llm, initial_names, prompt_template=MR_PROMPT_TEMPLATE, batch_size=60)
# print("Final high-level clusters:", final_names)

for f in final_names_v2:
    print(f)

Certainly! Given your list of fine-grained cluster names (critical comments), and the *format/style* of your "previously identified higher-level concepts," here is a **new, succinct set of higher-level cluster names** that are *mutually exclusive* and *collectively exhaustive*, with phrasing tailored for general utility and clarity. Below you'll find the 10 umbrella categories:

---

### 1. Lack of Novelty or Significant Contribution
*Paper presents only incremental advances, is derivative, or lacks meaningful new insights.*

---

### 2. Insufficient Experimental Validation
*Experimental results are weak, limited, unconvincing, or lack breadth (e.g., small datasets, poor baselines, inadequate testing).*

---

### 3. Poor Clarity and Presentation
*Writing is unclear, poorly structured, hard to follow, or imprecise, making understanding difficult.*

---

### 4. Inadequate Theoretical Justification
*Theory is missing, weak, inapplicable, or lacks rigorous motivation and support.*

---

### 5. Lack of Methodological Detail
*Key algorithmic or procedural details are omitted or described too vaguely to understand or verify the work.*

---

### 6. Inadequate Comparison with Related Work
*Paper does not sufficiently compare with existing methods (theoretical or empirical) or neglects relevant baselines.*

---

### 7. Overstated or Unsupported Claims
*Assertions and conclusions are not justified by the evidence provided (data, experiments, or theory).*

---

### 8. Limited Practical Scope or Generalization
*Applicability is too narrow, restricted to toys or unrealistic settings, or lacks evidence of broader impact.*

---

### 9. Problems with Evaluation or Fairness
*Comparisons are unfair, experimental protocols are inconsistent, or evaluation metrics are inappropriate.*

---

### 10. Not Ready for Publication or Inappropriate for Venue
*Work is too preliminary, underdeveloped, unpolished, or does not fit the conference/journal scope.*

---

**These categories provide a distinct and comprehensive set of top-level criticism types for reviewing and analysis.**

# Cluster without maps

## Cluster and name, then build hierarchy

In [25]:
from hc import hdbscan_kmeans_refine

NAMING_PROMPT_TEMPLATE = """
You are a program chair of a scientific conference. 
Given a set of keywords and representative examples of paper review, your job is to extract the main positive or negative comment described.
Here are the keywords for each cluster:
<KEYWORDS>
{{keywords}}
</KEYWORDS>
Here are the representative documents for each cluster:
<REPRESENTATIVE_DOCS>
{{representative_docs}}
</REPRESENTATIVE_DOCS>
Output a JSON object with only one key called 'cluster_name', to denote the title of the group, with no additional information or commentary.
Do not leave the JSON empty. Always return a value for 'cluster_name' with your best guess at a descriptive title for the topic.
Be as specific as possible and mimic the style of the representative reviews examples.
The JSON format is as follows:
    {   
        "cluster_name": <NAME>
    } 
Do not include special characters in the answer besides the JSON format. Make sure to delimit the JSON keys with double quotes.
"""



def cluster_and_name(df):
    df['text'] = df['review'].astype(str)
    dfr = df[:1000]
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2))
    X = vectorizer.fit_transform(dfr['text']).toarray()


    cluster_labels = hdbscan_kmeans_refine(
        X,
        hdbscan_min_cluster_size=10,
        max_cluster_size=50,
        split_k=5,
        max_total_clusters=1000,
        random_state=42,
        minimum_cluster_size=5)

    dfr['review_cluster'] = cluster_labels


    cluster_keywords, c_tfidf, count_vectorizer = get_cluster_keywords(
        dfr,'review',
        'review_cluster')

    count_vectorizer = CountVectorizer(
        stop_words='english',max_features=1000,
        ngram_range=(1, 2))
    count_vectorizer.fit(dfr['review'])
    docs = get_representative_docs(dfr, 'review', 'review_cluster', cluster_keywords, count_vectorizer)



    cmap = name_clusters_with_topics(dfr, llm, 'review_cluster', cluster_keywords, docs,  prompt_template=NAMING_PROMPT_TEMPLATE)
    print(cmap)
    dfr['_level_0'] = dfr['review_cluster'].map(cmap).fillna('NO_NAME')
    dfr['_level_0'] = dfr['_level_0'].apply(robust_json_parse)
    def get_dict_item(x):
        try:
            value = list(x.values())[0]
            return value
        except Exception as e:
            print(f"Error: {e}")
            print (f"Response: {x}")
            return x

    dfr['_level_0'] = dfr['_level_0'].apply(get_dict_item)
    dfr['_level_0']
    return dfr, cmap

dfr_review, cmap = cluster_and_name(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['review_cluster'] = cluster_labels
  0%|          | 0/51 [00:00<?, ?it/s]2025-04-16 12:14:58,656 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  2%|▏         | 1/51 [00:07<06:20,  7.61s/it]2025-04-16 12:15:00,066 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  4%|▍         | 2/51 [00:11<04:36,  5.64s/it]2025-04-16 12:15:04,374 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  6%|▌         | 3/51 [00:17<04:35,  5.75s/it]2025-04-16 12:15:11,174 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
  8%|▊         | 4/51 [00:20<03:26,  4.38s/it]2025-04-16

{64: '{   \n    "cluster_name": "Sliding Window Methods for Temporal Pattern Detection in Sequential Data"\n}', 12: '{\n    "cluster_name": "Gradient-based Architecture Search Methods and Optimization Techniques"\n}', 52: '{\n    "cluster_name": "Falcon Method and Quantization Techniques for Learning with PDF Trees"\n}', 2: '{\n    "cluster_name": "Unsupervised Domain Adaptation Methods and Evaluation"\n}', 41: '{\n    "cluster_name": "Deep Neural Network Training Methods and LBI Approaches"\n}', 63: '{   \n    "cluster_name": "Evaluation of Methods for Noise Handling in Labeled Data"\n}', 23: '{\n    "cluster_name": "Underwater Image Super-Resolution with GANs"\n}', 24: '{\n    "cluster_name": "Audio Feature Analysis and Spectrogram Methods"\n}', 29: '{\n    "cluster_name": "3D Generative Models and VAE Methods"\n}', 11: '{\n    "cluster_name": "Neural Network Training and Convergence Analysis"\n}', 14: '{\n    "cluster_name": "Gradient Normalization Techniques and Variance Reduction 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['_level_0'] = dfr['review_cluster'].map(cmap).fillna('NO_NAME')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['_level_0'] = dfr['_level_0'].apply(robust_json_parse)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['_level_0'] = dfr['_level_0'].apply(get_dict_item)


In [26]:
for reason in   dfr_review['_level_0']:
    print(reason)

Sliding Window Methods for Temporal Pattern Detection in Sequential Data
Gradient-based Architecture Search Methods and Optimization Techniques
Falcon Method and Quantization Techniques for Learning with PDF Trees
Unsupervised Domain Adaptation Methods and Evaluation
Falcon Method and Quantization Techniques for Learning with PDF Trees
Deep Neural Network Training Methods and LBI Approaches
Evaluation of Methods for Noise Handling in Labeled Data
Underwater Image Super-Resolution with GANs
Audio Feature Analysis and Spectrogram Methods
Underwater Image Super-Resolution with GANs
Audio Feature Analysis and Spectrogram Methods
Falcon Method and Quantization Techniques for Learning with PDF Trees
Gradient-based Architecture Search Methods and Optimization Techniques
3D Generative Models and VAE Methods
Neural Network Training and Convergence Analysis
Gradient Normalization Techniques and Variance Reduction in Deep Learning
Gradient Normalization Techniques and Variance Reduction in Deep L

In [None]:

SYNTHESIS_NAMING_PROMPT_TEMPLATE = '''
you are an program committee chair of a scientific conference.
Your task is to identify the main issues in a list of reviews, and to group them into higher level groups.
This is an ENUMERATED list of reviews, in the format NUMBER: REVIEW. 
Your job is to reduce the list to a smaller number of ISSUEs raised by the reviewers. 
Be as specific as possible. 
The output is a list of GROUP_NAME that are at a higher level of abstaction and that group together at least two underlying issues.
Try to identify groups that put together a minimum of two, but ideally many similar issues, and give them a name that is descriptive of the group and of the differences between
items in that group and items in other groups. 
along with a mapping of the original ISSUE_NUMBERs to the new GROUP_NAME.
Only return the map if it contains more than one ISSUE_NUMBERs, that is, if we can actually group some issues into a higher level pattern.
Make sure every ISSUE_NUMBER is assigned to a group, and make an effort to group every issue into a higher level group.

These is the enumerated list of issues:
<ENUMERATED_ISSUE_LIST>
{{text_to_synthesize}}
</ENUMERATED_ISSUE_LIST>

First, briefly describe your reasoning, and then return a JSON dict object. The answer should have the following structure:
REASONING: <YOUR REASONING>
JSON GROUPS:
{
  "<GROUP_NAME_1>": ["<ISSUE_NUMBER>", .., "<ISSUE_NUMBER>"],
  "<GROUP_NAME_2>": ["<ISSUE_NUMBER>" .., "<ISSUE_NUMBER>"],
}
Please adhere to the format described in the JSON part of the answer: each json key must contain a list of numbers corresponding to the ISSUE_IDs to be grouped under that key identifying the group name.
Do not include special characters in the answer besides the JSON format.
'''

dfr_review = build_cluster_hierarchy(dfr_review, llm, '_level_0', prompt_template = SYNTHESIS_NAMING_PROMPT_TEMPLATE, MAX_TOP_LEVEL_CLUSTERS=10)


In [28]:
print_clusters(dfr_review, 'review', 'hierarchy.csv') 

Building Tree...
Detected and using levels (highest to lowest): ['_level_2', '_level_1', '_level_0']

Printing Tree Structure:
- Evaluation, Benchmarking, and Theoretical Analysis (L2, 191)
  - Evaluation and Benchmarking of Learning Methods (L1, 131)
    - Evaluation of Few-Shot and Self-Supervised Learning Methods for Classification and Anomaly Detection (L0, 40)
      * Examples:
        -  While this paper tackles an interesting problem. The technical approach is unfortunately too outdated and obvious and not quite the level of ICLR. The dataset is likely too easy given the high accuracy.
        -  Summary: This paper considers the addition of self-supervised learning techniques in the few-shot learning setting. Extensive experiments are done to show that it can be helpful, including in cases where the labeled data is corrupted. The paper also considers the domain mismatch issue where unlabeled images come from a different domain. Review: This paper is thorough and clearly written

### Add a refinement step

In [29]:
import pandas as pd
from jinja2 import Template

# Sample data representing the cluster hierarchy tree.

# Define a jinja2 template to create the prompt.
prompt_template = """
You are given a cluster hierarchy tree where each row represents a series of cluster names from the bottom level (leaf, _level_0) to higher levels (_level_1, _level_2, etc.). The current hierarchy is as follows:

{% for index, row in df.iterrows() %}
Row {{ index + 1 }}:
{% for level in levels %}
  {{ level }}: "{{ row[level] }}"
{% endfor %}
{% endfor %}

Please refine this cluster hierarchy. In your refined hierarchy, modify names if necessary and merge similar clusters (i.e. merge nodes in the tree that represent similar themes) while preserving the hierarchical structure.

Return the refined hierarchy in a clear, structured format.
"""
df_remap = dfr_review[[c for c in dfr_review.columns if 'level_' in c]] 
# List the hierarchy levels in order (assuming the DataFrame columns are ordered as the hierarchy).
levels = list(df_remap.columns)

# Render the prompt using the jinja2 template.
template = Template(prompt_template)
prompt = template.render(df=df_remap, levels=levels)

# # Log the prompt (for debugging purposes) before sending it to the LLM.
# print("Prompt sent to the LLM:")
# print(prompt)

# Call the LLM with the prompt.
# Assume that 'llm' is an instance of your LLM class with _call implemented.
refined_hierarchy = llm._call(prompt)

print("\nLLM Response:")
print(refined_hierarchy)


2025-04-16 12:16:53,674 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



LLM Response:
Certainly. Below, I present a **Refined Cluster Hierarchy** for your dataset. The key steps taken were:

- **Merging Highly Similar or Redundant Clusters**: Many cluster names described nearly the same content but with slight differences; I combined/standardized these as much as possible.
- **Standardizing Terminology**: I adopted more conventional or concise titles for major themes.
- **Condensing where Appropriate**: Some "leaf" clusters were merged if they appeared to be broken-up subtopics of a broader area.
- **Providing the Hierarchical Structure**: The refined hierarchy is explicitly presented as a tree.

---

# REFINED CLUSTER HIERARCHY

**Level 2 (Top Level):**

1. Neural Network Architectures, Training, and Efficiency  
2. Evaluation, Benchmarking, and Theoretical Analysis  
3. Learning Methodologies and Algorithms  
4. Robustness, Privacy, and Security  
5. Physical GRF Data Analysis and Experiments  
6. Specialized Applications (GANs, OOD, etc.)

---

### 1. 

2025-04-16 12:16:53,674 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"

**LLM Response:**

Certainly. Below, I present a **Refined Cluster Hierarchy** for your dataset. The key steps taken were:

- **Merging Highly Similar or Redundant Clusters**: Many cluster names described nearly the same content but with slight differences; I combined/standardized these as much as possible.
- **Standardizing Terminology**: I adopted more conventional or concise titles for major themes.
- **Condensing where Appropriate**: Some "leaf" clusters were merged if they appeared to be broken-up subtopics of a broader area.
- **Providing the Hierarchical Structure**: The refined hierarchy is explicitly presented as a tree.

---

# REFINED CLUSTER HIERARCHY

**Level 2 (Top Level):**

1. Neural Network Architectures, Training, and Efficiency  
2. Evaluation, Benchmarking, and Theoretical Analysis  
3. Learning Methodologies and Algorithms  
4. Robustness, Privacy, and Security  
5. Physical GRF Data Analysis and Experiments  
6. Specialized Applications (GANs, OOD, etc.)

---

### 1. Neural Network Architectures, Training, and Efficiency

#### 1.1 Architecture, Pruning, and Model Efficiency
- Neural Network Architectures and Methods  
- Pruning and Sparsity in Neural Networks  
- Neural Architecture Search and Weight Sharing  
- Distillation, Compression, and Model Simplification

#### 1.2 Training, Optimization, and Analysis Techniques
- Gradient-based Architecture Search and Optimization  
- Deep Neural Network Training Methods and LBI Approaches  
- Optimization Algorithms/Convergence in Stochastic Methods  
- Gradient Normalization and Variance Reduction

#### 1.3 Representation Learning and Feature Extraction
- Feature Extraction and Representation Methods  
- Audio Feature Analysis and Spectrogram Methods  
- Scheduled Sampling and Initialization  
- 3D Generative Models and VAEs  
- Prototype-based Attention for Classification  
- Language Modeling and Summarization (Enc-Dec)  
- Model Unification and Sequence Positioning (PDP)  
- Amharic (or Multilingual) Embeddings and Sentiment Analysis  
- Falcon and Quantization Techniques in PDF/Tree Models

---

### 2. Evaluation, Benchmarking, and Theoretical Analysis

#### 2.1 Evaluation of Learning Methods
- Evaluation and Benchmarking of Algorithms  
- Evaluation of Dropout/Data Augmentation/BatchNorm  
- OOD Data Imputation and Evaluation  
- Evaluation of Few-Shot/Self-Supervised Methods  
- Evaluation of Proposed Method Performance

#### 2.2 Theoretical Foundations and Analytical Methods
- Statistical Dependence Testing and Neuroimaging  
- Information Theory and Entropy Estimation  
- Lipschitz Analysis, Generalization, and Theoretical Bounds

---

### 3. Learning Methodologies and Algorithms

#### 3.1 Reinforcement & Sequential Decision Learning
- Reinforcement Learning and Policy Optimization  
- Reward Shaping and Intrinsic Motivation  
- Multi-Agent and Game-Theoretic RL  
- Exploration Strategies in RL

#### 3.2 Reinforcement Learning Algorithms and Stability Analysis
- Policy Learning and Decision Making in RL  
- Lyapunov-Based Stability in RL Algorithms

#### 3.3 Supervision & Labeling Methods
- Supervision and Weak Supervision in Training  
- Program Synthesis and Human-Labeled Datasets  
- Weakly Supervised Object Counting and Recognition

#### 3.4 Graph-Based Learning Methods
- Graph Neural Networks (GNNs) and Attention-based Pooling  
- Graph Community Detection  
- Graph Model Feature Analysis

#### 3.5 Similarity Search, Random Projection, and Model Search
- Similarity Search and Random Projection Methods

---

### 4. Robustness, Privacy, and Security

#### 4.1 Adversarial Robustness and Attack Defenses
- Adversarial Attacks and Robustness in ML  
- Backdoor Attacks and Defenses  
- Robustness and Differential Privacy  
- Invertible ODEs for Robustness and OOD Detection

---

### 5. Physical GRF Data Analysis and Experiments

- Methods and Experiments in GRF Data Analysis

---

### 6. Specialized Applications

- Underwater Image Super-Resolution with GANs  
- Other unique domains (if not fitting above)

---

## **NOTES on Mapping**

- **Many repetitive cluster names** (esp. for neural networks/convergence/optimization/etc) have been unified.
- **Falcon/Quantization/PDF Trees** merged into "Falcon and Quantization Techniques in PDF/Tree Models".
- **Distillation/Compression/Model Simplification** merged under one node.
- **Adversarial/Robustness/Privacy/Security** unified given strong thematic overlap.
- **GANs/Underwater Imaging/OOD-Related Works** placed under a "Specialized Application" branch for unique domains, unless they clearly fit elsewhere.

---

### **If you want to see how any particular original label or row maps to the refined cluster, let me know!**


## Cluster and Synthesize

In [None]:
# dfr_cs = pd.read_csv('reject_reviews_clustered.csv')
SYNTHESIS_NAMING_PROMPT_TEMPLATE = '''
you are an program committee chair of a scientific conference.
Your task is to identify the main issues in a list of reviews, and to group them into higher level groups.
This is an ENUMERATED list of reviews, in the format NUMBER: REVIEW. 
Your job is to reduce the list to a smaller number of ISSUEs raised by the reviewers. 
Be as specific as possible. 
The output is a list of GROUP_NAME that are at a higher level of abstaction and that group together at least two underlying issues.
Try to identify groups that put together a minimum of two, but ideally many similar issues, and give them a name that is descriptive of the group and of the differences between
items in that group and items in other groups. 
along with a mapping of the original ISSUE_NUMBERs to the new GROUP_NAME.
Only return the map if it contains more than one ISSUE_NUMBERs, that is, if we can actually group some issues into a higher level pattern.
Make sure every ISSUE_NUMBER is assigned to a group, and make an effort to group every issue into a higher level group.

These is the enumerated list of issues:
<ENUMERATED_ISSUE_LIST>
{{text_to_synthesize}}
</ENUMERATED_ISSUE_LIST>

First, briefly describe your reasoning, and then return a JSON dict object. The answer should have the following structure:
REASONING: <YOUR REASONING>
JSON GROUPS:
{
  "<GROUP_NAME_1>": ["<ISSUE_NUMBER>", .., "<ISSUE_NUMBER>"],
  "<GROUP_NAME_2>": ["<ISSUE_NUMBER>" .., "<ISSUE_NUMBER>"],
}
Please adhere to the format described in the JSON part of the answer: each json key must contain a list of numbers corresponding to the ISSUE_IDs to be grouped under that key identifying the group name.
Do not include special characters in the answer besides the JSON format.
'''




dfr_cs_frombase_nomap = df[:200].copy()
# dfr_cs_frombase_nomap['_level_0'] = dfr_cs_frombase_nomap['review'].astype(str)
dfr_cs_frombase_nomap.loc[:, '_level_0'] = dfr_cs_frombase_nomap['review'].astype(str)
dfr_cs_frombase_nomap = build_cluster_hierarchy(dfr_cs_frombase_nomap, llm, '_level_0', prompt_template = SYNTHESIS_NAMING_PROMPT_TEMPLATE, MAX_TOP_LEVEL_CLUSTERS=10)



Level _level_0 has 200 unique labels. When clustered, they become 9 clusters.
Processing cluster 1 of 9...


2025-04-16 00:53:53,386 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 2 of 9...


2025-04-16 00:54:02,055 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 3 of 9...


2025-04-16 00:54:08,833 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 4 of 9...


2025-04-16 00:54:18,157 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 5 of 9...


2025-04-16 00:54:26,322 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-16 00:54:34,488 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 6 of 9...
Processing cluster 7 of 9...


2025-04-16 00:54:43,324 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-16 00:54:52,050 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 8 of 9...
Processing cluster 9 of 9...


2025-04-16 00:55:00,525 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


We have gone from 200 clusters to 65 clusters.

Level _level_1 has 65 unique labels. When clustered, they become 4 clusters.
Processing cluster 1 of 4...


2025-04-16 00:55:12,491 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 2 of 4...


2025-04-16 00:55:16,808 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 3 of 4...


2025-04-16 00:55:20,697 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 4 of 4...


2025-04-16 00:55:27,303 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-16 00:55:32,130 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


We have gone from 65 clusters to 19 clusters.

Level _level_2 has 19 unique labels. When clustered, they become 2 clusters.
Processing cluster 1 of 2...


2025-04-16 00:55:36,945 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing cluster 2 of 2...


2025-04-16 00:55:45,454 - INFO - Final Level _level_2 has 19 clusters


Stopping synthesis at level _level_3 with 9 clusters.


In [None]:
print_clusters(dfr_cs_frombase_nomap, 'review', 'hierarchy_cs_nomap.csv')

Building Tree...
Detected and using levels (highest to lowest): ['_level_3', '_level_2', '_level_1', '_level_0']

Printing Tree Structure:
- Clarity and Presentation (notation, terminology, formatting, equation definition, and algorithm organization) (L3, 1)
  - Review:###The paper presents an approach to generating adversarial examples that preserve the semantics of the input examples. To do so, the approach reconstructs the manifold where the input examples lie and then generates new examples by perturbing the elements of the manifold so as to ensure the new elements remain in the manifold to preserve the semantics of the elements. In the presented system the manifold is learned by means of Stein Variational Gradient Descent, while the perturbation is made by applying the Gram-Schmidt process which ensures that the perturbed elements still reside in the manifold. To generate adversarial examples the approach presented in the paper considers a scenario in which only the predictions of

## map-reduce
this is the same as the map reduce case above, as indeed we map

# Refinement

Unnamed: 0,_level_0,_level_1,_level_2,_level_3
0,"This paper proposes Recency Bias, an adaptive...",Unclear or Problematic Comparison to Baselines,Empirical Validation and Baseline Comparison D...,"Empirical Evaluation, Validation, and Experime..."
1,Review:###This paper proposes an interesting h...,Specific Technical or Algorithmic Concerns,Clarity and Reproducibility of Technical and M...,"Clarity, Reproducibility, and Presentation of ..."
2,Review:###This paper explores a well motivated...,Lack of or Unclear Theoretical Justification/A...,Theoretical Foundation and Justification Issues,"Theoretical Foundation, Analysis, and Justific..."
3,Review:###This paper proposes a training strat...,Lack of Comprehensive Experiments/Empirical Ev...,Empirical Evaluation and Experimental Comparis...,"Empirical Evaluation, Validation, and Experime..."
4,Review:###This paper presents a training appro...,Lack of Thorough or Correctly Positioned Relat...,"Novelty, Distinction from Prior Work, and Cita...","Novelty, Prior Work Distinction, and Contextua..."
...,...,...,...,...
195,Review:###The paper paper proposes a mutual in...,Specific Technical or Algorithmic Concerns,Clarity and Reproducibility of Technical and M...,"Clarity, Reproducibility, and Presentation of ..."
196,Review:###This paper proposes a self-supervise...,Hyperparameter Selection and Implementation/Re...,Clarity and Reproducibility of Technical and M...,"Clarity, Reproducibility, and Presentation of ..."
197,I take issue with the usage of the phrase *sk...,Insufficient or Unconvincing Experimental Eval...,Empirical Evaluation and Experimental Comparis...,"Empirical Evaluation, Validation, and Experime..."
198,Review:###Although I assume somebody well-vers...,Confusing or incomplete modeling/notation,Clarity and Reproducibility of Technical and M...,"Clarity, Reproducibility, and Presentation of ..."


In [None]:
import pandas as pd
from jinja2 import Template

# Sample data representing the cluster hierarchy tree.

# Define a jinja2 template to create the prompt.
prompt_template = """
You are given a cluster hierarchy tree where each row represents a series of cluster names from the bottom level (leaf, _level_0) to higher levels (_level_1, _level_2, etc.). The current hierarchy is as follows:

{% for index, row in df.iterrows() %}
Row {{ index + 1 }}:
{% for level in levels %}
  {{ level }}: "{{ row[level] }}"
{% endfor %}
{% endfor %}

Please refine this cluster hierarchy. In your refined hierarchy, modify names if necessary and merge similar clusters (i.e. merge nodes in the tree that represent similar themes) while preserving the hierarchical structure.

Return the refined hierarchy in a clear, structured format.
"""
dfr_cs_frombase_nomap[[c for c in dfr_cs_frombase_nomap.columns if 'level_' in c]].drop_duplicates()
df_remap = dfr_cs_frombase_nomap[[c for c in dfr_cs_frombase_nomap.columns if 'level_' in c]] 
# List the hierarchy levels in order (assuming the DataFrame columns are ordered as the hierarchy).
levels = list(df_remap.columns)

# Render the prompt using the jinja2 template.
template = Template(prompt_template)
prompt = template.render(df=df_remap, levels=levels)

# # Log the prompt (for debugging purposes) before sending it to the LLM.
# print("Prompt sent to the LLM:")
# print(prompt)

# Call the LLM with the prompt.
# Assume that 'llm' is an instance of your LLM class with _call implemented.
refined_hierarchy = llm._call(prompt)

print("\nLLM Response:")
print(refined_hierarchy)


2025-04-16 01:41:02,359 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



LLM Response:
Certainly! Below is a **refined cluster hierarchy** based on the supplied data, with improved naming, removal of redundancy, and sensible merging of similar nodes/themes, all while preserving the hierarchical organization:

---

# Refined Cluster Hierarchy

## _Level 3: Top-Level Themes_

1. **Empirical Evaluation, Experimental Rigor, and Comparison**
2. **Clarity, Reproducibility, and Presentation**
3. **Novelty, Positioning, and Contextualization**
4. **Theoretical Foundation, Analysis, and Justification**
5. **Algorithmic/Technical Details, Problem Formulation, and Methodology**
6. **Review/Conference Policy and Paper Quality Issues**
7. **Specific Topical/Miscellaneous Issues**

---

## _Level 2: Merged and Renamed Subclusters_

### 1. Empirical Evaluation, Experimental Rigor, and Comparison
- Empirical Evaluation and Experimental Comparison
- Empirical Validation and Baseline Comparison Deficiencies
- Empirical Validation and Experimental Rigor
- Experimental Suffic

2025-04-16 01:41:02,359 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"

LLM Response:
Certainly! Below is a **refined cluster hierarchy** based on the supplied data, with improved naming, removal of redundancy, and sensible merging of similar nodes/themes, all while preserving the hierarchical organization:

---

# Refined Cluster Hierarchy

## _Level 3: Top-Level Themes_

1. **Empirical Evaluation, Experimental Rigor, and Comparison**
2. **Clarity, Reproducibility, and Presentation**
3. **Novelty, Positioning, and Contextualization**
4. **Theoretical Foundation, Analysis, and Justification**
5. **Algorithmic/Technical Details, Problem Formulation, and Methodology**
6. **Review/Conference Policy and Paper Quality Issues**
7. **Specific Topical/Miscellaneous Issues**

---

## _Level 2: Merged and Renamed Subclusters_

### 1. Empirical Evaluation, Experimental Rigor, and Comparison
- Empirical Evaluation and Experimental Comparison
- Empirical Validation and Baseline Comparison Deficiencies
- Empirical Validation and Experimental Rigor
- Experimental Sufficiency and Baseline Adequacy
- Metrics and Experimental Reporting Issues
- Hyperparameter, Tuning, and Methodology Concerns
- Ablation Studies and Component Contribution
- Generalizability, Scalability, or Efficiency Concerns
- Insufficient or Unconvincing Experimental Evidence or Analysis
- Issues with Baselines, Fair Comparisons, or Ablation Studies
- Insufficient Statistical Reporting or Unconvincing Quantitative Support

### 2. Clarity, Reproducibility, and Presentation
- Clarity, Presentation, and Exposition Issues
- Clarity, Writing, and Organization Problems
- Missing, Incomplete, or Vague Explanation/Notation
- Unclear Problem Formalization or Ambiguity in Explanation
- Clarity, Reproducibility, and Presentation of Methods
- Reproducibility and Implementation Details

### 3. Novelty, Positioning, and Contextualization
- Novelty, Distinction from Prior Work, and Citation Issues
- Lack of Novelty or Incremental Technical Contribution
- Overclaiming Novelty or Insufficient Distinction from Prior Work
- Lack of Context, Related Work, or Missing References
- Comparison to Prior or Related Work and Novelty Justification
- Citation, Positioning, and Surveying of Literature
- Dataset/Task/Protocol Novelty and Relevance

### 4. Theoretical Foundation, Analysis, and Justification
- Theoretical Foundation and Justification Issues
- Theoretical Analysis and Conceptual Clarity
- Unclear or Missing Theoretical Guarantees/Contribution
- Theoretical Issues and Insufficient Justification/Connection
- Misapplied Concepts or Flawed Assumptions

### 5. Algorithmic/Technical Details, Problem Formulation, and Methodology
- Clarity and Reproducibility of Technical and Methodological Details
- Specific Technical or Algorithmic Concerns
- Confusing or incomplete modeling/notation
- Empirical Evaluation, Sample Efficiency, and Comparisons in Meta-learning/Data Valuation
- Reward Formulation and Evaluation (for RL/IL papers)
- Problem Formalization and Methodological Justification or Conceptual Errors
- Presentation and Clarity of Algorithms and Experiments
- Implementation/Hyperparameter/Training Details

### 6. Review/Conference Policy and Paper Quality Issues
- Violation of Double Blind or Submission Guidelines
- Underlength, Poor Formatting, Non-anonymity
- Out-of-scope for Conference, Misfit Domain

### 7. Specific Topical/Miscellaneous Issues
- Mutual Information Usage and Edge Feature Integration in Graph Neural Networks
- Defense and Attack Training Schemes for Adversarial Robustness
- Discussion of Specific Models/Techniques (e.g., Recommender feedback formulations, Skill Discovery framing)
- Domain-specific or Task-specific Comments (as specialized sub-nodes as relevant)

---

## _Level 1: Further-Refined, Condensed Labels for Node Clarity_

Below, at Level 1, clusters are named for the reviewer focus and condensed where possible:

### For Level 2 Cluster: "Empirical Evaluation, Experimental Rigor, and Comparison"
- Experimental Evaluation & Comparative Analysis
- Validation/Ablation & Component Contribution
- Generalizability and Scalability of Approach

### For Level 2 Cluster: "Clarity, Reproducibility, and Presentation"
- Explanation and Notation Quality
- Writing Clarity & Organization
- Reproducibility, Method Specification & Implementation

### For Level 2 Cluster: "Novelty, Positioning, and Contextualization"
- Novelty & Distinction from Prior Art
- Prior Work Comparison & Contextual References
- Dataset/Task Selection & Justification

### For Level 2 Cluster: "Theoretical Foundation, Analysis, and Justification"
- Theoretical Assumptions and Guarantees
- Analysis, Proofs, and Conceptual Soundness
- Model/Conceptual Misapplication

### For Level 2 Cluster: "Algorithmic/Technical Details, Problem Formulation, and Methodology"
- Problem/Formalization and Technical Design
- Algorithm Specification & Methodological Details
- Training, Hyperparameter & Implementation Issues

### For Level 2 Cluster: "Review/Conference Policy and Paper Quality Issues"
- Submission and Review Policy Violations
- Paper Formatting, Length, and Scope Fit

### For Level 2 Cluster: "Specific Topical/Miscellaneous Issues"
- Topical Concerns: e.g., Mutual Information GNNs, Adversarial Defense Strategy, Skill Discovery, etc.
- Task/Domain-Specific Comments

---

## Example Pathways Through the Hierarchy

#### Example 1 (Empirical concerns for a label noise paper)
- Level 3: Empirical Evaluation, Experimental Rigor, and Comparison
  - Level 2: Empirical Validation and Experimental Comparison Issues
    - Level 1: Experimental Evaluation & Comparative Analysis

#### Example 2 (Clarity issues for a new skill discovery method)
- Level 3: Clarity, Reproducibility, and Presentation
  - Level 2: Clarity, Presentation, and Exposition Issues
    - Level 1: Writing Clarity & Organization

#### Example 3 (Novelty concerns for a new object detection approach)
- Level 3: Novelty, Positioning, and Contextualization
  - Level 2: Novelty, Distinction from Prior Work, and Citation Issues
    - Level 1: Novelty & Distinction from Prior Art

#### Example 4 (Theoretical analysis issues for a GNN paper)
- Level 3: Theoretical Foundation, Analysis, and Justification
  - Level 2: Theoretical Foundation and Justification Issues
    - Level 1: Theoretical Assumptions and Guarantees

---

## Commentary on Merge/Refinement

- "Empirical Evaluation, Validation, and Experimental Comparison" absorbs any issues about experiments, baselines, ablation, or comparative setup.
- "Clarity, Reproducibility, and Presentation" collects all concerns about how the paper is explained, written, or the completeness of method explanations.
- "Novelty, Prior Work Distinction, and Contextualization" merges lack of novelty/discussion/issues with missing related work/citations/contextual placement.
- "Theoretical Foundation, Analysis, and Justification" merges theoretical justification, conceptual issues, and lack of guarantees.
- "Algorithmic/Technical Details, Problem Formulation, and Methodology" incorporates concerns about technical correctness, formalization, or experiment/method details.
- "Review/Conference Policy and Paper Quality Issues" absorbs all aspects of submission or review guideline violation and paper format/scope fit.
- "Specific Topical/Miscellaneous Issues" is reserved for domain/task-specific issues or clusters not easily merged above.

---

**This refined structure should be both more readable and more helpful for downstream analysis or review.**

In [None]:


# Assume this variable holds the refined hierarchy produced by the LLM previously.
# Replace the triple-quoted string below with the actual refined hierarchy text.


# Create a jinja2 prompt template for the follow-up refinement.
prompt_template = """
The following is a refined cluster hierarchy based on an earlier task:

{{ refined_hierarchy }}

I would like to further refine this cluster hierarchy.
In particular, please modify the hierarchy by **separating out the "Ablation Studies" aspects as their own distinct category**.
That is, instead of having ablation studies merged with other experimental issues, please create a separate node (or nodes) for ablation studies.
Ensure that the overall hierarchical structure remains clear and consistent.

Return the updated refined cluster hierarchy in a clear, structured format.
"""

# Render the prompt using the jinja2 template
template = Template(prompt_template)
prompt = template.render(refined_hierarchy=refined_hierarchy)

print("Prompt sent to the LLM:")
print(prompt)

# Send the prompt to the LLM using its _call method.
# Assume 'llm' is an instance of your LLM class with _call implemented.
response = llm._call(prompt)

print("\nLLM Response:")
print(response)
