In [1]:
import pandas as pd
import json
import re
import csv
import time
from tqdm import tqdm
from ollama import chat


input_file = '/home/ali/Review_Quality_Benchmark/data/processed/neurips2023_1000_papers.json'
# Load data
df = pd.read_json(input_file)

llm_fields = [
    "llm_Comprehensiveness", "llm_Vagueness", "llm_Objectivity", "llm_Fairness", "llm_Actionability", 
    "llm_Constructiveness", "llm_Relevance Alignment", "llm_Clarity and Readability", "llm_Usage of Technical Terms",
    "llm_Factuality", "llm_Overall Quality", "llm_overall_score_100", "llm_Sentiment Polarity", "llm_Politeness", 
]


# Check for missing fields and add them if not present
for field in llm_fields:
    if field not in df.columns:
        df[field] = pd.NA

# Pattern to extract JSON block
pattern = re.compile(r"<review_assessment>\s*(\{.*?\})\s*</review_assessment>", re.DOTALL)

# Define prompt template
template = """# REVIEW-QUALITY JUDGE

## 0 — ROLE

You are **ReviewInspector-LLM**, a rigorous, impartial meta-reviewer.
Your goal is to assess the quality of a single peer-review against a predefined set of criteria and to provide precise, structured evaluations.

## 1 — INPUTS

Title: {title}
Abstract: {abstract}
Review: {review_text}

## 2 — EVALUATION CRITERIA

Return **only** the scale value or label at right (no rationale text).

| #  | Criterion                    | Allowed scale / label                       | Description                                                                |
| -- | ---------------------------- | ------------------------------------------- | -------------------------------------------------------------------------- |
| 1  | **Comprehensiveness**        | integer **0-5**                             | Extent to which the review covers all key aspects of the paper.            |
| 2  | **Usage of Technical Terms** | integer **0-5**                             | Appropriateness and frequency of domain-specific vocabulary.               |
| 3  | **Factuality**               | **factual / partially factual / unfactual** | Accuracy of the statements made in the review.                             |
| 4  | **Sentiment Polarity**       | **negative / neutral / positive**           | Overall sentiment conveyed by the reviewer.                                |
| 5  | **Politeness**               | **polite / neutral / impolite**             | Tone and manner of the review language.                                    |
| 6  | **Vagueness**                | **none / low / moderate / high / extreme**  | Degree of ambiguity or lack of specificity in the review.                  |
| 7  | **Objectivity**              | integer **0-5**                             | Presence of unbiased, evidence-based commentary.                           |
| 8  | **Fairness**                 | integer **0-5**                             | Perceived impartiality and balance in judgments.                           |
| 9  | **Actionability**            | integer **0-5**                             | Helpfulness of the review in suggesting clear next steps.                  |
| 10 | **Constructiveness**         | integer **0-5**                             | Degree to which the review offers improvements rather than just criticism. |
| 11 | **Relevance Alignment**      | integer **0-5**                             | How well the review relates to the content and scope of the paper.         |
| 12 | **Clarity and Readability**  | integer **0-5**                             | Ease of understanding the review, including grammar and structure.         |
| 13 | **Overall Quality**          | integer **0-100**                           | Holistic evaluation of the review's usefulness and professionalism.        |

## 3 — SCORING GUIDELINES

For 0-5 scales:

* 5 = Outstanding
* 4 = Strong
* 3 = Adequate
* 2 = Weak
* 1 = Very weak
* 0 = Absent/irrelevant

## 4 — ANALYSIS & COMPUTATION (silent)

1. Read and understand the review in the context of the paper title and abstract.
2. Extract quantitative and qualitative signals (e.g., term usage, factual consistency, tone, clarity).
3. Map observations to the corresponding scoring scales.

## 5 — OUTPUT FORMAT (strict)  
Return **exactly one** JSON block wrapped in the tag below — **no comments or extra text**.

```json
<review_assessment>
{{
  "paper_title": "{title}",
  "criteria": {{
    "Comprehensiveness":       ...,
    "Usage of Technical Terms":   ...,
    "Factuality":    ...,
    "Sentiment Polarity":      ...,
    "Politeness":  ...,
    "Vagueness":          ...,
    "Objectivity":             ...,
    "Fairness":         ...,
    "Actionability":        ...,
    "Constructiveness":    ...,
    "Relevance Alignment":    ...,
    "Clarity and Readability":    ...,
    "Relevance Alignment":    ...,
    "Overall Quality":     ...
  }},
  "overall_score_100": ...
}}
</review_assessment>
```
"""

df

Unnamed: 0,submission_id,submission_number,submission_creation_date,submission_authors,submission_title,submission_abstract,reviewer,creation_date,last_modification_date,review_rating,...,llm_Actionability,llm_Constructiveness,llm_Relevance Alignment,llm_Clarity and Readability,llm_Usage of Technical Terms,llm_Factuality,llm_Overall Quality,llm_overall_score_100,llm_Sentiment Polarity,llm_Politeness
0,zyhxRc9bew,10819,1683789038840,"[~Hao_Sun1, ~Boris_van_Breugel2, ~Jonathan_Cra...",What is Flagged in Uncertainty Quantification?...,Uncertainty quantification (UQ) is essential f...,Reviewer_AvJq,1688368213177,1702411303415,6,...,,,,,,,,,,
1,zyhxRc9bew,10819,1683789038840,"[~Hao_Sun1, ~Boris_van_Breugel2, ~Jonathan_Cra...",What is Flagged in Uncertainty Quantification?...,Uncertainty quantification (UQ) is essential f...,Reviewer_7E4k,1688505633161,1702411303319,6,...,,,,,,,,,,
2,zyhxRc9bew,10819,1683789038840,"[~Hao_Sun1, ~Boris_van_Breugel2, ~Jonathan_Cra...",What is Flagged in Uncertainty Quantification?...,Uncertainty quantification (UQ) is essential f...,Reviewer_hHZH,1688552936677,1702411303221,6,...,,,,,,,,,,
3,zyhxRc9bew,10819,1683789038840,"[~Hao_Sun1, ~Boris_van_Breugel2, ~Jonathan_Cra...",What is Flagged in Uncertainty Quantification?...,Uncertainty quantification (UQ) is essential f...,Reviewer_sVgx,1688657604892,1702411303144,6,...,,,,,,,,,,
4,zsOOqjaj2z,5668,1683686700553,"[~Yuanyuan_Wang5, ~Xi_Geng1, ~Wei_Huang8, ~Biw...",Generator Identification for Linear SDEs with ...,"In this paper, we present conditions for ident...",Reviewer_Huoq,1688427159321,1702411021240,7,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4460,00EKYYu3fD,6838,1683719044332,"[~Tianyang_Hu1, ~Fei_Chen8, ~Haonan_Wang1, ~Ji...",Complexity Matters: Rethinking the Latent Spac...,"In generative modeling, numerous successful ap...",Reviewer_ssMC,1688676041356,1702411081106,6,...,,,,,,,,,,
4461,009LK0vLcY,10107,1683778565991,"[~Mehrdad_Ghadiri2, ~David_Arbour1, ~Tung_Mai1...",Finite Population Regression Adjustment and No...,The design and analysis of randomized experime...,Reviewer_UdXn,1688449656890,1702411268900,7,...,,,,,,,,,,
4462,009LK0vLcY,10107,1683778565991,"[~Mehrdad_Ghadiri2, ~David_Arbour1, ~Tung_Mai1...",Finite Population Regression Adjustment and No...,The design and analysis of randomized experime...,Reviewer_ZgSr,1688485585833,1702411268818,5,...,,,,,,,,,,
4463,009LK0vLcY,10107,1683778565991,"[~Mehrdad_Ghadiri2, ~David_Arbour1, ~Tung_Mai1...",Finite Population Regression Adjustment and No...,The design and analysis of randomized experime...,Reviewer_HGb9,1688665406904,1702411268706,4,...,,,,,,,,,,


In [2]:
# Process each row
# Set the temperature parameter for the llama model
temperature = 0
seed = 42


# Process each row
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring with LLM"):
    # Skip if all llm fields are already filled
    if all(pd.notna(row.get(field, pd.NA)) for field in llm_fields):
        continue
    if idx >= 50:
        break

    prompt = template.format(
        title=row['submission_title'],
        abstract=row['submission_abstract'],
        review_text=row['total_review']
    )
    
    for attempt in range(5):
        try:
            response = chat("llama3:8b", messages=[{'role': 'user', 'content': prompt}], options={'temperature': temperature, 'seed': seed})
            content = response['message']['content']
            match = pattern.search(content)
            if not match:
                raise ValueError("No JSON block found")

            parsed = json.loads(match.group(1))
            print(parsed["overall_score_100"])
            for key, val in parsed["criteria"].items():
                df.at[idx, f"llm_{key}"] = val
            df.at[idx, "llm_overall_score_100"] = parsed["overall_score_100"]

            # Save after every successful row
            # df.to_csv(input_file, index=False, quoting=csv.QUOTE_ALL)
            break

        except Exception as e:
            print(f"❌ Error at row {idx}, attempt {attempt + 1}: {e}")

Scoring with LLM:   0%|          | 0/4465 [00:00<?, ?it/s]

❌ Error at row 0, attempt 1: Expecting value: line 6 column 22 (char 231)


❌ Error at row 0, attempt 2: Expecting value: line 6 column 22 (char 231)


❌ Error at row 0, attempt 3: Expecting value: line 6 column 22 (char 231)


❌ Error at row 0, attempt 4: Expecting value: line 6 column 22 (char 231)


Scoring with LLM:   0%|          | 1/4465 [00:16<20:43:10, 16.71s/it]

❌ Error at row 0, attempt 5: Expecting value: line 6 column 22 (char 231)


❌ Error at row 1, attempt 1: Expecting value: line 6 column 22 (char 231)


Scoring with LLM:   0%|          | 2/4465 [00:24<14:12:54, 11.47s/it]

80


❌ Error at row 2, attempt 1: Expecting value: line 6 column 22 (char 231)


❌ Error at row 2, attempt 2: Expecting value: line 6 column 22 (char 231)


❌ Error at row 2, attempt 3: Expecting value: line 6 column 22 (char 231)


❌ Error at row 2, attempt 4: Expecting value: line 6 column 22 (char 231)


Scoring with LLM:   0%|          | 3/4465 [00:42<18:02:34, 14.56s/it]

❌ Error at row 2, attempt 5: Expecting value: line 6 column 22 (char 231)


Scoring with LLM:   0%|          | 4/4465 [00:46<12:43:00, 10.26s/it]

80


Scoring with LLM:   0%|          | 5/4465 [00:50<9:44:57,  7.87s/it] 

70


Scoring with LLM:   0%|          | 6/4465 [00:53<7:58:09,  6.43s/it]

60


Scoring with LLM:   0%|          | 7/4465 [00:57<6:51:02,  5.53s/it]

60


Scoring with LLM:   0%|          | 8/4465 [01:00<6:01:41,  4.87s/it]

60


Scoring with LLM:   0%|          | 9/4465 [01:05<5:45:46,  4.66s/it]

70


Scoring with LLM:   0%|          | 10/4465 [01:08<5:20:04,  4.31s/it]

70


❌ Error at row 10, attempt 1: Expecting value: line 6 column 22 (char 187)


❌ Error at row 10, attempt 2: Expecting value: line 6 column 22 (char 187)


❌ Error at row 10, attempt 3: Expecting value: line 6 column 22 (char 187)


Scoring with LLM:   0%|          | 11/4465 [01:22<9:03:53,  7.33s/it]

80


Scoring with LLM:   0%|          | 12/4465 [01:26<7:46:13,  6.28s/it]

80


Scoring with LLM:   0%|          | 13/4465 [01:30<6:51:21,  5.54s/it]

80


Scoring with LLM:   0%|          | 14/4465 [01:34<6:28:33,  5.24s/it]

80


Scoring with LLM:   0%|          | 15/4465 [01:38<5:55:48,  4.80s/it]

70


Scoring with LLM:   0%|          | 16/4465 [01:42<5:34:01,  4.50s/it]

80


Scoring with LLM:   0%|          | 17/4465 [01:46<5:18:29,  4.30s/it]

80


Scoring with LLM:   0%|          | 18/4465 [01:50<5:15:21,  4.25s/it]

80


Scoring with LLM:   0%|          | 19/4465 [01:55<5:19:57,  4.32s/it]

80


Scoring with LLM:   0%|          | 20/4465 [01:58<5:05:27,  4.12s/it]

70


Scoring with LLM:   0%|          | 21/4465 [02:04<5:48:15,  4.70s/it]

60


❌ Error at row 21, attempt 1: Expecting value: line 6 column 22 (char 206)


Scoring with LLM:   0%|          | 22/4465 [02:12<6:51:23,  5.56s/it]

80


Scoring with LLM:   1%|          | 23/4465 [02:15<6:03:02,  4.90s/it]

70


Scoring with LLM:   1%|          | 24/4465 [02:19<5:34:00,  4.51s/it]

70


Scoring with LLM:   1%|          | 25/4465 [02:22<5:09:16,  4.18s/it]

80


Scoring with LLM:   1%|          | 26/4465 [02:26<4:53:55,  3.97s/it]

70


Scoring with LLM:   1%|          | 27/4465 [02:28<4:25:43,  3.59s/it]

70


❌ Error at row 27, attempt 1: Expecting value: line 6 column 22 (char 217)


❌ Error at row 27, attempt 2: Expecting value: line 6 column 22 (char 217)


❌ Error at row 27, attempt 3: Expecting value: line 6 column 22 (char 217)


❌ Error at row 27, attempt 4: Expecting value: line 6 column 22 (char 217)


Scoring with LLM:   1%|          | 28/4465 [02:43<8:36:28,  6.98s/it]

❌ Error at row 27, attempt 5: Expecting value: line 6 column 22 (char 217)


Scoring with LLM:   1%|          | 29/4465 [02:47<7:21:04,  5.97s/it]

80


Scoring with LLM:   1%|          | 30/4465 [02:51<6:30:55,  5.29s/it]

80


Scoring with LLM:   1%|          | 31/4465 [02:53<5:38:22,  4.58s/it]

70


Scoring with LLM:   1%|          | 32/4465 [02:56<4:59:59,  4.06s/it]

20


Scoring with LLM:   1%|          | 33/4465 [02:59<4:38:15,  3.77s/it]

70


Scoring with LLM:   1%|          | 34/4465 [03:03<4:23:22,  3.57s/it]

80


Scoring with LLM:   1%|          | 35/4465 [03:06<4:23:04,  3.56s/it]

80


Scoring with LLM:   1%|          | 36/4465 [03:09<4:09:03,  3.37s/it]

85


Scoring with LLM:   1%|          | 37/4465 [03:12<4:08:47,  3.37s/it]

80


Scoring with LLM:   1%|          | 38/4465 [03:15<3:59:20,  3.24s/it]

80


Scoring with LLM:   1%|          | 39/4465 [03:19<4:01:08,  3.27s/it]

80


Scoring with LLM:   1%|          | 40/4465 [03:22<4:02:35,  3.29s/it]

80


Scoring with LLM:   1%|          | 41/4465 [03:25<4:05:25,  3.33s/it]

70


Scoring with LLM:   1%|          | 42/4465 [03:29<4:13:02,  3.43s/it]

70


Scoring with LLM:   1%|          | 43/4465 [03:33<4:14:35,  3.45s/it]

70


Scoring with LLM:   1%|          | 44/4465 [03:36<4:14:00,  3.45s/it]

80


Scoring with LLM:   1%|          | 45/4465 [03:39<4:13:00,  3.43s/it]

70


Scoring with LLM:   1%|          | 46/4465 [03:43<4:10:40,  3.40s/it]

In [None]:
df.to_json('/home/ali/Review_Quality_Benchmark/data/processed/neurips2023_1000papers_llm.json', orient='records')

In [None]:
# Filter the first 50 rows and check for rows without null values in "llm_" columns
non_null_count = df.iloc[:][[col for col in df.columns if col.startswith("llm_")]].dropna().shape[0]
print(non_null_count)
print(df.shape)

In [None]:
df