In [2]:
import re
import pandas as pd

# Load the whole RIS export into memory. "utf-8-sig" decodes plain UTF-8
# exactly like "utf-8" but also drops a leading byte-order mark, which would
# otherwise stay attached to the first tag of the first record.
with open("articles.ris", "r", encoding="utf-8-sig") as f:
    content = f.read()

# Cut the text into one chunk per record. The "ER  -" terminator is only
# honoured at the start of a line, so the same character sequence appearing
# inside a title or abstract no longer splits a record in two.
entries = re.split(r"^ER  -", content.strip(), flags=re.MULTILINE)

def extract_field(entry, tag):
    """Return the value of the first ``tag`` line found in an RIS record.

    A field line has the form ``TAG  - value``. The tag must be the first
    non-blank text on its line, so the same letters occurring inside another
    field's value (e.g. ``DO`` in "TODO - ...") are not mistaken for a field.
    Only spaces and tabs are accepted around the dash; the match never runs
    over a line break, so an empty field does not swallow the following line.

    Args:
        entry: Text of a single RIS record.
        tag: Field tag to look up, e.g. ``"TI"``. It is interpolated into a
            regular expression, so regex syntax inside it stays active.

    Returns:
        The stripped text after the separator on the first matching line,
        an empty string when that line carries no value, or ``None`` when
        no line of ``entry`` starts with ``tag``.
    """
    # (?:...) keeps the line anchor applied to the whole tag even when the
    # caller passes an alternation such as "TI|T1".
    pattern = rf"^[ \t]*(?:{tag})[ \t]+-[ \t]*(.*)"
    match = re.search(pattern, entry, flags=re.MULTILINE)
    return match.group(1).strip() if match else None

# One dict per non-blank record; these become the rows of the output table.
parsed_entries = []

for record in entries:
    # Splitting on the record terminator leaves whitespace-only fragments
    # (e.g. after the last record); only real records are processed.
    if record.strip():
        note = extract_field(record, "N1")
        parsed_entries.append({
            "rayyan_id": extract_field(record, "AN"),
            "title": extract_field(record, "TI"),
            "abstract": extract_field(record, "AB"),
            "DOI": extract_field(record, "DO"),
            # 1 when the N1 note mentions 'Included', 0 when it does not
            # or when the record has no N1 value at all.
            "label": int(bool(note) and 'Included' in note),
        })

df1 = pd.DataFrame(parsed_entries)

# Turn blank / whitespace-only cells into missing values, then give every
# missing cell the same "<Missing>" placeholder.
df1 = df1.replace(r'^\s*$', pd.NA, regex=True)
df1 = df1.fillna("<Missing>")

# Write the table to an Excel workbook without the index column.
df1.to_excel("parsed_articles.xlsx", index=False)