In [1]:
import re

# Words that are ignored when they appear at the very beginning of a title.
STOPWORDS = ["a", "an", "the", "on", "in", "of", "at", "for", "to", "from", "with", "about"]

def clean_title(title: str) -> str:
    """Strip surrounding whitespace and all leading stopwords from a title.

    Every stopword at the start of the title (matched case-insensitively and
    followed by whitespace) is removed in a single call, so
    "On the Origin of Species" becomes "Origin of Species". Stopwords that
    appear later in the title are left untouched, as is a stopword that is
    only a prefix of a longer word ("Theory") or is the entire title ("the").

    Args:
        title: The raw title string.

    Returns:
        The trimmed title with its leading run of stopwords removed.
    """
    # Escape each word so an entry containing regex metacharacters is still
    # matched literally. The pattern is built per call so that edits to
    # STOPWORDS take effect without redefining this function.
    alternation = "|".join(re.escape(word) for word in STOPWORDS)
    # The outer "+" consumes the whole run of leading stopwords at once,
    # instead of requiring the caller to re-apply the function.
    leading_stopwords = rf"^(?:(?:{alternation})\s+)+"
    return re.sub(leading_stopwords, "", title.strip(), flags=re.IGNORECASE)

# Check citation keys against their titles

In [2]:
import bibtexparser

# Load the .bib file
with open("ref.bib", "r", encoding="utf-8") as bib_file:
    bib_database = bibtexparser.load(bib_file)

# Check that each citation key ends with the first significant word of its
# title. Only this title suffix is validated; the author/year part of the key
# is not checked. The scan stops at the first mismatch that is found.
for entry in bib_database.entries:
    key = entry['ID']
    title = entry.get('title')

    # .get() returns None when an entry has no title field; there is nothing
    # to compare the key against, so report it and move on instead of crashing.
    if not title:
        print(f"Skipping {key}: no title field")
        continue

    # Re-apply clean_title until the result stops changing, so every leading
    # stopword is removed no matter how many of them start the title.
    prev = title
    while True:
        cleaned_title = clean_title(prev)
        if prev == cleaned_title:
            break
        prev = cleaned_title

    title_words = cleaned_title.split()
    if not title_words:
        # A whitespace-only title would otherwise raise IndexError below.
        print(f"Skipping {key}: title contains no words")
        continue

    # Keyword = first word of the cleaned title, cut at the first
    # '-', ':', ',' or '.', and lower-cased.
    title_keyword = title_words[0]
    for separator in ('-', ':', ',', '.'):
        title_keyword = title_keyword.split(separator)[0]
    title_keyword = title_keyword.lower()

    if not key.endswith(title_keyword):
        print(f"Key suffix mismatch: {key} should end with -----> {title_keyword}")
        break

# Duplicate removal

In [3]:
import bibtexparser

# Read the bibliography from disk
with open("ref.bib", "r", encoding="utf-8") as bib_file:
    bib_database = bibtexparser.load(bib_file)

entries = bib_database.entries
print(f"Total entries before deduplication: {len(entries)}")

# Keep only the first entry encountered for each 'ID'. A dict preserves
# insertion order, so the surviving entries stay in their original order.
first_entry_by_id = {}
for entry in entries:
    first_entry_by_id.setdefault(entry['ID'], entry)

seen_ids = set(first_entry_by_id)
unique_entries = list(first_entry_by_id.values())

# Swap the deduplicated list into the database object
bib_database.entries = unique_entries

# Write the result to a new .bib file, leaving the input file untouched
with open("ref_cleaned.bib", "w", encoding="utf-8") as bib_file:
    bibtexparser.dump(bib_database, bib_file)

print(f"Total entries after deduplication: {len(unique_entries)}")

Total entries before deduplication: 158
Total entries after deduplication: 147


# New bib: normalize authors and rebuild citation keys

In [5]:
import bibtexparser

# Load the .bib file
with open("export.bib", "r", encoding="utf-8") as bib_file:
    bib_database = bibtexparser.load(bib_file)


def _to_last_first(name):
    """Return a single author name in "Last, First" order.

    A name that already contains a comma, or that consists of fewer than two
    whitespace-separated tokens, is returned as-is apart from whitespace
    normalization. Otherwise the final token is treated as the last name and
    everything before it as the given names.
    """
    tokens = name.split()
    if "," in name or len(tokens) < 2:
        return " ".join(tokens)
    return f"{tokens[-1]}, {' '.join(tokens[:-1])}"


# Rewrite every entry's author list as "Last, First and Last, First ..." and
# replace its ID with <first author's last name><year><first title keyword>.
# NOTE: different entries can end up with the same rebuilt ID when they share
# first author, year and first title keyword.
for entry in bib_database.entries:
    key = entry['ID']
    year = entry.get('year')
    title = entry.get('title')
    raw_author = entry.get('author')

    # .get() yields None for absent fields. Without all three parts the key
    # cannot be rebuilt (a missing year would otherwise put the literal text
    # "None" into the key), so such entries are reported and left untouched.
    missing = [
        field
        for field, value in (('author', raw_author), ('year', year), ('title', title))
        if not value
    ]
    if missing:
        print(f"Skipping {key}: missing {', '.join(missing)}; entry left unchanged")
        continue

    # Collapse line breaks and repeated spaces first so that the " and "
    # separator is found even when the author list is wrapped across lines.
    author_names = " ".join(raw_author.split()).split(" and ")
    author = " and ".join(_to_last_first(name) for name in author_names)

    # Re-apply clean_title until the result stops changing, so every leading
    # stopword is removed no matter how many of them start the title.
    prev = title
    while True:
        cleaned_title = clean_title(prev)
        if prev == cleaned_title:
            break
        prev = cleaned_title

    title_words = cleaned_title.split()
    if not title_words:
        # A whitespace-only title would otherwise raise IndexError below.
        print(f"Skipping {key}: title contains no words; entry left unchanged")
        continue

    # Title keyword = first word of the cleaned title, cut at the first
    # '-', ':', ',' or '.', and lower-cased.
    title_keyword = title_words[0]
    for separator in ('-', ':', ',', '.'):
        title_keyword = title_keyword.split(separator)[0]
    title_keyword = title_keyword.lower()

    # Author keyword = start of the normalized author string (the first
    # author's last name), cut at the first ',', '.', ' ' or '-'.
    author_keyword = author
    for separator in (',', '.', ' ', '-'):
        author_keyword = author_keyword.split(separator)[0]
    author_keyword = author_keyword.strip().lower()

    reconstructed_key = f"{author_keyword}{year}{title_keyword}"
    entry['author'] = author
    entry['ID'] = reconstructed_key

with open("export_renamed.bib", "w", encoding="utf-8") as bib_file:
    bibtexparser.dump(bib_database, bib_file)


In [7]:
import bibtexparser

# Read the bibliography from disk
with open("ref_unified.bib", "r", encoding="utf-8") as bib_file:
    bib_database = bibtexparser.load(bib_file)

entries = bib_database.entries
print(f"Total entries before deduplication: {len(entries)}")

# Keep only the first entry encountered for each 'ID'. A dict preserves
# insertion order, so the surviving entries stay in their original order.
first_entry_by_id = {}
for entry in entries:
    first_entry_by_id.setdefault(entry['ID'], entry)

seen_ids = set(first_entry_by_id)
unique_entries = list(first_entry_by_id.values())

# Swap the deduplicated list into the database object
bib_database.entries = unique_entries

# Write the result to a new .bib file, leaving the input file untouched
with open("ref_unified_cleaned.bib", "w", encoding="utf-8") as bib_file:
    bibtexparser.dump(bib_database, bib_file)

print(f"Total entries after deduplication: {len(unique_entries)}")

Total entries before deduplication: 355
Total entries after deduplication: 265
