In [2]:
import bibtexparser
from collections import defaultdict

def load_bib_file(filename, encoding='utf-8'):
    """Parse a BibTeX file and return the resulting bibtexparser database.

    Args:
        filename: Path of the ``.bib`` file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        Whatever ``bibtexparser.load`` returns for the opened file; the rest of
        this notebook reads its ``entries`` list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    # Explicit encoding: bibliographies routinely contain accented author names,
    # and the locale default is not UTF-8 on every platform.
    with open(filename, encoding=encoding) as bibtex_file:
        return bibtexparser.load(bibtex_file)

def save_bib_file(filename, entries, encoding='utf-8'):
    """Serialize BibTeX entries and write them to ``filename``.

    Args:
        filename: Destination path; an existing file is overwritten.
        entries: Iterable of entry dicts in the form bibtexparser produces
            (each carrying at least ``'ID'`` and ``'ENTRYTYPE'``).
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            written file does not depend on the platform's locale encoding.

    Raises:
        OSError: If the destination cannot be opened for writing.
        Exception: Whatever the BibTeX writer raises for malformed entries;
            in that case the destination file is left untouched.
    """
    db = bibtexparser.bibdatabase.BibDatabase()
    db.entries = list(entries)
    # Serialize *before* opening the file: opening with 'w' truncates it, so a
    # writer failure would otherwise leave an empty or destroyed file behind.
    serialized = bibtexparser.bwriter.BibTexWriter().write(db)
    with open(filename, 'w', encoding=encoding) as bibtex_file:
        bibtex_file.write(serialized)

def entry_to_bibtex(entry):
    """Render a single entry dict as BibTeX source text.

    The entry is wrapped in a throwaway one-element database and passed
    through bibtexparser's ``BibTexWriter``.

    Args:
        entry: One entry dict as found in a bibtexparser database's
            ``entries`` list.

    Returns:
        The string produced by ``BibTexWriter.write`` for that database.
    """
    single_entry_db = bibtexparser.bibdatabase.BibDatabase()
    single_entry_db.entries = [entry]
    return bibtexparser.bwriter.BibTexWriter().write(single_entry_db)

def find_duplicate_entries(bib_database):
    """Collect the IDs that occur with more than one distinct serialization.

    Every entry is rendered with ``entry_to_bibtex`` and the resulting strings
    are grouped by entry ID. Because each group is a set, copies that render
    to the same text count once, so an ID is reported only when at least two
    *different* renderings exist for it.

    Args:
        bib_database: Object exposing an ``entries`` iterable of entry dicts.

    Returns:
        dict mapping each conflicting ID to the set of its distinct BibTeX
        strings.
    """
    variants_by_id = {}
    for record in bib_database.entries:
        variants_by_id.setdefault(record['ID'], set()).add(entry_to_bibtex(record))

    conflicting = {}
    for key, variants in variants_by_id.items():
        if len(variants) >= 2:
            conflicting[key] = variants
    return conflicting

def check_deduplication_integrity(original_entries, dedup_entries):
    """Print a report comparing deduplicated entries with the original duplicates.

    Three checks are reported on stdout:
      1. IDs present in ``original_entries`` but absent from ``dedup_entries``.
      2. IDs present in ``dedup_entries`` but absent from ``original_entries``.
      3. For IDs present in both, whether the deduplicated value is a member
         of the original collection stored under that ID.

    Args:
        original_entries: Mapping of ID to a collection of candidate values
            (the caller passes sets of serialized BibTeX strings).
        dedup_entries: Mapping of ID to the single chosen value.

    Returns:
        None. Problems are only printed; nothing is raised.
    """
    original_ids = set(original_entries.keys())
    dedup_ids = set(dedup_entries.keys())
    missing_ids = original_ids - dedup_ids
    new_ids = dedup_ids - original_ids

    if not missing_ids:
        print("All duplicate element IDs are present in the deduplicated set.")
    else:
        print(f"Error: The following IDs are missing from the deduplicated set: {missing_ids}")

    if not new_ids:
        print("All deduplicated IDs are present in the original set.")
    else:
        print(f"Error: The following IDs are new in the deduplicated set: {new_ids}")

    for dedup_id in dedup_entries:
        # IDs unknown to the original set were already reported above.
        if dedup_id not in original_entries:
            continue
        if dedup_entries[dedup_id] in original_entries[dedup_id]:
            continue
        print(f"Error: Deduplicated entry for ID '{dedup_id}' does not match any original entry.")

    print("Integrity check completed.")

def merge_entries(original_db, dedup_db):
    """Combine the original entries with their deduplicated replacements.

    Original entries whose ID appears in ``dedup_db`` are dropped in favour of
    the deduplicated version. Among the remaining original entries, exact
    copies (same ID and equal field dicts) are emitted only once; without this
    step, byte-identical duplicates would reach the merged output as repeated
    citation keys. Entries sharing an ID but differing in content are all
    kept, so no information is discarded silently.

    Args:
        original_db: Object exposing ``entries``, the full list of entry dicts.
        dedup_db: Object exposing ``entries``, the resolved replacement dicts.

    Returns:
        list of entry dicts: surviving originals in their original order,
        followed by every entry of ``dedup_db``.
    """
    dedup_ids = {entry['ID'] for entry in dedup_db.entries}

    merged_entries = []
    kept_by_id = {}  # ID -> original entries already emitted for that ID
    for entry in original_db.entries:
        entry_id = entry['ID']
        if entry_id in dedup_ids:
            continue
        already_kept = kept_by_id.setdefault(entry_id, [])
        # Dict equality ignores key order, so reordered copies match as well.
        if entry in already_kept:
            continue
        already_kept.append(entry)
        merged_entries.append(entry)

    merged_entries.extend(dedup_db.entries)
    return merged_entries

# Main execution
def main():
    """Run the duplicate-extraction / integrity-check / merge pipeline.

    Steps:
      1. Load ``refs_merged.bib``.
      2. Write every entry whose ID occurs with conflicting content to
         ``duplicate_entries.bib``.
      3. Load ``deduplicated_entries.bib`` (expected to exist already, holding
         one resolved entry per conflicting ID).
      4. Print an integrity report comparing the resolved entries with the
         original conflicting variants.
      5. Merge and write the result to ``refs_merged_dedup.bib``.

    The merge is performed even when the integrity report lists errors.
    """
    original_file = 'refs_merged.bib'
    duplicate_file = 'duplicate_entries.bib'
    deduplicated_file = 'deduplicated_entries.bib'
    merged_file = 'refs_merged_dedup.bib'

    # Load original database
    original_db = load_bib_file(original_file)

    # Find duplicate entries: ID -> set of distinct serialized variants
    duplicate_entries = find_duplicate_entries(original_db)

    # save_bib_file needs entry dicts, while find_duplicate_entries yields
    # serialized strings, so gather the matching dicts from the database and
    # keep one dict per distinct serialization.
    written_variants = set()
    duplicate_records = []
    for entry in original_db.entries:
        if entry['ID'] not in duplicate_entries:
            continue
        variant_key = (entry['ID'], entry_to_bibtex(entry))
        if variant_key in written_variants:
            continue
        written_variants.add(variant_key)
        duplicate_records.append(entry)
    save_bib_file(duplicate_file, duplicate_records)

    # Load deduplicated entries (assuming this file has been created by your ML/NLP process)
    dedup_db = load_bib_file(deduplicated_file)

    # find_duplicate_entries already returns exactly the ID -> variants mapping
    # the integrity check needs; no need to rescan the database per ID.
    original_entries = duplicate_entries
    # NOTE: if the deduplicated file repeats an ID, the last occurrence wins here.
    dedup_entries = {entry['ID']: entry_to_bibtex(entry) for entry in dedup_db.entries}

    # Perform integrity check
    check_deduplication_integrity(original_entries, dedup_entries)

    # Merge original and deduplicated entries
    merged_entries = merge_entries(original_db, dedup_db)

    # Save merged entries
    save_bib_file(merged_file, merged_entries)

    print(f"Merged entries saved to {merged_file}")

In [6]:
import os
import bibtexparser

# Citation keys to look up in the .bib files of the working directory
bibtex_ids = [
    "latouche1999introduction",
    "braun2010variational",
    "hazra2019det",
]

def search_bib_files(ids=None, directory='.'):
    """Print every entry matching the given IDs in the ``.bib`` files of a directory.

    For each ID, in order, every ``.bib`` file (in sorted filename order) is
    scanned and each matching entry is printed twice: first as the raw entry
    dict, then as BibTeX text via ``entry_to_bibtex``.

    Args:
        ids: Iterable of citation keys to look for. Defaults to the
            module-level ``bibtex_ids`` list.
        directory: Directory whose ``.bib`` files are searched. Defaults to
            the current working directory.

    Returns:
        None. Results are written to stdout.
    """
    if ids is None:
        ids = bibtex_ids

    # Parse each file once up front instead of once per requested ID.
    databases = []
    for filename in sorted(os.listdir(directory)):  # sorted: listdir order is arbitrary
        path = os.path.join(directory, filename)
        # Match the real extension only ('x.bib.bak' is not a BibTeX file)
        # and skip directories that merely happen to be named '*.bib'.
        if not filename.endswith('.bib') or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as bibtex_file:
            databases.append(bibtexparser.load(bibtex_file))

    for ID in ids:
        for bib_database in databases:
            for entry in bib_database.entries:
                if entry['ID'] == ID:
                    print(entry)
                    print(entry_to_bibtex(entry))

# Entry point: run the lookup only when this code executes as the main program
# (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    search_bib_files()

{'publisher': 'SIAM', 'year': '1999', 'author': 'Latouche, Guy and Ramaswami, Vaidyanathan', 'title': 'Introduction to matrix analytic methods in stochastic modeling', 'ENTRYTYPE': 'book', 'ID': 'latouche1999introduction'}
@book{latouche1999introduction,
 author = {Latouche, Guy and Ramaswami, Vaidyanathan},
 publisher = {SIAM},
 title = {Introduction to matrix analytic methods in stochastic modeling},
 year = {1999}
}

{'year': '1999', 'title': 'Introduction to matrix analytic methods in stochastic modeling', 'publisher': 'SIAM', 'author': 'Latouche, Guy and Ramaswami, Vaidyanathan', 'ENTRYTYPE': 'book', 'ID': 'latouche1999introduction'}
@book{latouche1999introduction,
 author = {Latouche, Guy and Ramaswami, Vaidyanathan},
 publisher = {SIAM},
 title = {Introduction to matrix analytic methods in stochastic modeling},
 year = {1999}
}



Entry type control not standard. Not considered.


{'year': '1999', 'title': 'Introduction to matrix analytic methods in stochastic modeling', 'publisher': 'SIAM', 'author': 'Latouche, Guy and Ramaswami, Vaidyanathan', 'ENTRYTYPE': 'book', 'ID': 'latouche1999introduction'}
@book{latouche1999introduction,
 author = {Latouche, Guy and Ramaswami, Vaidyanathan},
 publisher = {SIAM},
 title = {Introduction to matrix analytic methods in stochastic modeling},
 year = {1999}
}

{'publisher': 'Taylor \\& Francis', 'year': '2010', 'pages': '324--335', 'number': '489', 'volume': '105', 'journal': 'Journal of the American Statistical Association', 'author': 'Braun, Michael and McAuliffe, Jon', 'title': 'Variational inference for large-scale models of discrete choice', 'ENTRYTYPE': 'article', 'ID': 'braun2010variational'}
@article{braun2010variational,
 author = {Braun, Michael and McAuliffe, Jon},
 journal = {Journal of the American Statistical Association},
 number = {489},
 pages = {324--335},
 publisher = {Taylor \& Francis},
 title = {Variatio

Entry type control not standard. Not considered.


{'year': '2010', 'volume': '105', 'title': 'Variational inference for large-scale models of discrete choice', 'publisher': 'Taylor \\& Francis', 'pages': '324--335', 'number': '489', 'journal': 'Journal of the American Statistical Association', 'author': 'Braun, Michael and McAuliffe, Jon', 'ENTRYTYPE': 'article', 'ID': 'braun2010variational'}
@article{braun2010variational,
 author = {Braun, Michael and McAuliffe, Jon},
 journal = {Journal of the American Statistical Association},
 number = {489},
 pages = {324--335},
 publisher = {Taylor \& Francis},
 title = {Variational inference for large-scale models of discrete choice},
 volume = {105},
 year = {2010}
}

{'year': '2010', 'volume': '105', 'title': 'Variational inference for large-scale models of discrete choice', 'publisher': 'Taylor \\& Francis', 'pages': '324--335', 'number': '489', 'journal': 'Journal of the American Statistical Association', 'author': 'Braun, Michael and McAuliffe, Jon', 'ENTRYTYPE': 'article', 'ID': 'braun201

Entry type control not standard. Not considered.


{'year': '2019', 'booktitle': 'Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics', 'author': 'Hazra, Sayan and Banerjee, Soumi and Ghosh, Kripabandhu and Ghosh, Saptarshi and Mehta, Parth', 'title': 'A DET for Natural Language Inference: Generating Natural Language Inference Explanation Trees using Deep Generative Models', 'ENTRYTYPE': 'inproceedings', 'ID': 'hazra2019det'}
@inproceedings{hazra2019det,
 author = {Hazra, Sayan and Banerjee, Soumi and Ghosh, Kripabandhu and Ghosh, Saptarshi and Mehta, Parth},
 booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
 title = {A DET for Natural Language Inference: Generating Natural Language Inference Explanation Trees using Deep Generative Models},
 year = {2019}
}

{'year': '2019', 'title': 'A DET for Natural Language Inference: Generating Natural Language Inference Explanation Trees using Deep Generative Models', 'booktitle': 'Proceedings of the 57th An