In [18]:
# Malpedia BibTeX snapshots to compare: the older export is the baseline and
# the newer export is the one searched for entries the baseline does not have.
old_bib_path = "malpedia-db_2022-07-18.bib"
new_bib_path = "malpedia-db_2024-11-12.bib"


In [19]:
def write_urls_to_file(urls, file_path):
    """
    Save URLs to a text file, one URL per line.

    The target file is opened in write mode (created, or truncated when it
    already exists) with UTF-8 encoding, and every URL is followed by a
    newline character.

    Args:
        urls (list): URL strings to store.
        file_path (str): Location of the file to write.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        # Each item is concatenated with '\n', so items must be strings.
        out.writelines(link + '\n' for link in urls)

In [21]:
import re

import bibtexparser

def replace_online_with_misc(content):
    """
    Rewrite ``@online`` entry headers as ``@misc`` to ensure compatibility.

    Only the entry-type token is touched: ``@online`` is replaced when it is
    followed (optionally after whitespace) by the ``{`` or ``(`` that opens a
    BibTeX entry. The match is case-insensitive, so ``@Online{`` and
    ``@ONLINE{`` are handled as well. Occurrences of ``@online`` inside field
    values (for example a URL such as ``https://medium.com/@onlinesec/...``)
    are left untouched, which a plain string replacement would corrupt.

    Args:
        content (str): Raw text of a .bib file.

    Returns:
        str: The text with every ``@online`` entry header changed to ``@misc``.
    """
    # The lookahead keeps the opening delimiter (and any whitespace before it)
    # out of the match, so only the type name itself is substituted.
    return re.sub(r'@online(?=\s*[{(])', '@misc', content, flags=re.IGNORECASE)

def load_bib_file(file_path):
    """
    Read a .bib file and return its parsed entries.

    The file is read as UTF-8, passed through ``replace_online_with_misc`` and
    then parsed by ``bibtexparser`` using a ``BibTexParser`` created with
    ``common_strings=True``.

    Args:
        file_path (str): Location of the .bib file to read.

    Returns:
        list: The ``entries`` of the parsed BibTeX database.
    """
    # Only the read needs the file handle; parsing works on the in-memory text.
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    normalized_text = replace_online_with_misc(raw_text)
    bib_parser = bibtexparser.bparser.BibTexParser(common_strings=True)
    database = bibtexparser.loads(normalized_text, parser=bib_parser)
    return database.entries

def extract_new_entries(old_entries, new_entries):
    """
    Return the entries of the new file whose ID does not occur in the old file.

    Entries are matched on their ``'ID'`` key only; the order of
    ``new_entries`` is preserved in the result.

    Args:
        old_entries (list): Entries parsed from the old BibTeX file.
        new_entries (list): Entries parsed from the new BibTeX file.

    Returns:
        list: The entries from ``new_entries`` whose ID is absent from
        ``old_entries``.
    """
    # Collect the known IDs in a set so each membership check is a hash lookup.
    known_ids = set()
    for previous in old_entries:
        known_ids.add(previous['ID'])

    unseen = []
    for candidate in new_entries:
        if candidate['ID'] not in known_ids:
            unseen.append(candidate)
    return unseen

def extract_urls_from_entries(entries):
    """
    Collect the ``'url'`` field of every entry that has one.

    Entries without a ``'url'`` key are skipped; the remaining values are
    returned in the order the entries were given.

    Args:
        entries (list): BibTeX entries to inspect.

    Returns:
        list: The ``'url'`` values of the entries that define that key.
    """
    collected = []
    for record in entries:
        if 'url' in record:
            collected.append(record['url'])
    return collected


# Destination for the URLs taken from entries that only the 2024 file contains.
output_file_path = "Malpedia_download_urls.txt"

# Parse both snapshots.
old_entries = load_bib_file(old_bib_path)  # 2022 snapshot
new_entries = load_bib_file(new_bib_path)  # 2024 snapshot

# Keep the entries whose ID is absent from the 2022 snapshot and print how many
# there are.
new_entries_only = extract_new_entries(old_entries, new_entries)
print(len(new_entries_only))

# Collect the 'url' field of those entries and print how many URLs were found.
new_urls = extract_urls_from_entries(new_entries_only)
print(len(new_urls))

# Store the URLs, one per line.
write_urls_to_file(new_urls, output_file_path)

print(f"New URLs have been written to {output_file_path}")

# new_entries_only remains available for further processing, for example
# listing each entry's 'ID' next to entry.get('url', 'No URL').


3959
3959
New URLs have been written to Malpedia_download_urls.txt
