In [17]:
import os
import re
import json
from bs4 import BeautifulSoup
from collections import defaultdict
from collections import Counter

In [18]:
def get_download_files(folder_paths):
    """Collect every ``.download`` file located beneath the given folders.

    Each entry of ``folder_paths`` is walked recursively with ``os.walk``.
    The result is a list of paths (directory joined with file name) for all
    files whose name ends in ``.download``, in the order the walk yields them.
    """
    return [
        os.path.join(current_dir, name)
        for folder in folder_paths
        for current_dir, _, names in os.walk(folder)
        for name in names
        if name.endswith('.download')
    ]

def extract_html_title(file_path):
    """Return the text of the first ``<title>`` element of an HTML file.

    The file is opened as UTF-8 text and parsed with BeautifulSoup's
    ``html.parser``.

    Returns
    -------
    str or None
        The whitespace-stripped title text, ``''`` when the document has no
        ``<title>`` tag, or ``None`` when the file cannot be decoded as UTF-8
        or any other error occurs (other errors are also printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            document = BeautifulSoup(handle, 'html.parser')
        node = document.find('title')
        if node:
            return node.get_text(strip=True)
        return ''
    except UnicodeDecodeError:
        # Not valid UTF-8 text: the caller records it as skipped, silently.
        return None
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None


def normalize_title(title):
    """Return a comparison key for ``title``.

    The text is lower-cased, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r'\s+', ' ', title.lower())
    return collapsed.strip()

def find_duplicate_titles(file_paths):
    """Group HTML files by normalized title and keep the groups with duplicates.

    Returns a ``(duplicates, skipped_files)`` tuple:

    * ``duplicates`` maps each normalized title shared by more than one file
      to a list of ``{'original_title', 'file_path'}`` dicts.
    * ``skipped_files`` lists the paths for which ``extract_html_title``
      returned ``None``.

    Files whose normalized title is empty are neither grouped nor skipped.

    Note: a later cell of this notebook defines another function with this
    same name (operating on BibTeX text); running that cell rebinds the name.
    """
    entries_by_title = defaultdict(list)
    skipped_files = []

    for path in file_paths:
        raw_title = extract_html_title(path)
        if raw_title is None:
            skipped_files.append(path)
            continue

        key = normalize_title(raw_title)
        if not key:
            continue
        entries_by_title[key].append({
            'original_title': raw_title,
            'file_path': path
        })

    duplicates = {}
    for key, entries in entries_by_title.items():
        if len(entries) > 1:
            duplicates[key] = entries
    return duplicates, skipped_files


def write_json_report(duplicates, output_file='duplicate_titles_report.json'):
    """Serialize ``duplicates`` to ``output_file`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). A
    confirmation line naming the output file is printed afterwards.
    """
    with open(output_file, 'w', encoding='utf-8') as report:
        json.dump(duplicates, report, indent=2, ensure_ascii=False)

    print(f"\nJSON report saved to {output_file}")

def main(folder_paths=None,
         report_file='duplicate_titles_report.json',
         skipped_file='skipped_non_html_files.json'):
    """Scan folders for ``.download`` files and report duplicated HTML titles.

    Parameters
    ----------
    folder_paths : list of str, optional
        Folders to scan recursively. When omitted, the two Malpedia download
        folders originally hard-coded in this notebook are used.
    report_file : str, optional
        Destination of the duplicate-title JSON report.
    skipped_file : str, optional
        Destination of the JSON list of files that could not be parsed.

    Called without arguments, the function behaves exactly as the original
    parameterless version did.
    """
    if folder_paths is None:
        # NOTE(review): machine-specific absolute paths, kept only as the
        # backward-compatible default; pass ``folder_paths`` to scan elsewhere.
        folder_paths = [
            r'C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\documents',
            r'C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_malpedia_downloads',
            #r'C:\Users\ricewater\Documents\CTIDownloads\downloads\downloads\20241008_downloads'
        ]

    download_files = get_download_files(folder_paths)
    print(f"Found {len(download_files)} .download files...")

    duplicates, skipped = find_duplicate_titles(download_files)

    # "Parsed" means every file for which a title (possibly empty) was obtained.
    print(f"\n Parsed HTML files: {len(download_files) - len(skipped)}")
    print(f" Skipped non-HTML/binary files: {len(skipped)}")
    print(f" Duplicate title groups found: {len(duplicates)}")

    write_json_report(duplicates, report_file)

    # Log the skipped files so they can be inspected separately.
    with open(skipped_file, 'w', encoding='utf-8') as f:
        json.dump(skipped, f, indent=2)
    print(f"Skipped files saved to {skipped_file}")



if __name__ == '__main__':
    main()


Found 14983 .download files...

 Parsed HTML files: 13879
 Skipped non-HTML/binary files: 1104
 Duplicate title groups found: 362

JSON report saved to duplicate_titles_report.json
Skipped files saved to skipped_non_html_files.json


In [19]:
## For MITRE dataset

Found 920 .download files...

 Parsed HTML files: 782
 Skipped non-HTML/binary files: 138
 Duplicate title groups found: 122

## For Malpedia dataset (part of it)

Found 3620 .download files...

 Parsed HTML files: 3428
 Skipped non-HTML/binary files: 192
 Duplicate title groups found: 94


## For Malpedia dataset (complete)
Found 14983 .download files...

 Parsed HTML files: 13879
 Skipped non-HTML/binary files: 1104
 Duplicate title groups found: 362




SyntaxError: invalid syntax (3741036836.py, line 3)

In [20]:
def read_bib_file(file_path):
    """
    Load a .bib file into memory.

    Parameters
    ----------
    file_path : str
        Location of the .bib file; it is opened as UTF-8 text.

    Returns
    -------
    str
        The complete file contents as one string.
    """
    with open(file_path, 'r', encoding='utf-8') as bib_handle:
        contents = bib_handle.read()
    return contents

def find_duplicate_titles(bib_text):
    """
    Identifies duplicate titles in a BibTeX string.

    Only fields named exactly ``title`` (case-insensitive) whose value is
    wrapped in double braces, i.e. ``title = {{...}}``, are considered.
    Titles are compared after lower-casing, collapsing every run of
    whitespace to one space, and stripping the ends, so a title wrapped
    over several lines matches its single-line form.

    Note: an earlier cell of this notebook defines an HTML-based function
    with the same name; running this cell rebinds the name to this version.

    Parameters
    ----------
    bib_text : str
        Contents of the .bib file.

    Returns
    -------
    dict
        A dictionary mapping each normalized title that occurs more than
        once to its number of occurrences.
    """
    # The leading \b stops fields such as ``booktitle`` or ``shorttitle``
    # from being picked up as titles; braces are escaped to make it explicit
    # that they are matched literally.
    raw_titles = re.findall(
        r"\btitle\s*=\s*\{\{([^}]+)\}\}", bib_text, flags=re.IGNORECASE
    )

    # Normalize: collapse whitespace (incl. line breaks), strip, lowercase
    normalized_titles = (
        re.sub(r"\s+", " ", raw).strip().lower() for raw in raw_titles
    )

    # Count occurrences, ignoring titles that are empty after normalization
    title_counts = Counter(title for title in normalized_titles if title)

    # Keep only titles seen more than once
    return {title: count for title, count in title_counts.items() if count > 1}

In [21]:
# Example usage: read a .bib file and collect the titles that occur more than once.
# NOTE(review): absolute, machine-specific Windows path — adjust before re-running elsewhere.
file_path = r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\malpedia-db_2024-11-12.bib"
bib_content = read_bib_file(file_path)
# Dict of normalized title -> occurrence count, limited to titles seen more than once.
duplicate_titles = find_duplicate_titles(bib_content)

# Only the header line is printed here; the entries themselves are printed by a later cell.
print("Duplicate titles and their counts:")

Duplicate titles and their counts:


In [22]:
# Number of distinct titles that appear more than once in the .bib file.
len(duplicate_titles)

463

In [23]:
# Print every duplicated title followed by its occurrence count, one per line.
for title, count in duplicate_titles.items():
   print(f"{title} - {count}")

apt-k-47 organization launches espionage attacks using a new trojan tool - 2
tracking subaat: targeted phishing attack leads to threat actor’s repository - 2
clean ursa - 2
granite taurus - 2
trident ursa - 2
diplomats beware: cloaked ursa phishing with a twist - 2
fin7 power hour: adversary archaeology and the evolution of fin7 - 2
feodo tracker - 2
biotech research firm miltenyi biotec hit by ransomware, data leaked - 2
privileges and credentials: phished at the request of counsel - 2
kimsuky - 2
lookout uncovers android spyware deployed in kazakhstan - 2
threat actor targeting hong kong pro-democracy figures - 2
deep analysis agent tesla malware - 2
an analysis of infrastructure linked to the hagga threat actor - 3
cyber deterrence in action? a story of one long hurricane panda campaign - 2
ani-shell - 2
icedid analysis - 2
operation bitter biscuit - 2
rancor: targeted attacks in south east asia using plaintee and ddkong malware families - 2
axiom - 3
lazarus group - 2
putter panda 