In [3]:
import os

def remove_duplicates_from_file(input_filepath, output_filepath):
    """
    Copy the distinct, non-blank lines of a text file into a new file.

    Each line is stripped of leading/trailing whitespace before it is
    compared, so "Alavi2016a" and "Alavi2016a " count as the same entry.
    Blank lines are dropped and the surviving lines are written in sorted
    order, one per line. Both files are opened as UTF-8.

    Problems are reported with print() rather than raised: if reading fails,
    a message is printed and nothing is written; if writing fails, a message
    is printed.

    Args:
        input_filepath (str): The path to the input file.
        output_filepath (str): The path to the output file.

    Returns:
        None
    """
    # Collect the stripped lines in a set; blank lines are filtered out.
    try:
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            stripped = (raw.strip() for raw in infile)
            distinct = {text for text in stripped if text}
    except FileNotFoundError:
        print(f"Error: The input file '{input_filepath}' was not found.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    # A set has no order, so sort to make the output deterministic.
    ordered = list(distinct)
    ordered.sort()

    # Emit one entry per line, then report success.
    try:
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.writelines(f"{text}\n" for text in ordered)

        print(f"Successfully removed duplicates. Unique lines saved to '{output_filepath}'.")
    except Exception as e:
        print(f"An error occurred while writing to the output file: {e}")

if __name__ == "__main__":
    # Source list of citation keys and the destination for its deduplicated copy.
    input_file, output_file = "citation_keys.txt", "unique_citation_keys.txt"

    # Deduplicate the key list.
    remove_duplicates_from_file(input_filepath=input_file, output_filepath=output_file)


Successfully removed duplicates. Unique lines saved to 'unique_citation_keys.txt'.


In [2]:
# remove_duplicates.py

def remove_duplicates(input_file, output_file):
    """
    Write the distinct, non-blank lines of ``input_file`` to ``output_file``.

    Lines are stripped of surrounding whitespace before comparison, blank
    lines are discarded, and the result is written in sorted order with one
    entry per line. Both files are opened as UTF-8. Errors from open()/read/
    write are not caught here and propagate to the caller.

    Args:
        input_file (str): Path of the file to read.
        output_file (str): Path of the file to (over)write.
    """
    # Load every line up front, already stripped.
    with open(input_file, 'r', encoding='utf-8') as src:
        stripped = [raw.strip() for raw in src]

    # The set drops duplicates; sorting gives a stable order
    # (drop the sort if original order should be preserved instead).
    distinct = sorted({text for text in stripped if text})

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(text + '\n' for text in distinct)

if __name__ == "__main__":
    # Deduplicate the citation keys into a separate output file.
    remove_duplicates(
        input_file="citation_keys.txt",
        output_file="unique_citation_keysGPT.txt",
    )


In [4]:
# compare_citations.py

def find_removed(collection_file, used_file, removed_output):
    """
    List the citations from the collection that are absent from the used set.

    Both input files are read as UTF-8, one citation per line; lines are
    stripped and blank lines ignored. Every collection entry that does not
    appear in ``used_file`` is written to ``removed_output`` (one per line),
    in the order it appears in the collection. Entries repeated in the
    collection are written once per occurrence. A confirmation message is
    printed at the end.

    Args:
        collection_file (str): Path to the full list of citations.
        used_file (str): Path to the list of citations still in use.
        removed_output (str): Path of the plain-text file to (over)write.
    """
    # Full collection, kept as a list so the original order survives.
    with open(collection_file, 'r', encoding='utf-8') as src:
        collection = list(filter(None, map(str.strip, src)))

    # Used citations as a set for constant-time membership tests.
    with open(used_file, 'r', encoding='utf-8') as src:
        in_use = set(filter(None, map(str.strip, src)))

    missing = [entry for entry in collection if entry not in in_use]

    with open(removed_output, 'w', encoding='utf-8') as dst:
        dst.writelines(entry + "\n" for entry in missing)

    print(f"✅ Removed citations saved to: {removed_output}")

if __name__ == "__main__":
    # Compare the full key list against the keys still in use.
    find_removed(
        collection_file="citation_keys.txt",
        used_file="used_citations.txt",
        removed_output="removed_citations.txt",
    )


✅ Removed citations saved to: removed_citations.txt


In [None]:
# highlight_removed_citations.py

def highlight_removed(all_file, used_file, output_file):
    """
    Write an HTML list of all citations, with the unused ones shown in red.

    Both input files are read as UTF-8, one citation per line; lines are
    stripped and blank lines ignored. Every entry of ``all_file`` becomes one
    ``<li>`` in ``output_file``, in file order. Entries that do not appear in
    ``used_file`` are wrapped in a red ``<span>``.

    Citation text is HTML-escaped before it is written, so entries containing
    ``<``, ``>``, ``&`` or quotes are displayed literally instead of
    corrupting the markup.

    Args:
        all_file (str): Path to the full list of citations.
        used_file (str): Path to the list of citations still in use.
        output_file (str): Path of the HTML file to (over)write.
    """
    # Imported locally so this cell stays runnable on its own.
    from html import escape

    with open(all_file, 'r', encoding='utf-8') as f:
        all_citations = [text for text in (line.strip() for line in f) if text]

    with open(used_file, 'r', encoding='utf-8') as f:
        used_citations = {text for text in (line.strip() for line in f) if text}

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("<html><body><h2>Citation Comparison</h2>\n<ul>\n")
        for citation in all_citations:
            safe_text = escape(citation)
            # Test against the set directly: O(1) per entry, instead of
            # scanning an intermediate "removed" list for every citation.
            if citation not in used_citations:
                f.write(f"<li><span style='color:red'>{safe_text}</span></li>\n")
            else:
                f.write(f"<li>{safe_text}</li>\n")
        f.write("</ul></body></html>")

if __name__ == "__main__":
    # NOTE(review): the other cells in this notebook read "citation_keys.txt";
    # confirm that "all_citations.txt" is the intended input here.
    highlight_removed(
        all_file="all_citations.txt",
        used_file="used_citations.txt",
        output_file="comparison.html",
    )
