In [51]:
import re
import os
import json

# One <citphrase ...>...</citphrase> element. The lookahead requires the tag
# name to end right there (whitespace or '>'), so the attribute list may be
# empty while longer tag names are still rejected.
_CITPHRASE_RE = re.compile(
    r'<citphrase(?=[\s>])([^>]*)>(.*?)</citphrase>', re.DOTALL
)
# One attribute: name = "double quoted" | 'single quoted' | bare token.
# Quoted alternatives keep embedded whitespace and allow empty values.
_ATTR_RE = re.compile(
    r'''(\w+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))'''
)


def extract_citphrases(xml_content):
    """Extract every ``<citphrase>`` element from a string of XML-like text.

    The text is scanned with regular expressions rather than an XML parser,
    so it does not have to be well-formed.

    Args:
        xml_content: The document text to scan.

    Returns:
        A list with one dict per element, in document order. Each dict has a
        ``'text'`` key holding the element body with surrounding whitespace
        stripped, plus one key per attribute found on the opening tag.
        Attribute values are returned without their quotes; quoted values keep
        any embedded whitespace. An attribute literally named ``text``
        overwrites the body text.
    """
    citphrases = []
    for tag_match in _CITPHRASE_RE.finditer(xml_content):
        attrs, text = tag_match.groups()
        citphrase = {'text': text.strip()}
        for attr_match in _ATTR_RE.finditer(attrs):
            key = attr_match.group(1)
            # Exactly one of the three value alternatives took part in the
            # match; the others are None.
            value = next(
                group for group in attr_match.group(2, 3, 4)
                if group is not None
            )
            citphrase[key] = value
        citphrases.append(citphrase)

    return citphrases

def read_file_with_fallback_encoding(file_path):
    """Read a text file, trying UTF-8 first and falling back to Latin-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content as a string, or ``None`` if no encoding in
        the list could decode it.

    Raises:
        OSError: If the file cannot be opened or read. Only decode errors
            are handled here.
    """
    # 'latin-1' maps every possible byte to a character, so it never raises
    # UnicodeDecodeError and acts as the catch-all. 'iso-8859-1' is an alias
    # of the same codec, so listing it after 'latin-1' had no effect.
    encodings = ('utf-8', 'latin-1')
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    # Kept for the caller's contract; only reachable if the catch-all
    # encoding is ever removed from the list above.
    return None

def process_xml_files(input_folder, output_folder):
    """Convert the citphrases of every ``.xml`` file in a folder to JSON.

    Each ``<name>.xml`` directly inside ``input_folder`` is read, its
    citphrases are extracted, and the result is written to
    ``<name>.json`` in ``output_folder``. Progress and a final summary are
    printed to stdout. A failure on one file is reported and counted, and
    processing continues with the next file.

    Args:
        input_folder: Directory containing the source ``.xml`` files.
        output_folder: Directory for the generated ``.json`` files; created
            (including parents) if it does not exist.

    Raises:
        OSError: If ``output_folder`` cannot be created or ``input_folder``
            cannot be listed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    total_files = 0
    cleaned_files = 0
    error_files = 0

    # os.listdir returns entries in arbitrary order; sort so that the log
    # output is reproducible between runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.xml'):
            continue

        total_files += 1
        input_path = os.path.join(input_folder, filename)
        # Drop the 4-character '.xml' suffix and replace it with '.json'.
        output_path = os.path.join(output_folder, filename[:-4] + '.json')

        # Per-file boundary: any failure is reported and counted so that one
        # bad file does not abort the whole batch.
        try:
            xml_content = read_file_with_fallback_encoding(input_path)
            if xml_content is None:
                print(f"Failed to read file with supported encodings: {filename}")
                error_files += 1
                continue

            citphrases = extract_citphrases(xml_content)

            if citphrases:
                with open(output_path, 'w', encoding='utf-8') as file:
                    json.dump(citphrases, file, indent=2, ensure_ascii=False)
                print(f"Processed and saved as JSON: {filename}")
                cleaned_files += 1
            else:
                # Files without citphrases produce no output file and are
                # included in the error count of the summary.
                print(f"No citphrases found in: {filename}")
                error_files += 1
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            error_files += 1

    print("\nProcessing complete. Summary:")
    print(f"Total files processed: {total_files}")
    print(f"Successfully cleaned files: {cleaned_files}")
    print(f"Files with errors: {error_files}")

def main():
    """Run the citphrase extraction on the ``citations_summ`` corpus.

    The dataset is located relative to the current working directory: the
    ``AustLII-Legal-Case-Report`` folder is expected two levels above it.
    """
    # Walk two directory levels up from the working directory.
    one_level_up = os.path.dirname(os.getcwd())
    two_levels_up = os.path.dirname(one_level_up)

    dataset_root = os.path.join(two_levels_up, "AustLII-Legal-Case-Report", "dataset")
    source_dir = os.path.join(dataset_root, "corpus", "citations_summ")
    target_dir = os.path.join(dataset_root, "cleaned_dataset", "citations_summ")

    process_xml_files(source_dir, target_dir)

# Run the conversion only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Processed and saved as JSON: 06_1.xml
No citphrases found in: 06_100.xml
Processed and saved as JSON: 06_1001.xml
Processed and saved as JSON: 06_1004.xml
Processed and saved as JSON: 06_1005.xml
No citphrases found in: 06_1006.xml
No citphrases found in: 06_1015.xml
Processed and saved as JSON: 06_1017.xml
Processed and saved as JSON: 06_1018.xml
Processed and saved as JSON: 06_102.xml
Processed and saved as JSON: 06_1021.xml
Processed and saved as JSON: 06_1022.xml
Processed and saved as JSON: 06_1023.xml
Processed and saved as JSON: 06_1026.xml
Processed and saved as JSON: 06_1027.xml
Processed and saved as JSON: 06_1028.xml
Processed and saved as JSON: 06_1029.xml
No citphrases found in: 06_1032.xml
Processed and saved as JSON: 06_1033.xml
Processed and saved as JSON: 06_1041.xml
Processed and saved as JSON: 06_1042.xml
Processed and saved as JSON: 06_1043.xml
Processed and saved as JSON: 06_1044.xml
Processed and saved as JSON: 06_1045.xml
Processed and saved as JSON: 06_1046.xml
