In [102]:
import pandas as pd
import xml.etree.ElementTree as ET
import re
import os
import html
import json

In [103]:
def clean_xml_citations(xml_content):
    pattern = r'<citation\s+(.*?)>'
    
    def fix_citation(match):
        attributes = match.group(1)
        attributes = attributes.strip('"')
        attr_list = attributes.split()
        fixed_attrs = []
        for attr in attr_list:
            if '=' in attr:
                name, value = attr.split('=', 1)
                value = value.strip('"\'')
                fixed_attrs.append(f'{name}="{value}"')
            else:
                fixed_attrs.append(attr)
        return '<citation ' + ' '.join(fixed_attrs) + '>'
    
    cleaned_xml = re.sub(pattern, fix_citation, xml_content)
    return cleaned_xml

def escape_ampersands(xml_content):
    pattern = r'&(?!(?:amp|lt|gt|apos|quot|#\d+);)'
    return re.sub(pattern, '&amp;', xml_content)


def clean_xml_content(xml_content):
    try:
        # Escape ampersands
        xml_content = escape_ampersands(xml_content)

        # Clean the XML content
        cleaned_xml = clean_xml_citations(xml_content)

        # Parse the cleaned XML
        parser = ET.XMLParser(encoding="utf-8")
        root = ET.fromstring(cleaned_xml, parser=parser)

        return root  # Return the root element for further processing if needed

    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        
        # Print the problematic section of the XML
        lines = cleaned_xml.split('\n')
        error_line = int(str(e).split('line')[1].split(',')[0].strip())
        start_line = max(0, error_line - 3)
        end_line = min(len(lines), error_line + 2)
        
        print("\nProblematic section of the XML file:")
        for i in range(start_line, end_line):
            print(f"{i+1}: {lines[i]}")
            if i + 1 == error_line:
                print(" " * (len(str(i+1)) + 2 + int(str(e).split('column')[1].split(':')[0].strip()) - 1) + "^")

    except Exception as e:
        print(f"An error occurred: {e}")

In [104]:
def xml_to_dict(element):
    result = {}
    if element.items():
        result.update(dict(element.items()))
    if element.text:
        text = element.text.strip()
        if text:
            if element.tag in result:
                # If we already have a dict for this tag, make it a list
                if not isinstance(result[element.tag], list):
                    result[element.tag] = [result[element.tag]]
                result[element.tag].append(text)
            else:
                result[element.tag] = text
    for child in element:
        child_result = xml_to_dict(child)
        if child.tag in result:
            if not isinstance(result[child.tag], list):
                result[child.tag] = [result[child.tag]]
            result[child.tag].append(child_result)
        else:
            result[child.tag] = child_result
    return result

In [105]:
def select_data(parsed_dict):
    selected_data = parsed_dict.get("citations", {}).get("citation", [])
    flattened_data_list = []
    
    # Ensure selected_data is always a list
    if not isinstance(selected_data, list):
        selected_data = [selected_data]
    
    for line in selected_data:
        # Check if line is a dictionary
        if isinstance(line, dict):
            flattened_data = {
                'id': line.get('id', ''),
                'class': line.get('class', {}).get('class', '') if isinstance(line.get('class'), dict) else line.get('class', ''),
                'tocase': line.get('tocase', {}).get('tocase', '') if isinstance(line.get('tocase'), dict) else line.get('tocase', ''),
                'text': line.get('text', {}).get('text', '') if isinstance(line.get('text'), dict) else line.get('text', '')
            }
        else:
            # If line is not a dictionary, create a single-column dataframe with the string
            flattened_data = {'content': str(line)}
        
        flattened_data_list.append(flattened_data)
    
    # Create a DataFrame from the list of flattened dictionaries
    df = pd.DataFrame(flattened_data_list)
    
    return df

In [106]:
def read_file_with_fallback_encoding(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    return None

def process_xml_files(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith('.xml'):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename[:-4] + '.json')
            
            xml_content = read_file_with_fallback_encoding(input_path)
            if xml_content is None:
                continue
            xml_root = clean_xml_content(xml_content)
            parsed_dict = xml_to_dict(xml_root)
            df = select_data(parsed_dict)
            json_data = df.to_json()
            if json_data:
                with open(output_path, 'w', encoding='utf-8') as file:
                    json.dump(json_data, file, indent=2, ensure_ascii=False)
                print(f"Processed and saved as JSON: {filename}")
            else:
                print(f"Failed to process: {filename}")

def main():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(os.path.dirname(current_dir))
    input_folder = os.path.join(parent_dir, "AustLII-Legal-Case-Report", "dataset", "corpus", "citations_class")
    output_folder = os.path.join(parent_dir, "AustLII-Legal-Case-Report", "dataset", "cleaned_dataset", "citations_class")

    process_xml_files(input_folder, output_folder)

In [107]:
main()

Processed and saved as JSON: 06_1.xml
Processed and saved as JSON: 06_1001.xml
Processed and saved as JSON: 06_1004.xml
Processed and saved as JSON: 06_1005.xml
Processed and saved as JSON: 06_1017.xml
Processed and saved as JSON: 06_1019.xml
Processed and saved as JSON: 06_102.xml
Processed and saved as JSON: 06_1021.xml
Processed and saved as JSON: 06_1022.xml
Processed and saved as JSON: 06_1023.xml
Processed and saved as JSON: 06_1027.xml
Processed and saved as JSON: 06_1033.xml
Processed and saved as JSON: 06_1041.xml
Processed and saved as JSON: 06_1042.xml
Processed and saved as JSON: 06_1043.xml
Processed and saved as JSON: 06_1044.xml
Processed and saved as JSON: 06_1045.xml
Processed and saved as JSON: 06_1050.xml
Processed and saved as JSON: 06_1054.xml
Processed and saved as JSON: 06_106.xml
Processed and saved as JSON: 06_1066.xml
Processed and saved as JSON: 06_1069.xml
Processed and saved as JSON: 06_107.xml
Processed and saved as JSON: 06_1071.xml
Processed and saved as

In [114]:
import pandas as pd
import json

# Replace 'your_file.json' with the path to your JSON file
with open('C:/Users/gdbt0/PycharmProjects/AustLII-Legal-Case-Report/dataset/cleaned_dataset/citations_class/06_1.json', 'r') as file:
    data = json.load(file)


print(type(data))
print(data)


<class 'str'>
{"id":{"0":"c0","1":"c1","2":"c2","3":"c3","4":"c4","5":"c5","6":"c6","7":"c7","8":"c8","9":"c9","10":"c10","11":"c11","12":"c12","13":"c13","14":"c14","15":"c15","16":"c16"},"class":{"0":"cited","1":"cited","2":"cited","3":"cited","4":"cited","5":"cited","6":"cited","7":"cited","8":"discussed","9":"discussed","10":"cited","11":"cited","12":"cited","13":"discussed","14":"discussed","15":"discussed","16":"cited"},"tocase":{"0":"Universal Music Australia Pty Ltd v Sharman License Holdings Ltd (2005) 220 ALR 1","1":"Universal Music Australia Pty Ltd v Sharman License Holdings Ltd [2005] FCA 406","2":"Universal Music Australia Pty Ltd v Sharman License Holdings Ltd [2005] FCA 441","3":"Sharman License Holdings Ltd v Universal Music Australia Pty Ltd [2005] FCA 505","4":"Sharman License Holdings Ltd v Universal Music Australia Pty Ltd [2005] FCA 802","5":"D&eacute;cor Corporation Pty Ltd v Dart Industries Inc (1991) 33 FCR 397","6":"Bomanite Pty Ltd v Slatex Corp Australia Pty

In [116]:
# If data is a list of dictionaries
if isinstance(data, list):
    df = pd.DataFrame(data)

# If data is a dictionary
elif isinstance(data, dict):
    # If it's a simple dictionary
    df = pd.DataFrame([data])
    
    # If it's a nested dictionary
    # df = pd.json_normalize(data)

# Display the dataframe
df.head()

Unnamed: 0,id,class,tocase,text
0,c0,cited,Universal Music Australia Pty Ltd v Sharman Li...,2 Wilcox J delivered judgment on the complex i...
1,c1,cited,Universal Music Australia Pty Ltd v Sharman Li...,2 Wilcox J delivered judgment on the complex i...
2,c2,cited,Universal Music Australia Pty Ltd v Sharman Li...,2 Wilcox J delivered judgment on the complex i...
3,c3,cited,Sharman License Holdings Ltd v Universal Music...,2 Wilcox J delivered judgment on the complex i...
4,c4,cited,Sharman License Holdings Ltd v Universal Music...,2 Wilcox J delivered judgment on the complex i...


In [117]:
df.iloc[0]["text"]

"2 Wilcox J delivered judgment on the complex issues of liability arising in the primary proceedings on 5 September 2005 ( Universal Music Australia Pty Ltd v Sharman License Holdings Ltd (2005) 220 ALR 1). In the meantime, Ms Hemming had filed two disclosure affidavits pursuant to Wilcox J's orders of 22 March 2005 whilst Sharman License and Sharman Networks had unsuccessfully sought several stays on various grounds of that same order insofar as it applied to them (see Universal Music Australia Pty Ltd v Sharman License Holdings Ltd [2005] FCA 406 per Hely J, delivered 8 April 2005; Universal Music Australia Pty Ltd v Sharman License Holdings Ltd [2005] FCA 441 per Wilcox J, delivered 15 April 2005 and Sharman License Holdings Ltd v Universal Music Australia Pty Ltd [2005] FCA 505 per Moore J, delivered 28 April 2005). Disclosure affidavits were eventually sworn on behalf of Sharman License and Sharman Networks by Mr Gee on 19 April 2005, which were later superseded by further affidav