In [None]:
%pip install -qU langchain-groq
%pip install langchain pinecone-client python-dotenv streamlit
%pip install -U langchain-community
%pip install sentence-transformers
%pip install pinecone-client
%pip install pinecone-client[grpc]
%pip install --upgrade langchain-pinecone
%pip install pymupdf pdfplumber
%pip install xmltodict

Project Description: Legal Document Analysis and Summarization Tool using RAG

In [7]:
import getpass
import os

# Ask for the key only when it is not already set, so re-running this cell
# (or launching the kernel with GROQ_API_KEY exported) does not prompt again.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

# Groq is used as the LLM
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")

In [4]:
# Read a single XML file

import xml.etree.ElementTree as ET

# Function to parse a single XML file and define the structure
def parse_xml(file_path):
    """Best-effort parse of one case XML file.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A dict with the keys "name", "AustLII", "catchphrases" and
        "sentences" on success. If anything goes wrong (malformed XML,
        missing element, unreadable file, ...) a message is printed and
        None is returned instead of raising.
    """
    try:
        root = ET.parse(file_path).getroot()

        # Pull the fields out one by one, in the order they appear in the result.
        case_name = root.find("name").text
        austlii_text = root.find("AustLII").text

        catchphrases = []
        for phrase_node in root.find("catchphrases").findall("catchphrase"):
            catchphrases.append(phrase_node.text)

        sentences = []
        for sentence_node in root.find("sentences").findall("sentence"):
            sentences.append(sentence_node.text)
    except ET.ParseError as err:
        # The file is not well-formed XML.
        print(f"Error parsing {file_path}: {err}")
        return None
    except Exception as err:
        # Anything else, e.g. an expected element is absent or the file cannot be opened.
        print(f"Unexpected error parsing {file_path}: {err}")
        return None

    return {
        "name": case_name,
        "AustLII": austlii_text,
        "catchphrases": catchphrases,
        "sentences": sentences,
    }

# Smoke-test parse_xml on a single corpus file before processing whole folders.
# NOTE(review): the recorded output of this cell shows the file being rejected as
# "not well-formed (invalid token): line 6, column 13", so parse_xml returned None here.
# Presumably the raw XML needs cleaning before ElementTree can read it — confirm against the file.
file_path = r"D:\Rohan\ML\Datasets\legal_corpus_australia\fulltext\06_3.xml"
parsed_data = parse_xml(file_path)
# Shows the parsed dict, or None when parse_xml reported an error.
print(parsed_data)

Error parsing D:\Rohan\ML\Datasets\legal_corpus_australia\fulltext\06_3.xml: not well-formed (invalid token): line 6, column 13
None


In [11]:
# reading and storing legal data

import xml.etree.ElementTree as ET
import os


def parse_fulltext(file_path):
    """Parse one full-text case XML file into a plain dictionary.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A dict with:
          - "name": text of the root's <name> child,
          - "AustLII": text of the root's <AustLII> child,
          - "catchphrases": text of every <catchphrase> under <catchphrases>,
          - "sentences": text of every <sentence> under <sentences>.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        AttributeError: if one of the expected elements is absent
            (find() returns None in that case).
    """
    root = ET.parse(file_path).getroot()

    case_name = root.find("name").text
    austlii_text = root.find("AustLII").text

    catchphrases = []
    for phrase_node in root.find("catchphrases").findall("catchphrase"):
        catchphrases.append(phrase_node.text)

    sentences = []
    for sentence_node in root.find("sentences").findall("sentence"):
        sentences.append(sentence_node.text)

    return {
        "name": case_name,
        "AustLII": austlii_text,
        "catchphrases": catchphrases,
        "sentences": sentences,
    }


def parse_citations_summ(file_path):
    """Parse one citation-summary XML file into a plain dictionary.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A dict with:
          - "name": text of the root's <name> child,
          - "AustLII": text of the root's <AustLII> child,
          - "citphrases": one dict per <citphrase> under <citphrases>, holding
            its "id", "type" and "from" attributes plus its "text",
          - "citances": one dict per <citance> under <citances>, holding its
            "id" and "from" attributes plus its "text",
          - "legistitles": text of every <title> under <legistitles>.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        AttributeError: if one of the expected elements is absent.
        KeyError: if a <citphrase>/<citance> lacks one of the attributes above.
    """
    root = ET.parse(file_path).getroot()

    case_name = root.find("name").text
    austlii_text = root.find("AustLII").text

    citphrases = []
    for phrase_node in root.find("citphrases").findall("citphrase"):
        attrs = phrase_node.attrib
        citphrases.append(
            {
                "id": attrs["id"],
                "type": attrs["type"],
                "from": attrs["from"],
                "text": phrase_node.text,
            }
        )

    citances = []
    for citance_node in root.find("citances").findall("citance"):
        attrs = citance_node.attrib
        citances.append(
            {"id": attrs["id"], "from": attrs["from"], "text": citance_node.text}
        )

    legistitles = []
    for title_node in root.find("legistitles").findall("title"):
        legistitles.append(title_node.text)

    return {
        "name": case_name,
        "AustLII": austlii_text,
        "citphrases": citphrases,
        "citances": citances,
        "legistitles": legistitles,
    }


def parse_citations_class(file_path):
    """Parse one citation-class XML file into a plain dictionary.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A dict with:
          - "name": text of the root's <name> child,
          - "AustLII": text of the root's <AustLII> child,
          - "citations": one dict per <citation> under <citations>, holding
            its "id" attribute and the text of its <class>, <tocase>,
            <AustLII> and <text> children.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        AttributeError: if one of the expected elements is absent.
        KeyError: if a <citation> has no "id" attribute.
    """
    root = ET.parse(file_path).getroot()

    case_name = root.find("name").text
    austlii_text = root.find("AustLII").text

    # Child elements whose text is copied verbatim, in output order.
    child_tags = ("class", "tocase", "AustLII", "text")

    citations = []
    for citation_node in root.find("citations").findall("citation"):
        record = {"id": citation_node.attrib["id"]}
        for tag in child_tags:
            record[tag] = citation_node.find(tag).text
        citations.append(record)

    return {
        "name": case_name,
        "AustLII": austlii_text,
        "citations": citations,
    }


# Helper function to parse all XML files in a folder
def parse_folder(folder_path, parse_function, skip_errors=False):
    """Apply ``parse_function`` to every ``.xml`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).
        parse_function: Callable taking a file path and returning the parsed
            record for that file.
        skip_errors: When False (the default) an ``ET.ParseError`` raised by
            ``parse_function`` propagates, exactly as before. When True the
            offending file is reported with ``print`` and skipped, so one
            malformed file does not abort the whole folder.

    Returns:
        A list with one parsed record per ``.xml`` file, ordered by file name.

    Raises:
        xml.etree.ElementTree.ParseError: for a malformed file, unless
            ``skip_errors`` is True.
        OSError: if ``folder_path`` cannot be listed.
    """
    data = []
    # os.listdir returns names in arbitrary order; sort so that results
    # (and in particular data[0]) are the same on every run and platform.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".xml"):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            data.append(parse_function(file_path))
        except ET.ParseError as e:
            if not skip_errors:
                raise
            print(f"Error parsing {file_path}: {e}")
    return data


# Example usage
# Root folder of the corpus. This is the only path to edit when running on
# another machine; the three sub-folders below are derived from it.
DATASET_DIR = r"D:\Rohan\ML\Datasets\legal_corpus_australia"

fulltext_data = parse_folder(os.path.join(DATASET_DIR, "fulltext"), parse_fulltext)
citations_summ_data = parse_folder(
    os.path.join(DATASET_DIR, "citations_summ"), parse_citations_summ
)
citations_class_data = parse_folder(
    os.path.join(DATASET_DIR, "citations_class"), parse_citations_class
)

# Print sample data (first parsed record from each folder)
print(fulltext_data[0])
print(citations_summ_data[0])
print(citations_class_data[0])

ParseError: not well-formed (invalid token): line 6, column 13 (<string>)