In [1]:
# Install required packages
!pip install markitdown langchain chromadb gradio
!pip install flash-attn git+https://github.com/huggingface/transformers.git triton
!pip install pdfplumber PyPDF2
# Import basic libraries
import os
from markitdown import MarkItDown
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import logging
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Collecting markitdown
  Downloading markitdown-0.0.1a3-py3-none-any.whl.metadata (4.8 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting mammoth (from markitdown)
  Downloading mammoth-1.9.0-py2.py3-none-any.whl.metadata (24 kB)
Collecting markdownify (from markitdown)
  Downloading markdownify-0.14.1-py3-none-any.whl.metadata (8.5 kB)
Collecting pathvalidate (from markitdown)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting pdfminer-six (from markitdown)
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting puremagic (from markitdown)
  Downloading puremagic-1.28-py3-none-any.whl.metadata (5.8 kB)
Collecting pydub (from markitdown)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-pptx (from markitdown)
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)




In [2]:
import chromadb

class DocumentProcessor:
    def __init__(self):
        """Initialize the document processor with necessary components."""
        # Set up embedding model
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model.eval()
        # Initialize document converter
        self.md = MarkItDown()
        # Set up vector database
        self.vector_db = chromadb.Client()
        self.collection = self.vector_db.get_or_create_collection(name="legal_docs")
    def process_document(self, file_path):
        """Convert document to text and generate embeddings."""
        try:
            # Convert document to text
            conversion_result = self.md.convert(file_path)
            # Create embeddings
            inputs = self.tokenizer(
                conversion_result.text_content,
                return_tensors="pt",
                truncation=True
            )
            # Use GPU if available
            if torch.cuda.is_available():
                self.model.to('cuda')
                inputs = {k: v.to('cuda') for k, v in inputs.items()}
            # Generate embeddings
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
            return {
                'text': conversion_result.text_content,
                'embeddings': embeddings,
                'metadata': getattr(conversion_result, 'metadata', {})
            }
        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            raise
    def store_document(self, doc_id, text, embedding, metadata=None):
        """Store document in the vector database."""
        if metadata is None:
            metadata = {"default_metadata": 0}
        self.collection.add(
            documents=[text],
            embeddings=[embedding],
            metadatas=[metadata],
            ids=[doc_id]
        )
    def find_relevant_documents(self, query, n_results=3):
        """Find relevant documents for a given query."""
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )
        return [
            {
                'text': doc_text,
                'id': results['ids'][0][i],
                'metadata': results['metadatas'][0][i]
            }
            for i, doc_text in enumerate(results['documents'][0])
        ]
# Initialize processor
processor = DocumentProcessor()
# Move to GPU if available
if torch.cuda.is_available():
    processor.model = processor.model.to('cuda')

  and should_run_async(code)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

  frame = None
  from jax import xla_computation as _xla_computation


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [3]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive', force_remount=True)
# Create documents folder
DOCUMENTS_PATH = '/content/drive/MyDrive/Hackathon/Inputs'
if not os.path.exists(DOCUMENTS_PATH):
    os.makedirs(DOCUMENTS_PATH)

  and should_run_async(code)


Mounted at /content/drive


In [5]:
# Replace with your HuggingFace token
HF_TOKEN = "INSERT_KEY"
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    token=HF_TOKEN
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    token=HF_TOKEN
)


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [10]:
# Get list of documents
document_files = [
    f for f in os.listdir(DOCUMENTS_PATH)
    if f.endswith(('.pdf', '.docx', '.txt', '.html', '.pptx'))
]
print(document_files)
if not document_files:
    print("⚠️ No documents found! Add some to your legal_documents folder")
else:
    print(f"Found {len(document_files)} documents to process")
    for idx, document in enumerate(document_files):
        try:
            print(f"Processing {document}...")
            file_path = os.path.join(DOCUMENTS_PATH, document)
            # Process document
            result = processor.process_document(file_path)
            # Store in database
            doc_id = f"doc_{idx}_{document}"
            # print(result['text'])
            result['metadata'] = {"title": document}
            processor.store_document(
                doc_id=doc_id,
                text=result['text'],
                embedding=result['embeddings'],
                metadata=result['metadata']
            )
            print(f"✅ Finished storing {document} in Chroma\n")
        except Exception as e:
            print(f"Error processing {document}: {str(e)}")

['(Bangladesh 2018) Digital Security Act ENG.pdf', '(Fiji 2016) False Information Act ENG.pdf', '(Oman 2011) Cyber Crime Law.pdf', '(Ethiopia 2020) Proclamation No. 1185 ENG.pdf', '(Guyana 2018) Cybercrime Act ENG.pdf', '(Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf', '(Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf', '(Uganda 2022) Amendments to Computer Misuse Act.pdf', '(Thailand 2007) Computer Crime Act of 2007.pdf', '(Nigeria 2015) Cybercrime Act.pdf', '(Bhutan 2005) Evidence Act ENG.pdf', '(Singapore 2019) Amendments to the Evidence Act ENG.pdf', '(Myanmar 2014) Counter Terrorism Law ENG.pdf', '(Myanmar 2004) Mutual Assistance in Criminal Matters Law BRM.pdf', '(Myanmar 2004) Mutual Assistance in Criminal Matters Law ENG.pdf.pdf', 'Leiden Guidelines on the Use of DDE in ICCTs.pdf']
Found 16 documents to process
Processing (Bangladesh 2018) Digital Security Act ENG.pdf...




✅ Finished storing (Bangladesh 2018) Digital Security Act ENG.pdf in Chroma

Processing (Fiji 2016) False Information Act ENG.pdf...




✅ Finished storing (Fiji 2016) False Information Act ENG.pdf in Chroma

Processing (Oman 2011) Cyber Crime Law.pdf...




✅ Finished storing (Oman 2011) Cyber Crime Law.pdf in Chroma

Processing (Ethiopia 2020) Proclamation No. 1185 ENG.pdf...




✅ Finished storing (Ethiopia 2020) Proclamation No. 1185 ENG.pdf in Chroma

Processing (Guyana 2018) Cybercrime Act ENG.pdf...




✅ Finished storing (Guyana 2018) Cybercrime Act ENG.pdf in Chroma

Processing (Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf...




✅ Finished storing (Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf in Chroma

Processing (Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf...




✅ Finished storing (Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf in Chroma

Processing (Uganda 2022) Amendments to Computer Misuse Act.pdf...
✅ Finished storing (Uganda 2022) Amendments to Computer Misuse Act.pdf in Chroma

Processing (Thailand 2007) Computer Crime Act of 2007.pdf...




✅ Finished storing (Thailand 2007) Computer Crime Act of 2007.pdf in Chroma

Processing (Nigeria 2015) Cybercrime Act.pdf...




✅ Finished storing (Nigeria 2015) Cybercrime Act.pdf in Chroma

Processing (Bhutan 2005) Evidence Act ENG.pdf...




✅ Finished storing (Bhutan 2005) Evidence Act ENG.pdf in Chroma

Processing (Singapore 2019) Amendments to the Evidence Act ENG.pdf...




✅ Finished storing (Singapore 2019) Amendments to the Evidence Act ENG.pdf in Chroma

Processing (Myanmar 2014) Counter Terrorism Law ENG.pdf...




✅ Finished storing (Myanmar 2014) Counter Terrorism Law ENG.pdf in Chroma

Processing (Myanmar 2004) Mutual Assistance in Criminal Matters Law BRM.pdf...




✅ Finished storing (Myanmar 2004) Mutual Assistance in Criminal Matters Law BRM.pdf in Chroma

Processing (Myanmar 2004) Mutual Assistance in Criminal Matters Law ENG.pdf.pdf...




✅ Finished storing (Myanmar 2004) Mutual Assistance in Criminal Matters Law ENG.pdf.pdf in Chroma

Processing Leiden Guidelines on the Use of DDE in ICCTs.pdf...




✅ Finished storing Leiden Guidelines on the Use of DDE in ICCTs.pdf in Chroma



In [11]:
#from PyPDF2 import PdfReader
import pdfplumber

def extract_pdf_pages(pdf_file_path):
    """
    Splits a PDF into pages and extracts text from each page.

    Args:
        pdf_file_path (str): Path to the PDF file.

    Returns:
        list of str: A list of pages as strings.
    """
    pages = []
    file_path = os.path.join(DOCUMENTS_PATH, pdf_file_path)
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            pages.append(page.extract_text())
    return pages


In [12]:
# Your output should be structured as "
 #                       "{\"object\": \"a/an <object name>\", \"color\": \"<color name>\"}. "
 #                       "Do NOT deviate from this output format
def find_topic_llama(document, topic = "digital evidence"):
    """Find topical pages in a legal document with LLaMA. """
    pages = extract_pdf_pages(document)
    pages_dict_with_answers = {}
    for idx, page in enumerate(pages):
        print(len(page))
        question_for_topic_relevance = """Does the above page contain text related to the topic input? If so, which specific text segments are related and why? Include the exact quote of the relevant text segments ONLY IF relevance is true; otherwise print N/A.
        Please print your answer in the proceeding format exactly while replacing the sections delimited by brackets <> with the answers:
        {Relevance: <true/false>
        Explanation: <explanation of relevance>
        Segments: <text segments>}
        """
        full_prompt = f"""You are an AI legal assistant. You are given a page from a legal text below, followed by a question.
        --- Page of Legal Text ---
        {page}
        --- Topic Input ---
        {topic}
        --- Question ---
        {question_for_topic_relevance}
        --- Answer ---
        """

        full_prompt = full_prompt + "\n{Relevance:"
        # Prepare for generation
        if torch.cuda.is_available():
            model.to('cuda')

        inputs = tokenizer(
            full_prompt,
            return_tensors="pt",
            max_length=1024,
            truncation=True
        )
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}

        # Generate answer
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=750,
                temperature=0.7,
                do_sample=True
            )

        # Process output
        raw_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "--- Answer ---" in raw_output:
            final_answer = raw_output.split("--- Answer ---", 1)[1].strip()

            # final_answer = raw_output.split("Relevance:", 1)[1].strip()
        else:
            final_answer = raw_output

        pages_dict_with_answers[idx] = (final_answer, raw_output)
    return pages_dict_with_answers



In [17]:
def find_topic_llama_wrapper(document, topic = "digital evidence"):
    output = find_topic_llama(document, topic)
    output_string = ""
    for page_idx, answer in output.items():
        #     answers = "{Relevance: <true/false>
        #     Why: <explanation of relevance>
        #     Segments: <text segments>}""
        # Check whether this page is relevant. If it is, add it to a dictionary of relevant pages

        if "true" in answer:
            relevant_pages[page_idx] = answer
        if "<true/false>" in answer[0]:
            output_string = output_string + f"Page {page_idx}: Answer ambiguous\n"
        elif len(answer[0]) > 1500:
            output_string = output_string + f"Page {page_idx}: Answer ambiguous\n"
        else:
          output_string = output_string + f"Page {page_idx}:\n{answer[0]}\n"
        # relevant_pages[page_idx] = answer

    return output_string

In [18]:
import gradio as gr

def create_interface():
    demo = gr.Interface(
        fn=find_topic_llama_wrapper,
        inputs=[

            gr.Dropdown(
                choices=document_files,
                label="Select a Document",
                value="Document 1"
            ),
            gr.Textbox(
                label="Your Topic",
                placeholder="Find relevant pages to a given topic...",
                lines=3
            )
        ],
        outputs=[
            gr.Textbox(
                label="Answer",
                lines=100
            )
        ],
        title="Legal Document Assistant",
        description="This AI assistant can answer questions about your legal documents."
    )
    return demo
# Launch interface
demo = create_interface()
demo.launch(share=True)


  from websockets.server import WebSocketServerProtocol


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://810cd1b4b9d2897a8d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


