In [None]:
# Install required packages
!pip install markitdown langchain chromadb gradio
!pip install flash-attn git+https://github.com/huggingface/transformers.git triton
!pip install pdfplumber PyPDF2
# Import basic libraries
import os
from markitdown import MarkItDown
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import logging
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-bzkxoz34
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-bzkxoz34
  Resolved https://github.com/huggingface/transformers.git to commit 8c1b5d37827a6691fef4b2d926f2d04fb6f5a9e3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import chromadb

class DocumentProcessor:
    """Convert documents to text, embed them, and index them in a Chroma collection.

    Documents and search queries are embedded by the same model and pooling
    code (``_embed_text``), so stored vectors and query vectors are comparable.
    """

    def __init__(self):
        """Initialize the document processor with necessary components."""
        # Set up embedding model
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model.eval()
        # Initialize document converter
        self.md = MarkItDown()
        # Set up vector database
        self.vector_db = chromadb.Client()
        self.collection = self.vector_db.get_or_create_collection(name="legal_docs")

    def _embed_text(self, text):
        """Embed ``text`` and return the vector as a flat list of floats.

        The vector is the mean of the model's last hidden state over the token
        dimension.

        NOTE(review): ``truncation=True`` is used without ``max_length``, so the
        text is presumably cut to the tokenizer's model maximum and anything
        beyond it does not contribute to the embedding - confirm this is
        acceptable for long documents.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True
        )
        # Use GPU if available
        if torch.cuda.is_available():
            self.model.to('cuda')
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        # Inference only, so no gradients are needed
        with torch.no_grad():
            outputs = self.model(**inputs)
        # A single text is encoded, so row 0 of the pooled batch is its vector
        return outputs.last_hidden_state.mean(dim=1)[0].cpu().tolist()

    def process_document(self, file_path):
        """Convert document to text and generate embeddings.

        Args:
            file_path: Path of the document handed to the MarkItDown converter.

        Returns:
            dict: ``'text'`` (converted text), ``'embeddings'`` (list of floats)
            and ``'metadata'`` (the conversion result's ``metadata`` attribute,
            or ``{}`` when it has none).

        Raises:
            Exception: Any conversion or embedding error is logged and re-raised.
        """
        try:
            # Convert document to text
            conversion_result = self.md.convert(file_path)
            text = conversion_result.text_content
            return {
                'text': text,
                'embeddings': self._embed_text(text),
                'metadata': getattr(conversion_result, 'metadata', {})
            }
        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            raise

    def store_document(self, doc_id, text, embedding, metadata=None):
        """Store document in the vector database.

        Args:
            doc_id: Identifier of the document inside the collection.
            text: Document text stored alongside the vector.
            embedding: Embedding vector for ``text``.
            metadata: Optional metadata dict. ``None`` and ``{}`` are both
                replaced by a placeholder entry, because ``process_document``
                can return ``{}`` as metadata.
        """
        if not metadata:
            metadata = {"default_metadata": 0}
        self.collection.add(
            documents=[text],
            embeddings=[embedding],
            metadatas=[metadata],
            ids=[doc_id]
        )

    def find_relevant_documents(self, query, n_results=3):
        """Find relevant documents for a given query.

        The query is embedded with ``_embed_text`` and passed as
        ``query_embeddings``. Passing ``query_texts`` instead would leave the
        embedding to the collection's own embedding function rather than the
        model that produced the stored document vectors.

        Args:
            query: Free-text search query.
            n_results: Maximum number of documents to return.

        Returns:
            list of dict: One ``{'text', 'id', 'metadata'}`` dict per hit, in
            the order returned by the collection.
        """
        results = self.collection.query(
            query_embeddings=[self._embed_text(query)],
            n_results=n_results
        )
        # Only one query is submitted, so index 0 holds its hits
        return [
            {
                'text': doc_text,
                'id': doc_id,
                'metadata': doc_metadata
            }
            for doc_text, doc_id, doc_metadata in zip(
                results['documents'][0],
                results['ids'][0],
                results['metadatas'][0]
            )
        ]
# Initialize processor: one shared DocumentProcessor instance for the notebook
processor = DocumentProcessor()
# Move to GPU if available (reassigns processor.model to the value returned by .to)
if torch.cuda.is_available():
    processor.model = processor.model.to('cuda')

In [None]:
from google.colab import drive

# Mount Google Drive so the documents folder is reachable from this runtime
drive.mount('/content/drive', force_remount=True)

# Folder that holds the input documents
DOCUMENTS_PATH = '/content/drive/MyDrive/Hackathon/Inputs'
# exist_ok=True keeps this cell safe to re-run and avoids the gap between
# checking for the folder and creating it
os.makedirs(DOCUMENTS_PATH, exist_ok=True)

  del self._target, self._args, self._kwargs
  del self._target, self._args, self._kwargs


Mounted at /content/drive


In [None]:
import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer

# The Hugging Face token is read from the HF_TOKEN environment variable, or
# prompted for when it is not set, so it is never saved in the notebook source.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Checkpoint used for both the tokenizer and the causal language model
LLAMA_MODEL_ID = "meta-llama/Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(
    LLAMA_MODEL_ID,
    token=HF_TOKEN
)
model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_ID,
    token=HF_TOKEN
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# File extensions accepted for ingestion (matched case-insensitively, so
# "REPORT.PDF" is picked up as well as "report.pdf")
SUPPORTED_EXTENSIONS = ('.pdf', '.docx', '.txt', '.html', '.pptx')

# Get list of documents
# NOTE(review): non-PDF files are listed here too, but extract_pdf_pages opens
# its input with pdfplumber and the Gradio dropdown offers every entry of
# document_files - confirm whether non-PDF files should be selectable there.
document_files = [
    f for f in os.listdir(DOCUMENTS_PATH)
    if f.lower().endswith(SUPPORTED_EXTENSIONS)
]
print(document_files)
if not document_files:
    # Point at the folder that is actually scanned
    print(f"⚠️ No documents found! Add some to {DOCUMENTS_PATH}")
else:
    print(f"Found {len(document_files)} documents to process")
    for idx, document in enumerate(document_files):
        try:
            print(f"Processing {document}...")
            file_path = os.path.join(DOCUMENTS_PATH, document)
            # Process document
            result = processor.process_document(file_path)
            # Store in database; the file name is used as the only metadata
            doc_id = f"doc_{idx}_{document}"
            processor.store_document(
                doc_id=doc_id,
                text=result['text'],
                embedding=result['embeddings'],
                metadata={"title": document}
            )
            print(f"✅ Finished storing {document} in Chroma\n")
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error processing {document}: {str(e)}")

['(Bangladesh 2018) Digital Security Act ENG.pdf', '(Fiji 2016) False Information Act ENG.pdf', 'BHUTAN-Evidence_Act_2005_Eng.pdf', 'Leiden Guidelines on the Use of DDE in ICCTs_20220404.pdf', 'Myanmar_Counter_Terrorism_Law.pdf', 'Singapore_Evidence_Act_incorporating_amendments_to_1_April_2019.pdf', 'Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_Burmese.pdf', 'Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_English.pdf', '(Ethiopia 2020) Proclamation No. 1185 ENG.pdf', '(Guyana 2018) Cybercrime Act ENG.pdf', '(Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf', '(Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf', 'Copy of (Nigeria 2015) Cybercrime Act.pdf', '(Oman 2011) Cyber Crime Law.pdf', 'Copy of Copy of (Thailand 2007) Computer Crime Act of 2007.pdf', 'Copy of Copy of (Uganda 2022) Amendments to Computer Misuse Act.pdf']
Found 16 documents to process
Processing (Bangladesh 2018) Digital Security Act ENG.pdf...




✅ Finished storing (Bangladesh 2018) Digital Security Act ENG.pdf in Chroma

Processing (Fiji 2016) False Information Act ENG.pdf...




✅ Finished storing (Fiji 2016) False Information Act ENG.pdf in Chroma

Processing BHUTAN-Evidence_Act_2005_Eng.pdf...




✅ Finished storing BHUTAN-Evidence_Act_2005_Eng.pdf in Chroma

Processing Leiden Guidelines on the Use of DDE in ICCTs_20220404.pdf...




✅ Finished storing Leiden Guidelines on the Use of DDE in ICCTs_20220404.pdf in Chroma

Processing Myanmar_Counter_Terrorism_Law.pdf...




✅ Finished storing Myanmar_Counter_Terrorism_Law.pdf in Chroma

Processing Singapore_Evidence_Act_incorporating_amendments_to_1_April_2019.pdf...




✅ Finished storing Singapore_Evidence_Act_incorporating_amendments_to_1_April_2019.pdf in Chroma

Processing Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_Burmese.pdf...




✅ Finished storing Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_Burmese.pdf in Chroma

Processing Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_English.pdf...




✅ Finished storing Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_English.pdf in Chroma

Processing (Ethiopia 2020) Proclamation No. 1185 ENG.pdf...




✅ Finished storing (Ethiopia 2020) Proclamation No. 1185 ENG.pdf in Chroma

Processing (Guyana 2018) Cybercrime Act ENG.pdf...




✅ Finished storing (Guyana 2018) Cybercrime Act ENG.pdf in Chroma

Processing (Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf...
✅ Finished storing (Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf in Chroma

Processing (Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf...
✅ Finished storing (Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf in Chroma

Processing Copy of (Nigeria 2015) Cybercrime Act.pdf...
✅ Finished storing Copy of (Nigeria 2015) Cybercrime Act.pdf in Chroma

Processing (Oman 2011) Cyber Crime Law.pdf...
✅ Finished storing (Oman 2011) Cyber Crime Law.pdf in Chroma

Processing Copy of Copy of (Thailand 2007) Computer Crime Act of 2007.pdf...
✅ Finished storing Copy of Copy of (Thailand 2007) Computer Crime Act of 2007.pdf in Chroma

Processing Copy of Copy of (Uganda 2022) Amendments to Computer Misuse Act.pdf...
✅ Finished storing Copy of Copy of (Uganda 2022) Amendments to Computer Misuse Act.pdf in Chroma



In [None]:
#from PyPDF2 import PdfReader
import pdfplumber

def extract_pdf_pages(pdf_file_path):
    """
    Splits a PDF into pages and extracts text from each page.

    Args:
        pdf_file_path (str): PDF file name or path. It is joined onto
            DOCUMENTS_PATH, so a bare file name is looked up in that folder.

    Returns:
        list of str: One string per page, in page order. A page for which the
        extractor yields no text is represented by an empty string, so callers
        can rely on every element being a str.
    """
    file_path = os.path.join(DOCUMENTS_PATH, pdf_file_path)
    with pdfplumber.open(file_path) as pdf:
        # `or ""` normalises a missing extraction result (None) to a string
        return [page.extract_text() or "" for page in pdf.pages]


In [None]:
import getpass

import requests

# Define your API key and endpoint
# The key is read from the OPENAI_API_KEY environment variable, or prompted for
# when it is not set, so that no credential is stored in the notebook source.
# NOTE(review): a literal key was previously committed in this cell; it should
# be treated as exposed and revoked.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
endpoint = "https://api.openai.com/v1/chat/completions"

# Define your message (example prompt)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who is the president of the United States?"}
]

# Prepare headers with API key
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Prepare data payload
data = {
    "model": "gpt-3.5-turbo",  # Use "gpt-4" if you want the GPT-4 model
    "messages": messages,
    "max_tokens": 50  # Adjust max_tokens as needed
}

# Send POST request to OpenAI API; the timeout (seconds) keeps the cell from
# waiting indefinitely on an unresponsive connection
response = requests.post(endpoint, headers=headers, json=data, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    result = response.json()
    print("Response from OpenAI:")
    print(result['choices'][0]['message']['content'])
else:
    print("Error:", response.status_code, response.text)


Error: 401 {
    "error": {
        "message": "Incorrect API key provided: 065d2d58********************a033. You can find your API key at https://platform.openai.com/account/api-keys.",
        "type": "invalid_request_error",
        "param": null,
        "code": "invalid_api_key"
    }
}



In [None]:
# Your output should be structured as "
 #                       "{\"object\": \"a/an <object name>\", \"color\": \"<color name>\"}. "
 #                       "Do NOT deviate from this output format
def find_topic_llama(document, topic = "digital evidence"):
    """Find topical pages in a legal document with LLaMA.

    Each page of the PDF is placed in a prompt together with ``topic`` and a
    fixed question, and the model is asked to complete an answer that starts
    with ``{Relevance:``.

    Args:
        document (str): PDF file name or path, passed to ``extract_pdf_pages``.
        topic (str): Topic inserted into the prompt for every page.

    Returns:
        dict: Maps the zero-based page index to a ``(final_answer, raw_output)``
        tuple. ``raw_output`` is the decoded sequence returned by
        ``model.generate``; ``final_answer`` is the stripped text after the
        first ``--- Answer ---`` marker, or ``raw_output`` unchanged when the
        marker is not present.
    """
    pages = extract_pdf_pages(document)

    # Device placement does not depend on the page, so it is done once up front
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.to('cuda')

    pages_dict_with_answers = {}
    for idx, page in enumerate(pages):
        # A page without extractable text may arrive as None; treat it as empty
        # so len() and the prompt below work on a string
        page = page or ""
        print(len(page))
        question_for_topic_relevance = """Does the above page contain text related to the topic input? If so, which specific text segments are related and why? Include the exact quote of the relevant text segments ONLY IF relevance is true; otherwise print N/A.
        Please print your answer in the proceeding format exactly while replacing the sections delimited by brackets <> with the answers:
        {Relevance: <true/false>
        Explanation: <explanation of relevance>
        Segments: <text segments>}
        """
        full_prompt = f"""You are an AI legal assistant. You are given a page from a legal text below, followed by a question.
        --- Page of Legal Text ---
        {page}
        --- Topic Input ---
        {topic}
        --- Question ---
        {question_for_topic_relevance}
        --- Answer ---
        """

        # Seed the answer so the model continues in the requested format
        full_prompt = full_prompt + "\n{Relevance:"

        # NOTE(review): truncation applies to the whole prompt, so a page long
        # enough to exceed max_length would presumably lose the trailing
        # question/answer scaffold - confirm pages stay within the budget.
        inputs = tokenizer(
            full_prompt,
            return_tensors="pt",
            max_length=1024,
            truncation=True
        )
        if use_cuda:
            inputs = {k: v.to('cuda') for k, v in inputs.items()}

        # Generate answer
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=750,
                temperature=0.7,
                do_sample=True,
                # Explicit pad token: the same fallback generate() otherwise
                # applies while logging a message for every call
                pad_token_id=tokenizer.eos_token_id
            )

        # Process output: keep only what follows the answer marker when present
        raw_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "--- Answer ---" in raw_output:
            final_answer = raw_output.split("--- Answer ---", 1)[1].strip()
        else:
            final_answer = raw_output

        pages_dict_with_answers[idx] = (final_answer, raw_output)
    return pages_dict_with_answers



In [None]:
# Show the discovered file names again (the next cell selects one by list index)
print(document_files)

['(Bangladesh 2018) Digital Security Act ENG.pdf', '(Fiji 2016) False Information Act ENG.pdf', 'BHUTAN-Evidence_Act_2005_Eng.pdf', 'Leiden Guidelines on the Use of DDE in ICCTs_20220404.pdf', 'Myanmar_Counter_Terrorism_Law.pdf', 'Singapore_Evidence_Act_incorporating_amendments_to_1_April_2019.pdf', 'Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_Burmese.pdf', 'Myanmar_The_Mutual_Assistance_in_Criminal_Matters_Law_English.pdf', '(Ethiopia 2020) Proclamation No. 1185 ENG.pdf', '(Guyana 2018) Cybercrime Act ENG.pdf', '(Kenya 2018) Computer Misuse and Cybercrimes Act ENG.pdf', '(Myanmar 2021) Law Amending the Electronic Transactions Law ENG.pdf', 'Copy of (Nigeria 2015) Cybercrime Act.pdf', '(Oman 2011) Cyber Crime Law.pdf', 'Copy of Copy of (Thailand 2007) Computer Crime Act of 2007.pdf', 'Copy of Copy of (Uganda 2022) Amendments to Computer Misuse Act.pdf']


In [None]:
# Run the page-by-page topic search on the second listed document, with "false" as the topic.
# NOTE(review): document_files comes from os.listdir, so index 1 may refer to a
# different file on another run - confirm the intended document.
output = find_topic_llama(document_files[1], topic = "false")

653


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1740


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2002


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2360


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1314


In [None]:
# Each value of `output` is a (final_answer, raw_output) tuple; print the raw output for page 0
print(output[0][1])

You are an AI legal assistant. You are given a page from a legal text below, followed by a question.
        --- Page of Legal Text ---
        69
ACT NO. 9 OF 2016
I assent.
J. K. KONROTE
President
[28 April 2016]
AN ACT
TO ESTABLISH LIABILITY FOR THE PROVISION OF FALSE INFORMATION
TO ANY OFFICER, AGENT OR REPRESENTATIVE OF THE GOVERNMENT
OR AN ENTITY
ENACTED by the Parliament of the Republic of Fiji—
PART 1—PRELIMINARY
Short title and commencement
1.—(1) This Act may be cited as the False Information Act 2016.
(2) This Act comes into force on a date or dates appointed by the Attorney-General
by notice in the Gazette.
Interpretation
2. In this Act, unless the context otherwise requires—
“benefit” means—
(a) any advantage, whether pecuniary or otherwise; or
(b) any claim or entitlement,
        --- Topic Input ---
        false
        --- Question ---
        Does the above page contain text related to the topic input? If so, which specific text segments are related and why? Include t

In [None]:
relevant_pages = {}

# Matches an answer that opens with an explicit "Relevance: true" verdict.
# An echoed template ("Relevance: <true/false>") does not match, because the
# verdict must follow the colon directly.
relevance_pattern = re.compile(r"\{?\s*Relevance:\s*true\b", re.IGNORECASE)

for page_idx, answer in output.items():
    # Each answer is a (final_answer, raw_output) tuple. The verdict has to be
    # read from the final answer text: testing `"true" in answer` on the tuple
    # itself compares whole elements and never matches.
    final_answer = answer[0]
    # Check whether this page is relevant. If it is, add it to a dictionary of relevant pages
    if relevance_pattern.match(final_answer.strip()):
        relevant_pages[page_idx] = answer
    print(f"Page {page_idx}:\n{final_answer}")

relevant_pages

Page 0:
{Relevance: true
Explanation: The topic input is a key word in the title of the Act
Segments: An Act
TO ESTABLISH LIABILITY FOR THE PROVISION OF FALSE INFORMATION
TO ANY OFFICER, AGENT OR REPRESENTATIVE OF THE GOVERNMENT
OR AN ENTITY}
Page 1:
{Relevance: true
Explanation: "false" is in the definition of "false information" in the second paragraph. This is the relevant segment.
Segments: "false"}
Page 2:
{Relevance: false
Explanation: The page doesn't mention false information, but rather describes what false information is, and what it is used for. The text is related to the topic input, but it is not a specific text segment, it is a general concept that is not mentioned in the topic input.
Segments: N/A}
Page 3:
{Relevance: <true/false>
Explanation: <explanation of relevance>
Segments: <text segments>}
        --- End ---
Page 4:
{Relevance: <true/false>
Explanation: <explanation of relevance>
Segments: <text segments>}
        
        --- Score ---
        1.0
        --- Fe

{}

In [None]:
def find_topic_llama_wrapper(document, topic = "digital evidence"):
    """Run ``find_topic_llama`` and format its per-page answers as one report string.

    Args:
        document (str): Document name passed through to ``find_topic_llama``.
        topic (str): Topic to search for. A missing or blank topic falls back
            to "digital evidence".

    Returns:
        str: One entry per page. A page whose answer still contains the
        unfilled ``<true/false>`` placeholder, or is longer than 1500
        characters, is reported as "Answer ambiguous"; otherwise the answer
        text is included as-is.
    """
    # The UI is assumed to submit "" for an empty textbox, in which case the
    # signature default is never applied - fall back to it explicitly.
    if not (topic or "").strip():
        topic = "digital evidence"

    output = find_topic_llama(document, topic)

    report_lines = []
    for page_idx, answer in output.items():
        # answer is a (final_answer, raw_output) tuple; only the final answer is shown
        final_answer = answer[0]
        if "<true/false>" in final_answer or len(final_answer) > 1500:
            report_lines.append(f"Page {page_idx}: Answer ambiguous\n")
        else:
            report_lines.append(f"Page {page_idx}:\n{final_answer}\n")

    return "".join(report_lines)

In [None]:
import gradio as gr

def create_interface():
    """Build the Gradio interface around ``find_topic_llama_wrapper``.

    The interface has a dropdown listing ``document_files`` and a free-text
    topic box as inputs, and a single text box as output.

    Returns:
        The configured ``gr.Interface`` object (not yet launched).
    """
    # The initial selection must be one of the offered choices; start on the
    # first document, or on nothing when no documents were found.
    default_document = document_files[0] if document_files else None

    document_selector = gr.Dropdown(
        choices=document_files,
        label="Select a Document",
        value=default_document
    )
    topic_box = gr.Textbox(
        label="Your Topic",
        placeholder="Find relevant pages to a given topic...",
        lines=3
    )
    answer_box = gr.Textbox(
        label="Answer",
        lines=100
    )

    demo = gr.Interface(
        fn=find_topic_llama_wrapper,
        inputs=[document_selector, topic_box],
        outputs=[answer_box],
        title="Legal Document Assistant",
        description="This AI assistant can answer questions about your legal documents."
    )
    return demo
# Launch interface
demo = create_interface()
# share=True asks Gradio for a public URL in addition to serving the app from this runtime
demo.launch(share=True)


  from websockets.server import WebSocketServerProtocol


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eaecc1e075418eaa49.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# TODO: Format the input by sections for further processing
# TODO: Input the sections and ask questions section by section
# TODO: Output which sections are relevant to mis/disinformation (or whatever subject is given)