In [1]:
import os 
import re 
import json 

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI

from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file into the process environment so that
# secrets (e.g. API keys) do not have to be written into the notebook itself.
load_dotenv()

True

In [3]:
# Read the law corpus from disk; the file is parsed as UTF-8 encoded JSON.
with open('./data/pakistan_laws.json', mode="r", encoding="utf-8") as laws_file:
    data = json.loads(laws_file.read())

In [4]:
# Sanity-check the loaded corpus: `data` is expected to be a list of dicts,
# each carrying a 'file_name' and the extracted 'text'.
first_item = data[0]
print("items:", len(data))
print("keys of first item:", first_item.keys())
print("file_name:", first_item['file_name'])
print(first_item['text'][:500])  # preview only the first 500 characters

items: 100
keys of first item: dict_keys(['file_name', 'text'])
file_name: administrator00532129aba2e10fe634ab8fbd94c50b.pdf.txt
 
Page 1 of 19  
 
 
 
THE PRIVATISATION COMMISSION ORDINANCE, 2000  
 
 
 
 
 
CONTENTS  
 
 
PART I.—GENERAL  
SECTIONS:  
 
1. Short title, extent and commencement.  
2. Definitions.  
PART II.—PRIVATISATION COMMISSION  
3. Establishment of the Commission.  
4. Location of Office.  
5. Functions and Powers of the Commission.  
PART III. —MANAGEMENT AND ADMINISTRATION  
6. Board of the Commission.  
7. Chairman, Secretary and members.  
8. Meetings of the Board.  
9. Delegation.  
10. Employee


In [7]:
# Matches one or more consecutive literal "\uXXXX" escape sequences.
_UNICODE_ESCAPE_RUN = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')


def _decode_unicode_escape_run(match):
    r"""Turn one run of literal ``\uXXXX`` escapes into the characters they name."""
    literal = match.group(0)
    # The matched run is pure ASCII, so this round-trip cannot corrupt other text.
    decoded = literal.encode("ascii").decode("unicode-escape")
    try:
        # Re-pair UTF-16 surrogate halves (characters written as two escapes).
        return decoded.encode("utf-16", "surrogatepass").decode("utf-16")
    except UnicodeDecodeError:
        # Unpaired surrogate: keep the literal text instead of emitting a
        # character that cannot be encoded later on.
        return literal


def convert_unicode_escape_to_text(text):
    r"""Replace literal ``\uXXXX`` escape sequences in ``text`` with real characters.

    Only the escape sequences themselves are decoded. Characters that are
    already proper Unicode (for example an em dash) are left untouched, which a
    whole-string ``encode("utf-8").decode("unicode-escape")`` would corrupt.

    Args:
        text: The value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The string with escape sequences decoded, or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text
    return _UNICODE_ESCAPE_RUN.sub(_decode_unicode_escape_run, text)


In [5]:
def remove_headers_footers(text):
    """Strip page-number banners and near-empty lines from extracted text.

    Steps, in order:
      1. Every "Page N of M" marker (any letter case) is replaced by a space.
      2. A line is kept only if its stripped form is longer than two
         characters or ends with a full stop.
      3. Runs of two or more newlines are collapsed to a single blank line and
         surrounding whitespace is trimmed.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text.
    """
    without_page_marks = re.sub(r'Page\s*\d+\s*of\s*\d+', ' ', text, flags=re.IGNORECASE)

    kept_lines = []
    for raw_line in without_page_marks.splitlines():
        stripped = raw_line.strip()
        # Drop fragments of two characters or fewer unless they end a sentence.
        if stripped.endswith('.') or len(stripped) > 2:
            kept_lines.append(raw_line)

    collapsed = re.sub(r'\n{2,}', '\n\n', "\n".join(kept_lines))
    return collapsed.strip()

In [12]:
# Only the first MAX_DOCUMENTS entries are carried forward. Slicing before
# cleaning avoids processing documents that would be thrown away afterwards.
MAX_DOCUMENTS = 50

# Each cleaned record keeps the original file name and the text after
# unicode-escape conversion and header/footer removal (applied in that order).
cleaned_data = [
    {
        'file_name': entry['file_name'],
        'text': remove_headers_footers(convert_unicode_escape_to_text(entry['text'])),
    }
    for entry in data[:MAX_DOCUMENTS]
]

In [13]:
# Spot-check the first cleaned record: its file name and the first 800 characters.
first_cleaned = cleaned_data[0]
print(first_cleaned["file_name"])
print(first_cleaned["text"][:800])

administrator00532129aba2e10fe634ab8fbd94c50b.pdf.txt
THE PRIVATISATION COMMISSION ORDINANCE, 2000  
CONTENTS  
PART I.—GENERAL  
SECTIONS:  
1. Short title, extent and commencement.  
2. Definitions.  
PART II.—PRIVATISATION COMMISSION  
3. Establishment of the Commission.  
4. Location of Office.  
5. Functions and Powers of the Commission.  
PART III. —MANAGEMENT AND ADMINISTRATION  
6. Board of the Commission.  
7. Chairman, Secretary and members.  
8. Meetings of the Board.  
9. Delegation.  
10. Employees of the Commission.  
11. Employment of agents, advisers and consultants.  
12. Public Servants.  
13. Disclosure of interest.  
PART IV. —FINANCIAL PROVISIONS  
14. Funds of the Commission.  
15. Expenditure to be charged on the Commission Account.  
16. Privatisation Fund.  
17. Power to obtain finance and r eceive grants.  
18. Inves


Data Ingestion Pipeline

In [14]:
# Wrap every cleaned record in a LangChain Document, keeping the file name as
# the 'source' metadata so answers can be traced back to their origin.
documents = []
for entry in cleaned_data:
    documents.append(
        Document(
            page_content=entry['text'],
            metadata={'source': entry['file_name']},
        )
    )

In [15]:
# Confirm the first Document carries the expected text and metadata.
first_document = documents[0]
print(first_document.page_content[:500])
print(first_document.metadata)

THE PRIVATISATION COMMISSION ORDINANCE, 2000  
CONTENTS  
PART I.—GENERAL  
SECTIONS:  
1. Short title, extent and commencement.  
2. Definitions.  
PART II.—PRIVATISATION COMMISSION  
3. Establishment of the Commission.  
4. Location of Office.  
5. Functions and Powers of the Commission.  
PART III. —MANAGEMENT AND ADMINISTRATION  
6. Board of the Commission.  
7. Chairman, Secretary and members.  
8. Meetings of the Board.  
9. Delegation.  
10. Employees of the Commission.  
11. Employment o
{'source': 'administrator00532129aba2e10fe634ab8fbd94c50b.pdf.txt'}


In [16]:
# Chunking parameters. Sizes are measured with `len`, i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [17]:
# Split every Document into chunks with the splitter configured above.
# The recorded run produced 3708 chunks from the 50 cleaned documents.
docs = text_splitter.split_documents(documents)

In [18]:
# Report how many chunks were produced and preview the first one.
first_chunk = docs[0]
print(len(docs))  # number of chunks
print(first_chunk.page_content[:300])
print(first_chunk.metadata)

3708
THE PRIVATISATION COMMISSION ORDINANCE, 2000  
CONTENTS  
PART I.—GENERAL  
SECTIONS:  
1. Short title, extent and commencement.  
2. Definitions.  
PART II.—PRIVATISATION COMMISSION  
3. Establishment of the Commission.  
4. Location of Office.  
5. Functions and Powers of the Commission.  
PART II
{'source': 'administrator00532129aba2e10fe634ab8fbd94c50b.pdf.txt'}


In [19]:
# Embedding model used to vectorise the chunks.
# NOTE(review): the recorded output echoes this line, which looks like a
# deprecation warning for this import path — confirm and migrate if so.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

  embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')


In [112]:
# The API key is read from the environment (populated from .env by
# load_dotenv) instead of being written into the notebook.
# NOTE(review): a key was previously committed in this cell; treat it as
# leaked and revoke/rotate it.
google_api_key = os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or environment."
    )

# temperature=0 keeps the answers as deterministic as the model allows.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=google_api_key,
)

In [113]:
# Smoke test: ask the bare LLM (no retrieval) a question to verify connectivity.
smoke_test_prompt = "Explain the purpose of the Privatisation Commission Ordinance 2000 in 2 sentences."
response = llm.invoke(smoke_test_prompt)
print(response.content)

The Privatisation Commission Ordinance 2000 established the Privatisation Commission as a statutory body in Pakistan. Its primary purpose was to provide the legal framework and authority for the planning, management, and implementation of the government's privatization program for state-owned enterprises.


In [116]:
# Build the retriever the chain depends on. No earlier cell defines
# `retriever`, so on a fresh kernel this cell previously failed with a
# NameError. The chunks are embedded into a FAISS index and exposed as a
# retriever with its default search settings.
# NOTE(review): if the original retriever used custom search parameters,
# re-apply them here.
vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever()

# "stuff" places all retrieved chunks into a single prompt;
# return_source_documents=True exposes the retrieved chunks alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [117]:
query = "What are the powers of the Privatisation Commission?"

# Use `invoke` rather than calling the chain object directly; the direct call
# is what produced the warning in the recorded output. RetrievalQA takes the
# question under the "query" key.
result = qa.invoke({"query": query})

print("Answer:", result["result"])
# List which source files the retrieved chunks came from.
print("Sources:", [doc.metadata["source"] for doc in result["source_documents"]])

  result = qa(query)


Answer: Based on the provided text, the full list of powers of the Privatisation Commission from Section 5 is not available. However, the document does mention several functions and responsibilities, which imply powers:

*   **Carry out the privatisation programme:** Subject to Cabinet approval, the Commission shall carry out the privatization programme in the prescribed manner (Section 22).
*   **Advertise privatisation:** In consultation with the Federal Government and concerned enterprises, the Commission shall give notice of its intent to privatise (Section 23).
*   **Publicize activities:** Publicize the activities of the privatisation programme.
*   **Propose regulatory framework:** Propose a regulatory framework, including the establishment and strengthening of regulatory authorities, to the Cabinet for independent and fair regulation of each industry sector falling within the purview of the privatisation programme.
*   **Advise on regulatory appointments:** Advise the Federal G