# Read PDF Policies

In [1]:
import os
import re
import pandas as pd
from pdf2image import convert_from_path
import pytesseract

# Set up Tesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Function to clean and preprocess text
def clean_text(text):
    text = re.sub(r'\n+', '\n', text)  # Remove extra newlines
    text = re.sub(r'[^a-zA-Z0-9\s.,]', '', text)  # Remove unwanted characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    pages = convert_from_path(pdf_path)
    text = ""
    for page in pages:
        page_text = pytesseract.image_to_string(page)
        cleaned_text = clean_text(page_text)
        text += cleaned_text + "\n"
    return text

# Function to preprocess and extract policy text from PDF files
def preprocess_policies(pdf_directory):
    data = []
    for pdf_file in os.listdir(pdf_directory):
        if pdf_file.endswith('.pdf'):
            policy_text = extract_text_from_pdf(os.path.join(pdf_directory, pdf_file))
            data.append({
                "policy_name": pdf_file,
                "text": policy_text,
            })
    return pd.DataFrame(data)

# Example usage
pdf_directory = "C:\\Users\\Admin\\Desktop\\Polices project"
policy_df = preprocess_policies(pdf_directory)

# Display the preprocessed data
print(policy_df.head())


         policy_name                                               text
0  BISP_ACT_2010.pdf  REGISTERED No 302 L.7646 EXTRAORDINARY PUBLISH...


In [2]:
from transformers import pipeline

# Create a summarization pipeline (it can handle sentence tokenization)
summarizer = pipeline("summarization")

text = "This is a sentence. And here is another one."
# This is a hack; you might want to use a different approach to tokenize if you want precise control.
sentences = text.split('. ')  # Simple split for demonstration
print(sentences)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.



['This is a sentence', 'And here is another one.']


# Data Cleaning

In [3]:
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load pre-trained tokenizer and model for token classification (NER model can help distinguish meaningful words)
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

# Set up a pipeline for token classification (used here as a form of identifying important tokens)
nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

# Function to clean and preprocess text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove unwanted characters (punctuation, numbers, special characters)
    return text.strip()

# Function to extract meaningful words using transformers (token classification pipeline)
def extract_meaningful_words(text):
    tokens = nlp(text)  # Token classification for named entities (proper nouns, etc.)
    meaningful_words = [token['word'] for token in tokens if token['entity_group'] in ['ORG', 'PER', 'LOC', 'MISC']]  # Keep important entities
    return ' '.join(meaningful_words)

# Function to clean and normalize policies in DataFrame
def normalize_policy_text_with_transformer(policy_df):
    policy_df['normalized_text'] = policy_df['text'].apply(lambda x: extract_meaningful_words(clean_text(x)))
    return policy_df

# Example usage
# Assuming `policy_df` is your DataFrame with policies
normalized_policy_df = normalize_policy_text_with_transformer(policy_df)

# Display normalized data
print(normalized_policy_df[['policy_name', 'normalized_text']].head())


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


         policy_name                                    normalized_text
0  BISP_ACT_2010.pdf  ##lamabad ##lamabad islam pakistan pakistan ##...


In [4]:
from sentence_transformers import SentenceTransformer

# Initialize the model
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to clean, tokenize, and embed text in DataFrame
def clean_tokenize_and_embed(policy_df):
    # Step 1: Clean and extract meaningful words
    policy_df['normalized_text'] = policy_df['text'].apply(lambda x: extract_meaningful_words(clean_text(x)))

    # Step 2: Tokenize into sentences
    policy_df['sentences'] = policy_df['normalized_text'].apply(lambda x: x.split('.'))

    # Step 3: Generate embeddings for each sentence
    policy_df['embeddings'] = policy_df['sentences'].apply(lambda sents: sentence_model.encode(sents))

    return policy_df


# Example usage
# Assuming `policy_df` is your DataFrame with policies
normalized_and_embedded_policy_df = clean_tokenize_and_embed(policy_df)

# Display a sample with embeddings
print(normalized_and_embedded_policy_df[['policy_name', 'sentences', 'embeddings']].head())



         policy_name                                          sentences  \
0  BISP_ACT_2010.pdf  [##lamabad ##lamabad islam pakistan pakistan #...   

                                          embeddings  
0  [[-0.0040787472, 0.08183558, -0.0054995907, 0....  


# Tokenization and Embadding

In [5]:
# Check the size of the DataFrame
print("Number of rows in policy_df:", len(policy_df))

# Clean, tokenize, and embed the policy text
normalized_and_embedded_policy_df = clean_tokenize_and_embed(policy_df)

# Print the first few rows of the processed DataFrame for inspection
print(normalized_and_embedded_policy_df[['policy_name', 'sentences', 'embeddings']].head())


Number of rows in policy_df: 1
         policy_name                                          sentences  \
0  BISP_ACT_2010.pdf  [##lamabad ##lamabad islam pakistan pakistan #...   

                                          embeddings  
0  [[-0.0040787472, 0.08183558, -0.0054995907, 0....  


# Text Embedding for Similarity Matching

In [6]:
# Ensure there are enough rows before trying to access
if len(policy_df) > 0:
    # Clean, tokenize, and embed the policy text
    normalized_and_embedded_policy_df = clean_tokenize_and_embed(policy_df)

    # Loop through the first 3 rows (or fewer if the DataFrame has less than 3 rows)
    for i in range(min(3, len(normalized_and_embedded_policy_df))):
        # Print the embeddings for the current row's sentences
        print(normalized_and_embedded_policy_df.iloc[i]['embeddings'])
else:
    print("policy_df is empty, cannot access rows.")


[[-4.07874724e-03  8.18355829e-02 -5.49959065e-03  2.32169330e-02
  -9.24994610e-03  5.15686981e-02  1.07717179e-01 -3.99912633e-02
   4.23175916e-02 -4.27789837e-02  2.36954144e-03 -1.82064977e-02
   9.62767452e-02 -2.63014324e-02 -1.66686326e-02  6.74588652e-03
  -3.04104518e-02 -7.29664182e-03  2.81517906e-03 -1.29142091e-01
  -7.13488534e-02  8.80614296e-02  5.23661636e-02  1.51333250e-02
  -7.58751761e-03  3.17883166e-03 -1.23406539e-03 -4.04453687e-02
  -1.15408683e-02 -2.49783359e-02 -5.01463236e-03  1.60265081e-02
   4.38063070e-02  8.84050597e-03 -3.42901936e-03  2.65517849e-02
  -5.75184859e-02  5.86089678e-02  6.17638091e-03 -2.86539607e-02
   7.88919777e-02 -4.52125855e-02  4.03611809e-02  1.00598177e-02
  -9.23297554e-03  5.77678904e-02 -9.26566198e-02 -1.61950439e-02
  -1.03548123e-02  6.88983649e-02 -1.57269333e-02 -1.33875366e-02
   4.16765586e-02 -7.09482282e-03  6.43926039e-02 -1.34035498e-01
  -1.55991232e-02 -7.82631561e-02  7.57582160e-03  4.59944382e-02
  -2.31375

In [7]:
# Ensure the policies are being read correctly
print("Files in directory:", os.listdir(pdf_directory))

# Extract text from a sample PDF to verify
sample_pdf = os.path.join(pdf_directory, os.listdir(pdf_directory)[0])
sample_text = extract_text_from_pdf(sample_pdf)
print("Sample text from PDF:", sample_text)

# Clean, tokenize, and embed the sample text using the previous function
sample_df = pd.DataFrame([{'policy_name': os.listdir(pdf_directory)[0], 'text': sample_text}])
normalized_and_embedded_sample_df = clean_tokenize_and_embed(sample_df)

# Check the tokenization and embedding process for the sample
print("Tokenized sentences:", normalized_and_embedded_sample_df.iloc[0]['sentences'])
print("Tokenized text embeddings:", normalized_and_embedded_sample_df.iloc[0]['embeddings'])


Files in directory: ['BISP_ACT_2010.pdf', 'fine_tuned_gpt2', 'flask', 'Gen Ai.ipynb', 'Gen pipline.txt', 'logs', 'policy_embeddings.json', 'results', 'Untitled-1.ipynb', 'val_policies.txt']
Sample text from PDF: registered no  

l
extraordinary
published by authority

islamabad wednesday august  

part i
acts ordinances presidents orders and regulations
senate secretariat
islamabad the th august 

the following act of majliseshoora parliament received the assent of
the president on th august  is hereby published for general information

act no xviii of 

an act to provide for establishment of the benazir income support
programme

whereas it is expedient to provide for establishment of benazir income
support programme and to regulate its affairs and matters connected therewith or
incidental thereto

and whereas it is desirable to provide financial assistance and other social
protection and safety net measures to economically distressed persons and families

and whereas under the princip

# Embeddings and policies saved to policy_embeddings.json

In [8]:
import json
import os

# Path to store the embeddings and policy text
output_file = "policy_embeddings.json"

# Assuming we have a list of policies from the directory
policies = os.listdir(pdf_directory)
embeddings = []  # List to store the embeddings

# Extract, clean, tokenize, and embed text for all PDFs
for policy_file in policies:
    policy_path = os.path.join(pdf_directory, policy_file)
    
    if policy_file.endswith(".pdf"):
        # Extract text from the PDF
        policy_text = extract_text_from_pdf(policy_path)
        
        # Create a DataFrame for this single policy for further processing
        sample_df = pd.DataFrame([{'policy_name': policy_file, 'text': policy_text}])
        
        # Clean, tokenize, and embed the policy text using the previous function
        processed_df = clean_tokenize_and_embed(sample_df)
        
        # Get the tokenized sentences and embeddings
        tokenized_sentences = processed_df.iloc[0]['sentences']
        policy_embedding = processed_df.iloc[0]['embeddings']
        
        # Store the embeddings, tokenized sentences, and policy text
        embeddings.append({
            'policy_file': policy_file,
            'policy_text': policy_text,
            'tokenized_sentences': tokenized_sentences,
            'embedding': policy_embedding.tolist()  # Convert tensor/array to list for JSON storage
        })

# Save embeddings and policy text to a JSON file
with open(output_file, 'w') as f:
    json.dump(embeddings, f)

print(f"Embeddings and policies saved to {output_file}")


Embeddings and policies saved to policy_embeddings.json


#  Set Up a Similarity Matching System

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to retrieve the most similar policies based on query text
def retrieve_similar_policies(query_text, top_k=3):
    # Clean and tokenize the query text before embedding
    query_df = pd.DataFrame([{'policy_name': 'query', 'text': query_text}])
    processed_query_df = clean_tokenize_and_embed(query_df)
    
    # Get the embeddings for the query text (after processing)
    query_embedding = processed_query_df.iloc[0]['embeddings']

    # Ensure query_embedding is 2D (shape: 1, embedding_dim)
    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)

    # Load stored policy embeddings from JSON file
    with open(output_file, 'r') as f:
        policy_data = json.load(f)

    # Extract embeddings from policy_data
    policy_embeddings = np.array([np.array(item['embedding']) for item in policy_data])

    # Ensure policy_embeddings is 2D
    if policy_embeddings.ndim == 3:
        policy_embeddings = policy_embeddings.squeeze(1)

    # Calculate cosine similarity between the query and stored policies
    similarities = cosine_similarity(query_embedding, policy_embeddings)[0]

    # Get top K most similar policies based on cosine similarity
    top_k_indices = similarities.argsort()[-top_k:][::-1]

    # Return the most similar policies along with their similarity score
    similar_policies = [(policy_data[idx]['policy_file'], policy_data[idx]['policy_text'], similarities[idx]) for idx in top_k_indices]
    return similar_policies


# Model Training & Fine-Tuning Strategy

In [10]:
policy_text

'registered no  \n\nl\nextraordinary\npublished by authority\n\nislamabad wednesday august  \n\npart i\nacts ordinances presidents orders and regulations\nsenate secretariat\nislamabad the th august \n\nthe following act of majliseshoora parliament received the assent of\nthe president on th august  is hereby published for general information\n\nact no xviii of \n\nan act to provide for establishment of the benazir income support\nprogramme\n\nwhereas it is expedient to provide for establishment of benazir income\nsupport programme and to regulate its affairs and matters connected therewith or\nincidental thereto\n\nand whereas it is desirable to provide financial assistance and other social\nprotection and safety net measures to economically distressed persons and families\n\nand whereas under the principles of policy as given in the constitution of\nthe islamic republic of pakistan the state is obliged to promote social and economic\nwellbeing of the people and to provide basic neces

In [11]:
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Load your stored policy data
output_file = 'policy_embeddings.json'  # Replace with your actual file name
with open(output_file, 'r') as f:
    policy_data = json.load(f)

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Prepare the policy text for fine-tuning
# Here we extract, clean, and tokenize the policy text before passing it to GPT-2
policy_texts = [item['policy_text'] for item in policy_data]

# Create a small DataFrame to process the policies before tokenizing
policy_df = pd.DataFrame([{'policy_name': item['policy_file'], 'text': item['policy_text']} for item in policy_data])

# Use the previous function to clean and tokenize the policies (sentences not needed here for GPT-2 training)
processed_policy_df = policy_df.copy()
processed_policy_df['text'] = policy_df['text'].apply(lambda x: clean_text(x))

# Re-extract the cleaned policy texts
cleaned_policy_texts = processed_policy_df['text'].tolist()

# Create a dataset from the cleaned policy texts
dataset = Dataset.from_dict({"text": cleaned_policy_texts})

# Tokenize the data
def tokenize_function(examples):
    # Tokenize input text
    output = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    # Set labels to the input_ids for loss calculation
    output['labels'] = output['input_ids'].copy()
    return output

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Fine-tuning arguments for GPT-2
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=25,  # Customize based on your needs
    per_device_train_batch_size=10,  # Adjust batch size
    logging_dir='./logs',
    logging_steps=15,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Fine-tune the GPT-2 model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/25 [00:00<?, ?it/s]

{'loss': 2.9433, 'grad_norm': 7.395753860473633, 'learning_rate': 2e-05, 'epoch': 15.0}
{'train_runtime': 502.5009, 'train_samples_per_second': 0.05, 'train_steps_per_second': 0.05, 'train_loss': 2.457630615234375, 'epoch': 25.0}


('fine_tuned_gpt2\\tokenizer_config.json',
 'fine_tuned_gpt2\\special_tokens_map.json',
 'fine_tuned_gpt2\\vocab.json',
 'fine_tuned_gpt2\\merges.txt',
 'fine_tuned_gpt2\\added_tokens.json')

# Generate Policy Content and Provide References

In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer
import numpy as np
import json

# Load GPT-2 tokenizer and model for policy generation
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load Sentence-BERT for embedding retrieval
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to retrieve similar policies using embeddings
def retrieve_similar_policies(query_text, top_k=3):
    # Encode the query text using the Sentence-BERT model
    query_embedding = embedding_model.encode([query_text])

    # Ensure query_embedding is 2D (shape: 1, embedding_dim)
    if query_embedding.ndim == 1:
        query_embedding = query_embedding[np.newaxis, :]

    # Load the stored policy embeddings from file
    with open('policy_embeddings.json', 'r') as f:
        policy_data = json.load(f)
    
    # Extract the policy embeddings and text
    policy_embeddings = np.array([item['embedding'] for item in policy_data])
    policy_texts = [item['policy_text'] for item in policy_data]
    policy_files = [item['policy_file'] for item in policy_data]

    # Compute cosine similarities
    similarities = np.dot(policy_embeddings, query_embedding.T).flatten()

    # Get top K most similar policies
    top_k_indices = similarities.argsort()[-top_k:][::-1]
    similar_policies = [(policy_files[i], policy_texts[i], similarities[i]) for i in top_k_indices]

    return similar_policies

# Function to generate a policy with references
def generate_policy_with_references(prompt):
    # Clean and process the prompt using the previously defined function
    cleaned_prompt = clean_text(prompt)  # Clean the prompt text
    # Generate new policy content based on the cleaned prompt
    inputs = tokenizer(cleaned_prompt, return_tensors='pt')
    outputs = model.generate(inputs['input_ids'], max_length=512)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Retrieve relevant existing policies
    similar_policies = retrieve_similar_policies(cleaned_prompt)

    return generated_text, similar_policies

# Example usage
prompt = "policy regarding income support for families."
generated_policy, references = generate_policy_with_references(prompt)

print("Generated Policy:\n", generated_policy)
print("\nRelevant References:")
for i, (policy_file, policy_text, similarity) in enumerate(references):
    print(f"Top {i+1} reference policy: {policy_file} with similarity {similarity:.4f}")
    print(f"Excerpt: {policy_text[:500]}\n")  # Print the first 500 characters of the policy text


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Policy:
 policy regarding income support for families with children.

The report also found that the government's policy of "reform" has been "unworkable" and that the government's "policy of 'reform' has been 'unworkable'."

The report also found that the government's policy of "reform" has been "unworkable" and that the government's "policy of 'reform' has been 'unworkable'."

The report also found that the government's policy of "reform" has been "unworkable" and that the government's "policy of 'reform' has been 'unworkable'."

The report also found that the government's policy of "reform" has been "unworkable" and that the government's "policy of 'reform' has been 'unworkable'."

The report also found that the government's policy of "reform" has been "unworkable" and that the government's "policy of 'reform' has been 'unworkable'."

The report also found that the government's policy of "reform" has been "unworkable" and that the government's "policy of 'reform' has been 

In [13]:
pip install --upgrade evaluate





In [14]:
import evaluate

# Load ROUGE metric
rouge = evaluate.load('rouge')

# Function to evaluate generated text against reference policies
def evaluate_policy(generated_text, reference_texts):
    # Compute ROUGE scores
    results = rouge.compute(predictions=[generated_text], references=reference_texts)
    return results

# Example usage
# Ensure that references were retrieved before this evaluation
if references:
    reference_texts = [ref[1] for ref in references]  # Extract reference policies' text
    evaluation_results = evaluate_policy(generated_policy, reference_texts)

    # Print the evaluation results
    print("Evaluation Results:", evaluation_results)
else:
    print("No references found for evaluation.")


Evaluation Results: {'rouge1': 0.07240437158469945, 'rouge2': 0.003417634996582365, 'rougeL': 0.06489071038251366, 'rougeLsum': 0.07240437158469945}


In [15]:
import evaluate

# Load BLEU and METEOR metrics
bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')

# Example mock reference for evaluation purposes
mock_reference = "This is an example of a reference policy text."

# Adjust your evaluation function
def evaluate_generated_policy(generated_policy):
    # Use mock references for evaluation
    results_bleu = bleu.compute(predictions=[generated_policy], references=[[mock_reference]])
    results_rouge = rouge.compute(predictions=[generated_policy], references=[[mock_reference]])
    results_meteor = meteor.compute(predictions=[generated_policy], references=[[mock_reference]])
    return results_bleu, results_rouge, results_meteor

# Example usage
evaluation_results = evaluate_generated_policy(generated_policy)
print("Evaluation Results:", evaluation_results)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Evaluation Results: ({'bleu': 0.0, 'precisions': [0.008174386920980926, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 36.7, 'translation_length': 367, 'reference_length': 10}, {'rouge1': 0.012618296529968456, 'rouge2': 0.0, 'rougeL': 0.012618296529968456, 'rougeLsum': 0.012618296529968456}, {'meteor': 0.02976190476190477})


In [16]:
pip install flask

Note: you may need to restart the kernel to use updated packages.


In [17]:
def generate_policy_with_references(input_text):
    # Clean and process the input text
    cleaned_input = clean_text(input_text)  # Clean the input text using the previously defined function
    
    # Generate new policy content based on the cleaned input
    inputs = tokenizer(cleaned_input, return_tensors='pt')
    outputs = model.generate(inputs['input_ids'], max_length=512)
    generated_policy = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Retrieve relevant existing policies using the cleaned input
    references = retrieve_similar_policies(cleaned_input)

    return generated_policy, references


In [18]:
pip install flask --upgrade


Note: you may need to restart the kernel to use updated packages.


In [19]:
import sys

# Get the current Python version
python_version = sys.version
python_version

'3.12.5 (tags/v3.12.5:ff3bc82, Aug  6 2024, 20:45:27) [MSC v.1940 64 bit (AMD64)]'