In [281]:
import fitz  # PyMuPDF



def extract_text_from_pdf(pdf_path, start_page):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(start_page - 1, len(doc)):  # Adjusting for 0-based index
        page = doc.load_page(page_num)
        text += page.get_text()
    return text





In [282]:
# Extract text starting from page 14
ipc = extract_text_from_pdf("H:\RV\Work\py\Similarity\IPC.pdf",14)
bnss = extract_text_from_pdf("H:\RV\Work\py\Similarity\BNSS.pdf",14)
bpc = extract_text_from_pdf("H:\RV\Work\py\Similarity\BPC.pdf",15)



In [283]:
bpc[:500]

' \n15 \n \nREVISED ORDINANCES OF THE BRITISH INDIAN OCEAN \nTERRITORY \n \nTHE PENAL CODE 1981 \n \nCHAPTER C.2 \n \n \nAn Ordinance to amend and consolidate the Code of Criminal Law. \n \n \nPART I \n \nGENERAL PROVISIONS \n \nCHAPTER I \n \nPRELIMINARY \n \n \nCitation. \n \n1. \nThis Ordinance may be cited as the Penal Code 1981, ROBIOT c.C.2., \nand within this Ordinance it is referred to as “this Code”. \n \nDeleted on revision. \n \n2. \nDeleted on revision. \n \nSaving of certain laws. \n \n3. \nNothing in this Code shall af'

In [284]:
import re


def preprocess_text(text):
    
  
     # Remove unnecessary white spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Repeal sections marked with "rep." or "[Repealed.]"
    text = re.sub(r'Section \d+.*?\s*rep\..*?(?=Section|$)', '', text, flags=re.DOTALL)
    text = re.sub(r'Section \d+\.\s*\[Repealed\.\].*?(?=Section|$)', '', text, flags=re.DOTALL)
    
    text = re.sub(r'\d+\*', '', text)
    
    # Remove square brackets but keep the content inside
    text = re.sub(r'\[(.*?)\]', r'\1', text)
    
    
    # Replace terms
    text = re.sub(r'Code of Criminal Procedure \(Amendment\) Act \(\d+\)', '#', text)
    text = re.sub(r'Indian Penal Code', '$', text)
    
     # Replace all numbering
    text = re.sub(r'\(\w*\)|\(\d+\w*\)|\d+\w*|\w\.', '', text)
    text = re.sub(r'-+', '', text)
    
    return text

In [285]:
def preprocess_text_bnss(text):
    
  
     # Remove unnecessary white spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Repeal sections marked with "rep." or "[Repealed.]"
    text = re.sub(r'Section \d+.*?\s*rep\..*?(?=Section|$)', '', text, flags=re.DOTALL)
    text = re.sub(r'Section \d+\.\s*\[Repealed\.\].*?(?=Section|$)', '', text, flags=re.DOTALL)

    text = re.sub(r'\d+\*', '', text)
    
    text = re.sub(r'\d+\.\s*', '', text)

    # Remove the word 'illustration'
    text = re.sub(r'\billustration\b', '', text, flags=re.IGNORECASE)
    
    
#     # Remove square brackets but keep the content inside
#     text = re.sub(r'\[(.*?)\]', r'\1', text)
    
    
    # Replace terms
    text = re.sub(r'Bharatiya Nagarik Suraksha Sanhita, 2023', '#', text)
    text = re.sub(r' Bharatiya Nyaya Sanhita, 2023', '$', text)
    
    # Replace all numbering
    text = re.sub(r'\(\w*\)|\(\d+\w*\)|\d+\w*|\w\.', '', text)
    text = re.sub(r'-+', '', text)
    
    return text


In [286]:
def preprocess_text_bpc(text):
    
  
     # Remove unnecessary white spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Repeal sections marked with "rep." or "[Repealed.]"
    text = re.sub(r'Section \d+.*?\s*rep\..*?(?=Section|$)', '', text, flags=re.DOTALL)
    text = re.sub(r'Section \d+\.\s*\[Repealed\.\].*?(?=Section|$)', '', text, flags=re.DOTALL)

    text = re.sub(r'\d+\*', '', text)
    
    text = re.sub(r'\d+\.\s*', '', text)

    # Remove the word 'illustration'
    text = re.sub(r'\billustration\b', '', text, flags=re.IGNORECASE)
    
    
#     # Remove square brackets but keep the content inside
#     text = re.sub(r'\[(.*?)\]', r'\1', text)
    
    
    # Replace all numbering
    text = re.sub(r'\(\w*\)|\(\d+\w*\)|\d+\w*|\w\.', '', text)
    text = re.sub(r'-+', '', text)
    
    return text


In [287]:
def write_text_to_file(filename, string_to_write):
    # Open a file in write mode
    file = open(filename, "w")

    file.write(string_to_write)

    # Close the file
    file.close()

In [288]:
bpc[:10000]

' \n15 \n \nREVISED ORDINANCES OF THE BRITISH INDIAN OCEAN \nTERRITORY \n \nTHE PENAL CODE 1981 \n \nCHAPTER C.2 \n \n \nAn Ordinance to amend and consolidate the Code of Criminal Law. \n \n \nPART I \n \nGENERAL PROVISIONS \n \nCHAPTER I \n \nPRELIMINARY \n \n \nCitation. \n \n1. \nThis Ordinance may be cited as the Penal Code 1981, ROBIOT c.C.2., \nand within this Ordinance it is referred to as “this Code”. \n \nDeleted on revision. \n \n2. \nDeleted on revision. \n \nSaving of certain laws. \n \n3. \nNothing in this Code shall affect – \n \n(a) subject to section 326 of this Code, the liability, trial or \npunishment of a person for an offence against the English \ncommon law or against any law in force in the Territory other \nthan this Code;  \n \n(b) the liability of a person to be tried or punished for an offence \nunder the provisions of any law in force in the Territory relating \nto the jurisdiction of the courts of the Territory in respect of acts \ndone beyond the ordinary 

In [289]:
ipc_preprocess = preprocess_text(ipc)

In [290]:
bnss_preprocess = preprocess_text_bnss(bnss)

In [291]:
bpc_preprocess = preprocess_text_bpc(bpc)

In [292]:
write_text_to_file('H:\\RV\\Work\\py\\Similarity\\ipc.txt',ipc)
write_text_to_file('H:\\RV\\Work\\py\\Similarity\\bnss.txt',bnss)
write_text_to_file('H:\\RV\\Work\\py\\Similarity\\bpc.txt',bpc)

write_text_to_file('H:\\RV\\Work\\py\\Similarity\\ipc_preprocess.txt',ipc_preprocess)
write_text_to_file('H:\\RV\\Work\\py\\Similarity\\bnss_preprocess.txt',bnss_preprocess)
write_text_to_file('H:\\RV\\Work\\py\\Similarity\\bpc_preprocess.txt',bpc_preprocess)

In [293]:
bpc_preprocess[:6000]

'  REVISED ORDINANCES OF THE BRITISH INDIAN OCEAN TERRITORY THE PENAL CODE  CHAPTER  An Ordinance to amend and consolidate the Code of Criminal La PART I GENERAL PROVISIONS CHAPTER I PRELIMINARY Citatio This Ordinance may be cited as the Penal Code , ROBIOT , and within this Ordinance it is referred to as “this Code”. Deleted on revisio Deleted on revisio Saving of certain law Nothing in this Code shall affect –  subject to section  of this Code, the liability, trial or punishment of a person for an offence against the English common law or against any law in force in the Territory other than this Code;  the liability of a person to be tried or punished for an offence under the provisions of any law in force in the Territory relating to the jurisdiction of the courts of the Territory in respect of acts done beyond the ordinary jurisdiction of such courts;  the power of any court to punish a person for contempt of such Court;  the liability or trial of a person, or the punishment of a p

In [294]:
def tokenize_sentences(text):
    # Split text into sentences using regular expressions
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

def find_matching_sentences(sentences1, sentences2):
    # Convert second list of sentences to a set for faster lookup
    sentences2_set = set(sentences2)
    
    # Find matches and unmatched sentences
    matches = []
    unmatched = []
    for sentence in sentences1:
        if sentence in sentences2_set:
            matches.append(sentence)
        else:
            unmatched.append(sentence)
    
    return matches, unmatched

def calculate_match_percentage(total_sentences, matched_sentences):
    if total_sentences == 0:
        return 0.0
    return (len(matched_sentences) / total_sentences) * 100

In [304]:

bnss_sentences = tokenize_sentences(bnss_preprocess)
ipc_sentences = tokenize_sentences(ipc_preprocess)



In [306]:
# Find matching sentences
matching_sentences, unmatched_sentences = find_matching_sentences(bnss_sentences, ipc_sentences)


# Calculate match percentage
match_percentage = calculate_match_percentage(len(bnss_sentences), matching_sentences)

print(f"Total sentences in BNSS: {len(bnss_sentences)}")
print(f"Matching sentences: {len(matching_sentences)}")
print(f"Match percentage: {match_percentage:.2f}%")

Total sentences in BNSS: 77
Matching sentences: 0
Match percentage: 0.00%


In [297]:
matching_sentences = find_matching_sentences(bnss_sentences, ipc_sentences)


In [298]:
print("\nUnmatched Sentences:")
for sentence in unmatched_sentences:
#    print(f"- {sentence}")
    continue


Unmatched Sentences:


In [299]:
for sentence in matching_sentences:
#    print(f"- {sentence}")
    continue

In [303]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(doc1, doc2):
    # Initialize the TF-IDF Vectorizer
    vectorizer = CountVectorizer(stop_words="english")
    
    # Fit and transform the documents into TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    
    # Compute the cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    
    return similarity_matrix[0][0]

    

similarity = compute_cosine_similarity(ipc_preprocess, bnss_preprocess)
print(f"Cosine Similarity between ipc and bnss: {similarity}")

similarity = compute_cosine_similarity(ipc_preprocess, bpc_preprocess)
print(f"Cosine Similarity between ipc and bpc: {similarity}")

similarity = compute_cosine_similarity(bnss_preprocess, bpc_preprocess)
print(f"Cosine Similarity between bnss and bpc: {similarity}")

Cosine Similarity between ipc and bnss: 0.9362999309766589
Cosine Similarity between ipc and bpc: 0.6831075012241392
Cosine Similarity between bnss and bpc: 0.6647842693681364


In [301]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(doc1, doc2):
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words="english")
    
    # Fit and transform the documents into TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    
    # Compute the cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    return similarity_matrix[0][0]

similarity = compute_cosine_similarity(ipc_preprocess, bnss_preprocess)
print(f"Cosine tfidf Similarity between ipc and bnss: {similarity}")

similarity = compute_cosine_similarity(ipc_preprocess, bpc_preprocess)
print(f"Cosine tfidf Similarity between ipc and bpc: {similarity}")

similarity = compute_cosine_similarity(bnss_preprocess, bpc_preprocess)
print(f"Cosine tfidf Similarity between bnss and bpc: {similarity}")


Cosine tfidf Similarity between ipc and bnss: 0.9221233253341317
Cosine tfidf Similarity between ipc and bpc: 0.6718575891025383
Cosine tfidf Similarity between bnss and bpc: 0.6482083092861881
