In [525]:
import fitz  # PyMuPDF



def extract_text_from_pdf(pdf_path, start_page):
    """Return the text of a PDF from ``start_page`` through the last page.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to include.

    Returns:
        One string made of each selected page's ``get_text()`` output, in
        page order. Empty when ``start_page`` is past the last page.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
    """
    if start_page < 1:
        # start_page - 1 would be negative, and PyMuPDF resolves negative page
        # numbers from the end of the document, so the result would silently
        # begin with the wrong page.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    doc = fitz.open(pdf_path)
    try:
        # Gather the per-page text and join once rather than growing a string.
        page_texts = [
            doc.load_page(page_num).get_text()
            for page_num in range(start_page - 1, len(doc))  # 0-based index
        ]
        return "".join(page_texts)
    finally:
        # Release the file handle even if a page fails to load.
        doc.close()





In [526]:
# Extract text from each PDF, starting at page 14 (IPC, BNSS) or page 15 (BPC).
# Raw strings keep the Windows backslashes literal; in a normal string "\R",
# "\W", "\p", ... are unrecognised escape sequences that Python warns about.
ipc = extract_text_from_pdf(r"H:\RV\Work\py\Similarity\IPC.pdf", 14)
bnss = extract_text_from_pdf(r"H:\RV\Work\py\Similarity\BNSS.pdf", 14)
bpc = extract_text_from_pdf(r"H:\RV\Work\py\Similarity\BPC.pdf", 15)



In [527]:
import re


def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Line breaks count as whitespace, so multi-line text comes back as one
    line. Leading or trailing whitespace is reduced to one space, not removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # One pass is enough: \s already matches \r and \n, so a separate
    # newline-only substitution beforehand cannot change the result.
    #
    # Further normalisations were tried in this cell and left disabled
    # (dropping repealed sections, stripping square brackets, replacing act
    # names with placeholders, removing numbering and dashes); none of them
    # ever ran, so none is applied here.
    return re.sub(r'\s+', ' ', text)

In [528]:
# Normalise whitespace in each extracted text.
ipc_preprocess = preprocess_text(ipc)
bnss_preprocess = preprocess_text(bnss)
bpc_preprocess = preprocess_text(bpc)

# Dump the raw and the normalised IPC text to disk.
# The files are written inline because write_text_to_file is only defined in a
# later cell, so calling it here raises NameError on a fresh kernel.
for dump_path, dump_text in (
    ('H:\\RV\\Work\\py\\Similarity\\ipc.txt', ipc),
    ('H:\\RV\\Work\\py\\Similarity\\ipc_preprocess.txt', ipc_preprocess),
):
    # Explicit UTF-8 so characters outside the platform code page do not fail.
    with open(dump_path, "w", encoding="utf-8") as dump_file:
        dump_file.write(dump_text)

In [529]:
def convert_to_sentences(text):
    """Split ``text`` on full stops and clean each resulting fragment.

    Each fragment is processed in this order:
      1. Trim surrounding whitespace.
      2. Discard it if it contains no ASCII letter (bare numbers or symbols).
      3. If its first character is not an ASCII letter or digit, drop that
         one character and trim again.
      4. Remove a leading numbering token such as "1)" or "a)", plus any
         punctuation directly after it, and trim again.
      5. Discard it if nothing is left (the fragment was only numbering).

    Args:
        text: The text to split; sentences are assumed to end with ".".

    Returns:
        A list of non-empty cleaned fragments, in their original order.
    """
    # NOTE(review): the second alternative, "\[a-zA-Z]\)", matches only the
    # literal text "[a-zA-Z])" and looks like a typo. Also, step 3 removes an
    # opening "(" or "[" before this pattern runs, so "[1] foo" reaches it as
    # "1] foo" and is not stripped. The pattern is kept as is; confirm the
    # intended numbering formats before changing it.
    numbering_prefix = re.compile(
        r"^(?:\([a-zA-Z]\)|\[a-zA-Z]\)|\d+\)|\(\d+\)|\[\d+\]|\[[a-zA-Z]\]|[a-zA-Z]\)|[a-zA-Z]\()[^\w\s]*"
    )

    sentences = []
    for fragment in text.split('.'):
        fragment = fragment.strip()
        if not re.search('[a-zA-Z]', fragment):
            continue  # only digits/symbols, or empty
        # The fragment holds a letter, so it is non-empty and [0] is safe.
        if re.search('[^a-zA-Z0-9]', fragment[0]):
            fragment = fragment[1:].strip()
        fragment = numbering_prefix.sub('', fragment).strip()
        if fragment:
            # A fragment such as "(a)" is reduced to "" by the step above;
            # keeping it would add an empty sentence to the result.
            sentences.append(fragment)
    return sentences

In [530]:
def write_list_to_file(file, content, encoding="utf-8"):
    """Write each string in ``content`` to ``file``, one item per line.

    The file is created or overwritten. Every item gets a trailing newline.

    Args:
        file: Path of the output file.
        content: Iterable of strings to write.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's default code page.
    """
    # The handle gets its own name; rebinding the `file` parameter to the
    # open handle hides the path for the rest of the function.
    with open(file, "w", encoding=encoding) as handle:
        handle.writelines(item + "\n" for item in content)

In [531]:
# Split each normalised text into its list of cleaned sentence fragments.
ipc_sentences, bnss_sentences, bpc_sentences = map(
    convert_to_sentences,
    (ipc_preprocess, bnss_preprocess, bpc_preprocess),
)

In [532]:
# Rejoin each sentence list into one space-separated string for vectorising.
ipc_list_to_string, bnss_list_to_string, bpc_list_to_string = (
    ' '.join(sentence_list)
    for sentence_list in (ipc_sentences, bnss_sentences, bpc_sentences)
)

In [533]:
# Save each sentence list to its own text file, one sentence per line.
for sentences_path, sentences in (
    ('H:\\RV\\Work\\py\\Similarity\\ipc_sentences.txt', ipc_sentences),
    ('H:\\RV\\Work\\py\\Similarity\\bnss_sentences.txt', bnss_sentences),
    ('H:\\RV\\Work\\py\\Similarity\\bpc_sentences.txt', bpc_sentences),
):
    write_list_to_file(sentences_path, sentences)

In [534]:
def write_text_to_file(filename, string_to_write, encoding="utf-8"):
    """Write ``string_to_write`` to ``filename``, replacing any existing file.

    Args:
        filename: Path of the output file.
        string_to_write: The text to store.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's default code page.
    """
    # The context manager closes the file even when write() raises; the
    # manual open/close pair leaked the handle in that case.
    with open(filename, "w", encoding=encoding) as handle:
        handle.write(string_to_write)

In [535]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(doc1, doc2):
    """Return the cosine similarity of two documents based on raw term counts.

    Both documents are vectorised together with ``CountVectorizer``, using its
    built-in English stop-word list, and the two count vectors are compared.

    NOTE: a later cell defines a TF-IDF function under this same name; once
    that cell has run, the name no longer refers to this count-based version.

    Args:
        doc1: First document, as one string.
        doc2: Second document, as one string.

    Returns:
        The similarity score taken from the 1x1 result matrix.
    """
    count_vectorizer = CountVectorizer(stop_words="english")

    # Row 0 holds the term counts of doc1, row 1 those of doc2.
    count_matrix = count_vectorizer.fit_transform([doc1, doc2])
    doc1_row, doc2_row = count_matrix[0:1], count_matrix[1:2]

    # One row against one row gives a single-cell matrix; unwrap the scalar.
    return cosine_similarity(doc1_row, doc2_row)[0][0]

    

# Report the similarity of every pair of texts. One loop replaces three
# copy-pasted call/print pairs; the printed lines are unchanged.
for left_name, left_text, right_name, right_text in (
    ("ipc", ipc_list_to_string, "bnss", bnss_list_to_string),
    ("ipc", ipc_list_to_string, "bpc", bpc_list_to_string),
    ("bnss", bnss_list_to_string, "bpc", bpc_list_to_string),
):
    similarity = compute_cosine_similarity(left_text, right_text)
    print(f"Cosine Similarity between {left_name} and {right_name}: {similarity}")

Cosine Similarity between ipc and bnss: 0.935987644480314
Cosine Similarity between ipc and bpc: 0.6981428872604466
Cosine Similarity between bnss and bpc: 0.6823067002690311


In [536]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(doc1, doc2):
    """Return the cosine similarity of two documents based on TF-IDF weights.

    Both documents are vectorised together with ``TfidfVectorizer``, using its
    built-in English stop-word list, and the two weight vectors are compared.

    NOTE: this reuses the name of the count-based function defined in an
    earlier cell, so running this cell replaces that version.

    Args:
        doc1: First document, as one string.
        doc2: Second document, as one string.

    Returns:
        The similarity score taken from the 1x1 result matrix.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words="english")

    # Row 0 holds the TF-IDF weights of doc1, row 1 those of doc2.
    weights = tfidf_vectorizer.fit_transform([doc1, doc2])
    doc1_row, doc2_row = weights[0:1], weights[1:2]

    # One row against one row gives a single-cell matrix; unwrap the scalar.
    return cosine_similarity(doc1_row, doc2_row)[0][0]

# Report the TF-IDF similarity of every pair of texts. One loop replaces three
# copy-pasted call/print pairs; the printed lines are unchanged.
for left_name, left_text, right_name, right_text in (
    ("ipc", ipc_list_to_string, "bnss", bnss_list_to_string),
    ("ipc", ipc_list_to_string, "bpc", bpc_list_to_string),
    ("bnss", bnss_list_to_string, "bpc", bpc_list_to_string),
):
    similarity = compute_cosine_similarity(left_text, right_text)
    print(f"Cosine tfidf Similarity between {left_name} and {right_name}: {similarity}")


Cosine tfidf Similarity between ipc and bnss: 0.9178952361605763
Cosine tfidf Similarity between ipc and bpc: 0.6836043617636016
Cosine tfidf Similarity between bnss and bpc: 0.6655935703580458
