In [66]:
import fitz  # PyMuPDF



def extract_text_from_pdf(pdf_path, start_page, end_page=""):
    """Return the concatenated text of a range of pages from a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            The default ``""`` (or ``None``) reads through the last page.

    Returns:
        The ``get_text()`` output of every page in the range, joined in
        page order.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
    """
    if start_page < 1:
        # A start below 1 would turn into a negative 0-based index, which is
        # not a page this 1-based interface can name.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    # The context manager closes the document even if a page fails to load.
    with fitz.open(pdf_path) as doc:
        if end_page == "" or end_page is None:
            end_page = len(doc)
        # Callers count pages from 1; load_page counts from 0.
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(start_page - 1, end_page)
        )





In [67]:
# Extract text from each PDF through its last page, starting at page 14 for
# IPC and BNSS and at page 15 for BPC. Raw strings keep the Windows
# backslashes literal instead of relying on unrecognised escapes ("\p", "\M").
ipc = extract_text_from_pdf(r"C:\RV\tempdata\python\ML\Similarity\IPC.pdf", 14)
bnss = extract_text_from_pdf(r"C:\RV\tempdata\python\ML\Similarity\BNSS.pdf", 14)
bpc = extract_text_from_pdf(r"C:\RV\tempdata\python\ML\Similarity\BPC.pdf", 15)



In [68]:
import re


def preprocess_text(text):
    """Collapse line breaks and repeated whitespace in ``text`` to single spaces.

    Runs of carriage returns / newlines are replaced by one space, then every
    run of whitespace is reduced to one space. Leading or trailing whitespace
    is collapsed to a single space rather than removed.
    """
    # Line breaks first, then any remaining whitespace runs.
    for whitespace_pattern in (r'[\r\n]+', r'\s+'):
        text = re.sub(whitespace_pattern, ' ', text)

    # The cleanup steps below are currently disabled; kept for reference.
    #
    # Repeal sections marked with "rep." or "[Repealed.]"
    #     text = re.sub(r'Section \d+.*?\s*rep\..*?(?=Section|$)', '', text, flags=re.DOTALL)
    #     text = re.sub(r'Section \d+\.\s*\[Repealed\.\].*?(?=Section|$)', '', text, flags=re.DOTALL)
    #
    #     text = re.sub(r'\d+\*', '', text)
    #
    # Remove square brackets but keep the content inside
    #     text = re.sub(r'\[(.*?)\]', r'\1', text)
    #
    # Replace terms
    #     text = re.sub(r'Code of Criminal Procedure \(Amendment\) Act \(\d+\)', '#', text)
    #     text = re.sub(r'Indian Penal Code', '$', text)
    #
    # Replace all numbering
    #     text = re.sub(r'\(\w*\)|\(\d+\w*\)|\d+\w*|\w\.', '.', text)
    #     text = re.sub(r'-+', '', text)

    return text

In [69]:
def write_text_to_file(filename, string_to_write):
    """Write ``string_to_write`` to ``filename``, replacing any existing file.

    The file is written as UTF-8 so that characters outside the platform's
    default code page do not raise ``UnicodeEncodeError``.

    Args:
        filename: Path of the file to create or overwrite.
        string_to_write: Text to store in the file.
    """
    # ``with`` closes the handle even when the write raises.
    with open(filename, "w", encoding="utf-8") as handle:
        handle.write(string_to_write)

In [70]:
# Normalise whitespace in each extracted document.
ipc_preprocess = preprocess_text(ipc)
bnss_preprocess = preprocess_text(bnss)
bpc_preprocess = preprocess_text(bpc)
# Save the raw and the preprocessed IPC text to disk. Raw strings keep the
# Windows backslashes literal instead of relying on unrecognised escapes.
write_text_to_file(r'C:\RV\tempdata\python\ML\Similarity\ipc.txt', ipc)
write_text_to_file(r'C:\RV\tempdata\python\ML\Similarity\ipc_preprocess.txt', ipc_preprocess)

In [71]:
def convert_to_sentences(text):
    """Split ``text`` on '.' and return the cleaned, non-empty fragments.

    Each fragment is processed in this order:
      1. Surrounding whitespace is stripped.
      2. Fragments without any ASCII letter (bare numbers, symbols) are dropped.
      3. If the first character is not an ASCII letter or digit, that one
         character is removed.
      4. A leading list marker such as ``a)``, ``12)`` or ``(3)`` plus any
         punctuation directly after it is removed.
      5. Fragments left empty by the steps above (e.g. a bare ``(b)``) are
         dropped so callers never receive blank sentences.

    Args:
        text: The document text, normally already whitespace-normalised.

    Returns:
        A list of cleaned sentence strings in document order.
    """
    # NOTE(review): step 3 runs before step 4, so a bracketed marker like
    # "[1]" reaches the pattern as "1]" and is not removed; and the second
    # alternative of the pattern escapes its opening bracket, so it only
    # matches the literal text "[a-zA-Z])". Both are kept as-is here.
    numbering = re.compile(r"^(?:\([a-zA-Z]\)|\[a-zA-Z]\)|\d+\)|\(\d+\)|\[\d+\]|\[[a-zA-Z]\]|[a-zA-Z]\)|[a-zA-Z]\()[^\w\s]*")

    sentences = []
    for fragment in text.split('.'):
        fragment = fragment.strip()
        if not re.search('[a-zA-Z]', fragment):
            continue
        # The letter check above guarantees the fragment is non-empty here.
        if re.search('[^a-zA-Z0-9]', fragment[0]):
            fragment = fragment[1:].strip()
        fragment = numbering.sub('', fragment).strip()
        if fragment:
            sentences.append(fragment)
    return sentences

In [72]:
def write_list_to_file(file, content):
    """Write every item of ``content`` to ``file``, one item per line.

    Any existing file is replaced. The output is UTF-8 encoded so that
    characters outside the platform's default code page can be written.

    Args:
        file: Path of the file to create or overwrite.
        content: Iterable of strings; a newline is appended to each.
    """
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(file, "w", encoding="utf-8") as handle:
        handle.writelines(item + "\n" for item in content)

In [73]:
# Split each preprocessed document into its list of cleaned sentences.
ipc_sentences, bnss_sentences, bpc_sentences = map(
    convert_to_sentences,
    (ipc_preprocess, bnss_preprocess, bpc_preprocess),
)

In [74]:
def add_token_to_sentence(sentences):
    """Join ``sentences`` into one string, wrapping each in "[CLS]"/"[SEP]".

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A single string of the form ``[CLS]s1[SEP][CLS]s2[SEP]...``; the
        empty string when ``sentences`` is empty.
    """
    # A single join avoids rebuilding the growing string on every iteration.
    return "".join('[CLS]' + sentence + '[SEP]' for sentence in sentences)

In [75]:
# Build one token-delimited string per document from its sentence list.
ipc_sentences_with_token, bnss_sentences_with_token, bpc_sentences_with_token = map(
    add_token_to_sentence,
    (ipc_sentences, bnss_sentences, bpc_sentences),
)

In [76]:
# Save each document's sentence list, one sentence per line. Raw strings keep
# the Windows backslashes literal instead of relying on the unrecognised
# "\S" escape.
write_list_to_file(r'C:\RV\tempdata\python\ML\Similarity\ipc_sentences.txt', ipc_sentences)
write_list_to_file(r'C:\RV\tempdata\python\ML\Similarity\bnss_sentences.txt', bnss_sentences)
write_list_to_file(r'C:\RV\tempdata\python\ML\Similarity\bpc_sentences.txt', bpc_sentences)

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

def compute_cosine_similarity(doc1, doc2):
    """Cosine similarity between two documents given as sentence collections.

    Both documents are encoded with ``MultiLabelBinarizer`` as indicator
    vectors over the distinct items found in either document, so the score
    reflects how many items (here: whole sentences) the two share verbatim.
    No TF-IDF weighting is involved.

    Args:
        doc1: Iterable of hashable items (sentence strings) of the first document.
        doc2: Iterable of hashable items (sentence strings) of the second document.

    Returns:
        The cosine similarity of the two indicator vectors as a scalar.
    """
    binarizer = MultiLabelBinarizer()

    # One row per document, one column per distinct item across both.
    indicator_matrix = binarizer.fit_transform([doc1, doc2])

    # Slices keep each row 2-D, as cosine_similarity expects.
    similarity_matrix = cosine_similarity(indicator_matrix[0:1], indicator_matrix[1:2])
    return similarity_matrix[0][0]

    

# Pairwise sentence-overlap similarity for the three documents.
# compute_cosine_similarity encodes each sentence list with
# MultiLabelBinarizer, so these scores compare whole sentences as labels.
# `similarity` is reused for each pair and ends up holding the last score.
similarity = compute_cosine_similarity(ipc_sentences, bnss_sentences)
print(f"Cosine Similarity(sentences) between ipc and bnss: {similarity}")

similarity = compute_cosine_similarity(ipc_sentences, bpc_sentences)
print(f"Cosine Similarity(sentences) between ipc and bpc: {similarity}")

similarity = compute_cosine_similarity(bnss_sentences, bpc_sentences)
print(f"Cosine Similarity(sentences) between bnss and bpc: {similarity}")

Cosine Similarity(sentences) between ipc and bnss: 0.32755787495583294
Cosine Similarity(sentences) between ipc and bpc: 0.010397812270505256
Cosine Similarity(sentences) between bnss and bpc: 0.010096575090103047


In [78]:
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

In [86]:
# One checkpoint id shared by the tokenizer and the model, so the two cannot
# drift apart when the checkpoint is changed.
LEGAL_BERT_CHECKPOINT = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(LEGAL_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(LEGAL_BERT_CHECKPOINT)

  return self.fget.__get__(instance, owner)()


In [87]:
# Tokenize each document's token-joined string and request PyTorch tensors
# (return_tensors="pt"), with truncation and padding enabled.
# NOTE(review): each input is ONE string holding the whole document, so
# truncation=True presumably keeps only the part that fits the model's
# maximum sequence length — confirm how much of each document is encoded.
# NOTE(review): the "[CLS]"/"[SEP]" markers in these strings were inserted as
# plain text by add_token_to_sentence; verify how the tokenizer treats them
# alongside any special tokens it adds itself.
ipc_sentences_tokenizer = tokenizer(ipc_sentences_with_token, return_tensors="pt", truncation=True, padding=True)
bnss_sentences_tokenizer = tokenizer(bnss_sentences_with_token, return_tensors="pt", truncation=True, padding=True)
bpc_sentences_tokenizer = tokenizer(bpc_sentences_with_token, return_tensors="pt", truncation=True, padding=True)

In [88]:
# Compute one embedding per document: run the model with gradient tracking
# disabled and average last_hidden_state over dim=1.
# NOTE(review): assumes dim=1 is the token axis, i.e. last_hidden_state is
# (batch, tokens, hidden) — TODO confirm against the model documentation.
# NOTE(review): the mean includes every position in the encoded input; if the
# inputs ever contain padding, the padded positions would be averaged in too.
with torch.no_grad():
    ipc_sentences_embedding = model(**ipc_sentences_tokenizer).last_hidden_state.mean(dim=1)
    bnss_sentences_embedding = model(**bnss_sentences_tokenizer).last_hidden_state.mean(dim=1)
    bpc_sentences_embedding = model(**bpc_sentences_tokenizer).last_hidden_state.mean(dim=1)

In [94]:
# Pairwise cosine similarity of the document embeddings; entry [0, 0] of each
# result matrix is the score that gets printed.
cos_similarity = cosine_similarity(ipc_sentences_embedding, bnss_sentences_embedding)
print(f"Cosine Similarity(semantic) between ipc and bnss: {cos_similarity[0, 0]!s}")

cos_similarity = cosine_similarity(ipc_sentences_embedding, bpc_sentences_embedding)
print(f"Cosine Similarity(semantic) between ipc and bpc: {cos_similarity[0, 0]!s}")

cos_similarity = cosine_similarity(bnss_sentences_embedding, bpc_sentences_embedding)
print(f"Cosine Similarity(semantic) between bnss and bpc: {cos_similarity[0, 0]!s}")


Cosine Similarity(semantic) between ipc and bnss: 0.9756323
Cosine Similarity(semantic) between ipc and bpc: 0.950667
Cosine Similarity(semantic) between bnss and bpc: 0.96584374
