In [None]:
!pip install pymupdf
!pip install sentence-transformers


Collecting pymupdf
  Downloading PyMuPDF-1.23.26-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.22 (from pymupdf)
  Downloading PyMuPDFb-1.23.22-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.23.22 pymupdf-1.23.26
Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import fitz  # PyMuPDF

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def preprocess_text_modified(text):

    # To convert bullet points into full stops if they follow a semi-colon or start of the text
    text = re.sub(r'(?<=;)\s*●\s*', '. ', text)
    text = re.sub(r'^●\s*', '. ', text, flags=re.MULTILINE)

    # To keep numeric references and legal terminologies intact - customize as necessary
    pattern_to_keep = re.compile(r'\b17Ad-22\(e\)\(7\)\b|\bCCP\b', re.IGNORECASE)
    kept_terms = pattern_to_keep.findall(text)

    # To convert text to lowercase, except for kept_terms
    text = text.lower()

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    processed_sentences = []

    for sentence in sentences:
        # Remove punctuation while preserving intra-word dashes
        sentence = re.sub(r'(?<!\w)-|-(?!\w)', ' ', sentence)  # Replace dashes not within words
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))

        # Tokenization
        tokens = word_tokenize(sentence)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word not in stop_words]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

        processed_sentences.append(' '.join(lemmatized_tokens))

    processed_text = '. '.join(processed_sentences)

    # Reinsert numeric references and legal terminologies with original casing
    for term in kept_terms:
        term_lowercase = term.lower()
        processed_text = re.sub(re.escape(term_lowercase), term, processed_text)

    return processed_text


pdf_path1 = '/content/rule 7 doc2.pdf'
pdf_path2 = '/content/rule 19 doc 2.pdf'

# Extract and preprocess text
text1 = extract_text_from_pdf(pdf_path1)
text2 = extract_text_from_pdf(pdf_path2)

preprocessed_text1 = preprocess_text_modified(text1)
preprocessed_text2 = preprocess_text_modified(text2)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
preprocessed_text1

'principle 7 liquidity risk fmi effectively measure monitor manage liquidity risk. fmi maintain sufficient liquid resource relevant currency effect sameday appropriate intraday multiday settlement payment obligation high degree confidence wide range potential stress scenario include limited default participant affiliate would generate largest aggregate liquidity obligation fmi extreme plausible market condition. key consideration 1 fmi robust framework manage liquidity risk participant settlement bank nostro agent custodian bank liquidity provider entity. key consideration 2 fmi effective operational analytical tool identify measure monitor settlement funding flow ongoing timely basis including use intraday liquidity. key consideration 3 payment system ss including one employing dns mechanism maintain sufficient liquid resource relevant currency effect sameday settlement appropriate intraday multiday settlement payment obligation high degree confidence wide range potential stress scena

# **BERT**

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from sentence_transformers import util
import numpy as np

# Initialize the tokenizer and model from Hugging Face Transformers
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the texts as PyTorch tensors
inputs1 = tokenizer(preprocessed_text1, return_tensors="pt", padding=True, truncation=True)
inputs2 = tokenizer(preprocessed_text2, return_tensors="pt", padding=True, truncation=True)

# Obtain embeddings
with torch.no_grad():  # No need to compute gradients
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

    # Use the mean of the last layer hidden states as sentence vector representation
    sentence_embedding1 = outputs1.last_hidden_state.mean(dim=1)
    sentence_embedding2 = outputs2.last_hidden_state.mean(dim=1)

# Calculate the cosine similarity between the embeddings using util.pytorch_cos_sim
similarity = util.pytorch_cos_sim(sentence_embedding1, sentence_embedding2)

print(f"cosine similarity is {similarity.item()}")


cosine similarity is 0.9216710329055786


# **SBERT**

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Split texts into lists of sentences.
sentences1 = preprocessed_text1.split('. ')
sentences2 = preprocessed_text2.split('. ')

# Encode sentences to get their embeddings
embedding1 = model.encode(sentences1, convert_to_tensor=True)
embedding2 = model.encode(sentences2, convert_to_tensor=True)

# Calculate pairwise cosine similarities between all sentences in both sets
similarity_matrix = util.pytorch_cos_sim(embedding1, embedding2)

# Define a similarity threshold for "high similarity"
similarity_threshold = 0.6

# Extract the sentences and scores for pairs above the threshold
high_similarity_pairs = [{
    'Sentences from rule 7': sentences1[i],
    'Sentences from rule 19': sentences2[j],
    'Cosine Similarity Score': similarity_matrix[i, j].item()
} for i in range(similarity_matrix.size(0)) for j in range(similarity_matrix.size(1)) if similarity_matrix[i, j] > similarity_threshold]

high_similarity_df = pd.DataFrame(high_similarity_pairs)

high_similarity_df.sort_values(by='Cosine Similarity Score', ascending=False, inplace=True)


In [None]:
high_similarity_df.to_csv('Similar_sentences_file2.csv', index=False)


In [None]:
high_similarity_df

Unnamed: 0,Sentences from rule 7,Sentences from rule 19,Cosine Similarity Score
5,ccas 17ad22e7 covered clearing agency shall es...,ccas 17ad22e19 covered clearing agency shall e...,0.818542
14,respect member sld requirement nscc provides r...,particular nscc requires member submit informa...,0.691035
13,respect member sld requirement nscc provides r...,thereafter part ongoing member due diligence p...,0.686795
3,key consideration 10 fmi establish explicit ru...,key consideration 1 fmi ensure rule procedure ...,0.675546
19,entity act otherwise member thus subject ongoi...,thereafter part ongoing member due diligence p...,0.647622
0,key consideration 7 fmi obtain high degree con...,key consideration 1 fmi ensure rule procedure ...,0.634824
18,entity act otherwise member thus subject ongoi...,ccas 17ad22e19 covered clearing agency shall e...,0.631049
1,key consideration 7 fmi obtain high degree con...,key consideration 3 fmi identify indirect part...,0.628933
6,ccas 17ad22e7 covered clearing agency shall es...,thereafter part ongoing member due diligence p...,0.627752
7,liquidity risk management framework nscc affil...,ccas 17ad22e19 covered clearing agency shall e...,0.62043


# Calculating the overall cosine similarity between both rules using SBERT

In [None]:
# Calculate the average embedding for each document
avg_embedding1 = torch.mean(embedding1, dim=0)
avg_embedding2 = torch.mean(embedding2, dim=0)

# Calculate the cosine similarity between the average embeddings of the two documents
overall_similarity = util.pytorch_cos_sim(avg_embedding1, avg_embedding2)

print("Overall similarity between the two documents:", overall_similarity.item())

Overall similarity between the two documents: 0.7677137851715088
