### Connect to G-Drive

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

### Install Dependencies

#### With Pre-Trained Models

In [None]:
!pip install sentence-transformers spacy-transformers nltk laserembeddings fuzzywuzzy jellyfish python-Levenshtein

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0

In [None]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.10/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [None]:
!python -m spacy download en_core_web_sm
# # !python -m spacy download en_core_web_md
# # !python -m spacy download en_core_web_lg
# # !python -m spacy download en_core_web_trf

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!git clone https://github.com/sherozshaikh/text_to_vector_embedding_pipeline.git

Cloning into 'text_to_vector_embedding_pipeline'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 78 (delta 42), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (78/78), 198.55 KiB | 6.85 MiB/s, done.
Resolving deltas: 100% (42/42), done.


In [None]:
%cd text_to_vector_embedding_pipeline

/content/text_to_vector_embedding_pipeline


In [None]:
import time
from typing import List
import numpy as np
import pandas as pd
import os
import zipfile
import warnings
warnings.filterwarnings("ignore")
import re
from difflib import get_close_matches as difflib_get_close_matches
from difflib import SequenceMatcher as difflib_sequencematcher
from functools import partial
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzywuzzy_process
from jellyfish import jaro_winkler_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Pre-Trained Models
import torch
from transformers import AutoTokenizer,AutoModel
from sentence_transformers import SentenceTransformer
from laserembeddings import Laser
import gensim.downloader as api
import spacy
from model_fetcher import HuggingFaceModelFetcher
from embedding import TextEmbedding
text_to_vector = TextEmbedding()

# For Text Processing
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download(['punkt', 'stopwords'])
# nltk_stopwords_set = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### Loading Laser Embedding

In [None]:
# Instantiate the Laser encoder (from the laserembeddings package) with its default arguments.
laser_embeddings = Laser()

##### Loading [spaCy](https://spacy.io/models) Model

In [None]:
# Load the "en_core_web_sm" spaCy pipeline with only its "tok2vec" component enabled.
nlp_spacy_model = spacy.load("en_core_web_sm",enable = ["tok2vec"])

##### Loading [gensim](https://radimrehurek.com/gensim/models/word2vec.html) Model

In [None]:
# Fetch the "glove-wiki-gigaword-50" word vectors through gensim's downloader API.
word_embedding_model = api.load("glove-wiki-gigaword-50")



##### Loading [Sentence Transformer](https://huggingface.co/sentence-transformers) Model

In [None]:
# Load the 'all-MiniLM-L6-v2' Sentence Transformer checkpoint.
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

##### Loading [Pre-Trained Models](https://huggingface.co/models) Model

In [None]:
# Hugging Face model id shared by the tokenizer and the model so the two always match.
hf_model_name:str = "google-bert/bert-base-uncased"
# Load the tokenizer and the bare (headless) model for that id via the Auto* factories.
hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
hf_model = AutoModel.from_pretrained(hf_model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#### Without Pre-Trained Models

In [None]:
!pip install fuzzywuzzy jellyfish python-Levenshtein nltk

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m797.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fuzzywuzzy, rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.1 fuzzywuzzy-0.18.0 python-Levenshtein-0.25.1 rapidfuzz-3.9.3


In [None]:
import time
print('Loading Libraries: ',time.ctime())
from typing import List
import numpy as np
import pandas as pd
import os
import zipfile
import warnings
warnings.filterwarnings("ignore")
import re
from difflib import get_close_matches as difflib_get_close_matches
from difflib import SequenceMatcher as difflib_sequencematcher
from functools import partial
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzywuzzy_process
from jellyfish import jaro_winkler_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For Text Processing
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download(['punkt', 'stopwords'])
# nltk_stopwords_set = set(stopwords.words('english'))

Loading Libraries:  Sun Jun 23 13:57:06 2024


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Chunk Matching Using Embeddings and Cosine Similarity

###### Objective
The script is designed to compare chunks of text extracted from two PDF files (PDF1 and PDF2) using advanced natural language processing **`(NLP)`** techniques. Each chunk represents a segment of text from the PDF, treated as a standalone document unit. The goal is to determine how closely related chunks from PDF1 are to chunks from PDF2, facilitating alignment and comparison between the two documents.

###### Methodology
1. **`Text Chunk Extraction`**: Chunks of text are extracted from PDF1 and PDF2. Each chunk can be considered as a paragraph, section, or any logical division within the document.

2. **`Embedding Generation`**: Each chunk is transformed into a numerical representation (embedding) using state-of-the-art NLP models such as Word2Vec, Pre-Trained Models like GPT or BERT, LASER Embedding, spaCy, or Sentence Transformers. These embeddings capture semantic meaning and context, enabling a more nuanced comparison than traditional methods.

3. **`Cosine Similarity Calculation`**: Using cosine similarity, the script measures how similar the embeddings of chunks from PDF1 are to those from PDF2. Cosine similarity ranges from -1 (vectors pointing in opposite directions) to 1 (vectors pointing in the same direction), with 0 indicating no similarity (orthogonal vectors).

4. **`Mapping and Alignment`**: Based on the cosine similarity scores, chunks from PDF1 are mapped to corresponding chunks from PDF2. This mapping helps identify which chunks in PDF1 correspond most closely to chunks in PDF2, facilitating cross-document analysis and alignment.

---

###### Effectiveness
This approach is particularly effective in scenarios such as:

- **`Comparative Analysis`**: When comparing two versions of a document (e.g., before and after revisions), the script can highlight corresponding sections, aiding in understanding changes made.
  
- **`Research Synthesis`**: In academic research or literature review, matching chunks between two studies or documents can streamline comparison and synthesis of findings.

- **`Legal and Regulatory Compliance`**: For compliance reviews, mapping sections between regulatory documents ensures consistency and completeness in meeting requirements by comparing versions to identify modifications and implications.

- **`Contract Management`**: Streamlining contract revisions by comparing versions to identify changes in terms and conditions, ensuring compliance and accuracy.

- **`Financial Reporting`**:  Comparing quarterly financial statements to track discrepancies and ensure consistency in reporting across periods.

- **`Technical Documentation`**:  Aligning updates in technical manuals or engineering specifications to maintain accuracy and consistency in product documentation.

- **`Compliance Audits`**: Ensuring regulatory compliance by comparing policy updates and procedures across audits to identify gaps and ensure adherence.

---

###### Multiple Model Integration
To enhance accuracy and robustness, the script supports multiple NLP models for embedding generation:

- **`Word2Vec`**: Offers traditional word embeddings based on co-occurrence statistics, useful for capturing semantic similarities between words.

- **`Sentence Transformers`**: Transforms entire sentences or paragraphs into embeddings optimized for semantic similarity tasks, enabling comparisons of longer text segments.

- **`Pre-Trained Model (BERT/GPT)`**: Provides contextualized embeddings capturing fine-grained meaning and relationships within text.

- **`TF-IDF (Term Frequency-Inverse Document Frequency)`**: Computes the importance of each word in a document relative to a collection of documents, providing a basic measure of word relevance.

- **`Spacy Model`**: Utilizes linguistic annotations and trained pipelines to generate word vectors and document embeddings, leveraging syntactic and semantic features.

- **`LASER (Language-Agnostic SEntence Representations)`**: Produces multilingual sentence embeddings trained on large-scale parallel corpora, facilitating cross-linguistic text comparisons.

---

###### Final Weighted Score Generation
Using the script we can aggregate results from multiple models, assigning higher weights to outputs from pre-trained models like BERT, known for their superior performance in capturing semantic relationships. Lower weights are assigned to simpler models like Word2Vec, which provide complementary insights.

By synthesizing outputs from different models into a final weighted score, the script can offer a comprehensive view that balances accuracy with computational efficiency, catering to diverse analytical needs.

###### Example
Consider a scenario where PDF1 contains a technical report on renewable energy projects, and PDF2 includes a funding proposal for similar projects. Using the script:
- Chunks from PDF1 describing project specifications can be mapped to corresponding sections in PDF2 detailing funding requirements.
- The cosine similarity scores indicate the degree of alignment, helping stakeholders identify critical overlaps and discrepancies between project plans and funding proposals.

This methodological approach not only enhances document comparison but also supports decision-making processes by providing structured insights into document relationships.

In [None]:
def custom_ram_cleanup_func()->None:
  """
  Delete user-defined global variables to release memory.

  Every name in this module's global namespace is removed unless one of the
  following holds:

  - the name starts with an underscore ('_');
  - the name equals a key of `sys.modules` (e.g. 'os', 'time', 'string');
  - the name is one of the IPython / interpreter bookkeeping names listed
    below, or 'sys', 'os' or 'custom_ram_cleanup_func' itself;
  - the value bound to the name is a module object. This keeps aliased
    imports such as `import numpy as np` alive: the alias 'np' is not a key
    of `sys.modules` (the key is 'numpy'), so a name-only check would delete it.

  Functions, classes and data objects that do not match any rule above ARE
  deleted, including names brought in with `from x import y`.

  Returns:
  None
  """

  import sys # function-scope import: works even when 'sys' is not a global
  module_type = type(sys)

  # A set gives O(1) membership tests; sys.modules alone can hold thousands of keys.
  protected_names:set = set(sys.modules.keys())
  protected_names.update(['In','Out','_','__','___','__builtin__','__builtins__','__doc__','__loader__','__name__','__package__','__spec__','_dh','_i','_i1','_ih','_ii','_iii','_oh','exit','get_ipython','quit','sys','os','custom_ram_cleanup_func',])

  namespace:dict = globals()
  # Iterate over a snapshot of the keys because the dict is mutated in the loop.
  for var in list(namespace.keys()):
    if var.startswith('_') or var in protected_names:
      continue
    if isinstance(namespace[var],module_type):
      # Imported module bound under an alias (np, pd, ...): keep it usable.
      continue
    del namespace[var]
  return None


In [None]:
class DocMapper():
  """
  A class to find close elements links between two documents.

  Pairwise similarity comes from the cosine similarity of caller-supplied
  embeddings when both are provided, otherwise from a TF-IDF representation
  built from the element texts. Pairs above the threshold are kept, refined
  with difflib / fuzzywuzzy string matching, and written to an output folder
  as an HDF5 file, an Excel workbook and a ZIP archive of both.
  """
  def __init__(self,doc1_elements_list:List[str],doc2_elements_list:List[str],doc1_elements_embedding:np.ndarray=None,doc2_elements_embedding:np.ndarray=None,threshold_:float=0.6,output_folder:str='Mapped_Attributes'):
    """
    Initialize the DocMapper class.

    Args:
    - doc1_elements_list (List[str]): List containing Doc1 Elements.
    - doc2_elements_list (List[str]): List containing Doc2 Elements.
    - doc1_elements_embedding (ndarray, optional): Doc1 Embedding Vectors (one row per Doc1 element).
    - doc2_elements_embedding (ndarray, optional): Doc2 Embedding Vectors (one row per Doc2 element).
    - threshold_ (float, optional), default = 0.6: Threshold value (0-1 scale) for filtering similarity scores.
    - output_folder (str), default = Mapped_Attributes: Output Folder Name. Non-alphanumeric
      characters are replaced and spaces become underscores; a name that is empty after
      this clean-up falls back to the default.

    """
    self.doc1_elements_list:List[str] = doc1_elements_list
    self.doc2_elements_list:List[str] = doc2_elements_list
    self.doc1_elements_embedding:np.ndarray = doc1_elements_embedding
    self.doc2_elements_embedding:np.ndarray = doc2_elements_embedding
    self.threshold_:float = threshold_
    cleaned_folder:str = self.trim_characters(stxt=output_folder).replace(' ','_')
    # An all-symbol name sanitises to '', which os.mkdir cannot create.
    self.output_folder:str = cleaned_folder or 'Mapped_Attributes'

  def __repr__(self):
    """
    Returns a string representation of the class instance.
    """
    return "DocMapper()"

  def __str__(self):
    """
    Returns a description of the class.
    """
    return "Class to fetch Similar Doc1 Elements for given Doc2 Elements."

  @staticmethod
  def _has_embedding(embedding)->bool:
    """
    Tells whether an embedding was supplied and holds at least one non-None entry.
    """
    # `!= None` is deliberate: on an ndarray it is an element-wise comparison.
    return embedding is not None and bool(np.any(embedding != None))

  def get_jaro_winkler_similarity(self,sent_1:str,sent_2:str)->float:
    """
    Calculates the Jaro-Winkler similarity between two strings.

    Args:
    - sent_1 (str): First string.
    - sent_2 (str): Second string.

    Returns:
    - float: similarity score (percentage).
    """
    return (jaro_winkler_similarity(sent_1,sent_2))*100

  def get_minhash_containment_distance(self,sent_1:str,sent_2:str)->float:
    """
    Calculates a character-overlap containment score between two strings.

    The score is the number of distinct characters the two strings share,
    divided by the length of the longer string. (Despite the name, no
    MinHash signatures are computed.)

    Args:
    - sent_1 (str): First string.
    - sent_2 (str): Second string.

    Returns:
    - float: containment score in [0, 1]; 0.0 when both strings are empty.
    """
    shared_chars:int = len(set(sent_1).intersection(set(sent_2)))
    longest:int = max(len(sent_1),len(sent_2))
    if longest == 0:
      # Both inputs empty: avoid ZeroDivisionError and report "nothing shared".
      return 0.0
    return shared_chars / longest

  def find_closest_matches_difflib(self,word_1:str,possibilities_1:List[str],n_1=10,cutoff_1=0.5)->list:
    """
    Finds closest matches using difflib's get_close_matches function.

    Args:
    - word_1 (str): Word to find matches for.
    - possibilities_1 (List[str]): List of possible matches.
    - n_1 (optional), default = 10: Maximum number of matches to return.
    - cutoff_1 (optional), default = 0.5: Similarity cutoff threshold.

    Returns:
    - list: List of closest matches.
    """
    return difflib_get_close_matches(word=word_1,possibilities=possibilities_1,n=n_1,cutoff=cutoff_1)

  def find_closest_matches_fuzzywuzzy(self,query_1:str,choices_1:List[str],limit_1=10)->list:
    """
    Finds closest matches using fuzzywuzzy's process function.

    Args:
    - query_1 (str): Query string.
    - choices_1 (List[str]): List of choices to match against.
    - limit_1 (optional), default = 10: Maximum number of matches to return.

    Returns:
    - list: List of closest matches.
    """
    return fuzzywuzzy_process.extract(query=query_1,choices=choices_1,limit=limit_1)

  def process_batch_difflib(self,new_elements:str,pre_defined_elements:List[str])->dict:
    """
    Processes a batch of elements using difflib for finding closest matches.

    Args:
    - new_elements (str): New element to match.
    - pre_defined_elements (List[str]): List of pre-defined elements to match against.

    Returns:
    - dict: {'Element': new_elements, 'similar': [(match, ratio * 100), ...]}.
    """
    close_matches:list = self.find_closest_matches_difflib(word_1=new_elements,possibilities_1=pre_defined_elements)
    scored_matches:list = [(item,(difflib_sequencematcher(None,new_elements,item).ratio())*100) for item in close_matches]
    return {'Element':new_elements,'similar':scored_matches}

  def process_batch_fuzzywuzzy(self,new_elements:str,pre_defined_elements:List[str])->dict:
    """
    Processes a batch of elements using fuzzywuzzy for finding closest matches.

    Args:
    - new_elements (str): New element to match.
    - pre_defined_elements (List[str]): List of pre-defined elements to match against.

    Returns:
    - dict: Dictionary containing element and its closest matches with scores.
    """
    return {'Element':new_elements,'similar':self.find_closest_matches_fuzzywuzzy(query_1=new_elements,choices_1=pre_defined_elements)}

  def trim_characters(self,stxt:str='')->str:
    """
    Replaces non-alphanumeric characters with spaces and collapses whitespace.

    Args:
    - stxt (str): Input string (any object is accepted; it is converted with str()).

    Returns:
    - str: String containing only ASCII letters and digits separated by single spaces.
    """
    alnum_only:str = re.sub(r'[^a-zA-Z\d]',r' ',str(stxt))
    return re.sub(r'\s+',r' ',alnum_only).strip()

  def create_final_folder(self)->None:
    """
    Creates Output Folder.
    If the folder already exists, it is first removed along with all its contents, and then a new empty folder is created.

    Returns:
    - None
    """
    import shutil # function-scope import: the notebook's import cells do not bring in shutil
    if os.path.isdir(self.output_folder):
      # os.rmdir only removes EMPTY directories; rmtree also clears files left by a previous run.
      shutil.rmtree(self.output_folder)
    os.mkdir(self.output_folder)
    return None

  def create_final_zip(self)->None:
    """
    Creates a ZIP archive of all the contents.
    This method walks through the output folder, adds all files to '<output_folder>.zip'
    created next to the folder, and then moves that archive into the output folder.

    Returns:
    - None
    """
    zip_file_path:str = self.output_folder+'.zip'

    # The archive is written OUTSIDE the folder being walked so it never includes itself.
    with zipfile.ZipFile(file=zip_file_path,mode='w',compression=zipfile.ZIP_DEFLATED) as zip_file:
      for all_root,all_dirs,all_files in os.walk(self.output_folder):
        for file_1 in all_files:
          temp_file_path = os.path.join(all_root,file_1)
          # Store paths relative to the output folder.
          zip_file.write(
            temp_file_path,
            os.path.relpath(temp_file_path,self.output_folder)
            )

    os.rename(os.path.abspath(zip_file_path),os.path.abspath(os.path.join(self.output_folder,zip_file_path)))
    return None

  def pre_processing_text_values(self,txt:str='',is_lower:bool=True,remove_characters:bool=True)->str:
    """
    Pre-processes text values by lowercasing, removing non-alphanumeric characters, and tokenizing.

    Args:
    - txt (str): Input text.
    - is_lower (bool, optional), default = True: Convert text to lowercase.
    - remove_characters (bool, optional), default = True: Remove non-alphanumeric characters.

    Returns:
    - str: Pre-processed text (alphanumeric tokens joined by single spaces).
    """
    txt = str(txt).strip()
    if is_lower:
      txt = txt.lower()

    if remove_characters:
      txt = self.trim_characters(stxt=txt)

    return ' '.join([x for x in word_tokenize(txt) if x.isalnum()])

  def get_all_similarity_scores(self,row1:pd.Series)->pd.Series:
    """
    Calculates similarity scores between Doc1 and Doc2 Elements using various metrics.

    The row is modified in place: one 'high_t_*' entry is added per metric
    (as a percentage) and 'Score' is rescaled from the 0-1 range to a percentage.

    Args:
    - row1 (pd.Series): Input row containing 'doc2_elements', 'doc1_elements' and 'Score'.

    Returns:
    - pd.Series: Series with similarity scores appended.
    """
    # Built here so the method does not depend on a name defined elsewhere.
    high_score_text_metrics:dict={
      'get_jaro_winkler_similarity':self.get_jaro_winkler_similarity,
      'get_minhash_containment_distance':self.get_minhash_containment_distance,
      }
    doc1_,doc2_=row1['doc1_elements'],row1['doc2_elements']
    for metric_name,metric_func in high_score_text_metrics.items():
      metric_value = metric_func(sent_1=doc1_,sent_2=doc2_)
      if metric_name in ['get_minhash_containment_distance']:
        # This metric is a 0-1 fraction; the Jaro-Winkler helper already returns a percentage.
        metric_value = metric_value*100
      row1[metric_name.replace('get_','high_t_')]=round(number=metric_value,ndigits=6)
    row1['Score'] = round(number=(row1['Score']*100),ndigits=6)
    return row1

  def calculate_similarity_tfidf(self,texts1:List[str],texts2:List[str])->np.ndarray:
    """
    Calculates TF-IDF cosine similarity between two lists of texts.

    A single vectorizer (1- to 4-grams, English stop words removed, at most
    30,000 features) is fitted on both lists so they share one vocabulary.

    Args:
    - texts1 (List[str]): List of first texts.
    - texts2 (List[str]): List of second texts.

    Returns:
    - np.ndarray: Similarity score matrix of shape (len(texts1), len(texts2)).
    """
    tfidf_vectorizer = TfidfVectorizer(decode_error = 'strict',use_idf = True,smooth_idf = True,binary = False,lowercase = True,max_features = 30_000,dtype = np.float32,ngram_range = (1,4),stop_words = 'english').fit(texts1+texts2)
    texts1_matrix = tfidf_vectorizer.transform(texts1)
    texts2_matrix = tfidf_vectorizer.transform(texts2)
    similarity_score_matrix:np.ndarray = cosine_similarity(texts1_matrix,texts2_matrix)
    return similarity_score_matrix

  def calculate_similarity_score(self,texts1_matrix:np.ndarray,texts2_matrix:np.ndarray)->np.ndarray:
    """
    Calculates cosine similarity between two matrices of texts.

    Args:
    - texts1_matrix (np.ndarray): First matrix of texts.
    - texts2_matrix (np.ndarray): Second matrix of texts.

    Returns:
    - np.ndarray: Similarity score matrix.
    """
    return cosine_similarity(texts1_matrix,texts2_matrix)

  def filter_similarity_matrix(self,similarity_matrix:np.ndarray,threshold_val:float=0.65)->pd.DataFrame:
    """
    Filters similarity matrix based on a threshold value.

    Args:
    - similarity_matrix (np.ndarray): Similarity score matrix (rows index Doc1 elements, columns index Doc2 elements).
    - threshold_val (float, optional), default = 0.65: Threshold value for similarity. Update using "threshold_" during initialization

    Returns:
    - pd.DataFrame: Rows with 'doc1_elements', 'doc2_elements' and 'Score' for every pair
      scoring strictly above the threshold, sorted by Doc1 element and descending score.
      Empty (but with those three columns) when no pair qualifies.
    """
    relevant_indices:np.ndarray = np.argwhere(similarity_matrix > threshold_val)
    ids_1:np.ndarray = relevant_indices[:, 0]
    ids_2:np.ndarray = relevant_indices[:, 1]
    filtered_scores:np.ndarray = similarity_matrix[ids_1,ids_2]
    results:list = [{
        'doc1_elements': self.doc1_elements_list[i],
        'doc2_elements': self.doc2_elements_list[j],
        'Score': k,
    } for i, j, k in zip(ids_1, ids_2, filtered_scores)]
    # Explicit columns keep sort_values from raising KeyError when nothing matched.
    results_df:pd.DataFrame = pd.DataFrame(results,columns=['doc1_elements','doc2_elements','Score'])
    if results_df.empty:
      # An empty frame defaults to object dtype; keep 'Score' numeric for later arithmetic.
      results_df['Score'] = results_df['Score'].astype(float)
    return results_df.sort_values(by=['doc1_elements','Score'],ascending=[True,False])

  def main(self)->None:
    """
    Main function to perform attribute mapping and write results to disk.

    Outputs (inside the freshly recreated output folder):
    - Mapping.h5: consolidated mapping in HDF5 format.
    - Mapping.xlsx: sheets 'Mapping', 'Doc1_NotFound' and 'Doc2_NotFound'.
    - <output_folder>.zip: archive of the two files above.
    """
    # Calculate similarity scores based on the availability of embeddings
    if self._has_embedding(self.doc1_elements_embedding) and self._has_embedding(self.doc2_elements_embedding):
      similarity_score:np.ndarray = self.calculate_similarity_score(texts1_matrix=self.doc1_elements_embedding,texts2_matrix=self.doc2_elements_embedding)
    else:
      processed_doc1_elements_list:list=[self.pre_processing_text_values(txt=x,is_lower=True,remove_characters=True) for x in self.doc1_elements_list]
      processed_doc2_elements_list:list=[self.pre_processing_text_values(txt=x,is_lower=True,remove_characters=True) for x in self.doc2_elements_list]
      similarity_score:np.ndarray = self.calculate_similarity_tfidf(texts1=processed_doc1_elements_list,texts2=processed_doc2_elements_list)

    # Filter and process the similarity matrix
    mapped_result_df:pd.DataFrame = self.filter_similarity_matrix(similarity_matrix=similarity_score,threshold_val=self.threshold_)

    # Doc1 elements that matched more than one Doc2 element. Working on the
    # value_counts Series directly avoids the reset_index() column names,
    # which differ between pandas 1.x and 2.x.
    match_counts:pd.Series = mapped_result_df['doc1_elements'].value_counts()
    multi_matched_doc1:list = match_counts[match_counts>1].index.tolist()
    filtered_df:pd.DataFrame = mapped_result_df[mapped_result_df['doc1_elements'].isin(multi_matched_doc1)]

    # Group by Doc1 Elements and find nearest matches using difflib and fuzzywuzzy
    grouped_df:pd.DataFrame = filtered_df[['doc1_elements','doc2_elements']].groupby(by=['doc1_elements']).agg(list).reset_index()
    grouped_pairs:list = list(zip(grouped_df['doc1_elements'],grouped_df['doc2_elements']))
    nearest_matches_difflib:list = [self.process_batch_difflib(new_elements=doc1_item,pre_defined_elements=doc2_items) for doc1_item,doc2_items in grouped_pairs]
    nearest_matches_fuzzywuzzy:list = [self.process_batch_fuzzywuzzy(new_elements=doc1_item,pre_defined_elements=doc2_items) for doc1_item,doc2_items in grouped_pairs]

    # An empty list still yields a frame with the named columns, so no special-casing is needed.
    nearest_matches_difflib_df:pd.DataFrame = pd.DataFrame([(entry['Element'],similar[0],similar[1]) for entry in nearest_matches_difflib for similar in entry['similar']],columns=['doc1_elements','doc2_elements','difflib'])
    nearest_matches_fuzzywuzzy_df:pd.DataFrame = pd.DataFrame([(entry['Element'],similar[0],similar[1]) for entry in nearest_matches_fuzzywuzzy for similar in entry['similar']],columns=['doc1_elements','doc2_elements','fuzzywuzzy'])

    # Post-Processing
    nearest_matches_df:pd.DataFrame = pd.merge(left=nearest_matches_difflib_df,right=nearest_matches_fuzzywuzzy_df,on=['doc1_elements','doc2_elements'],how='outer')
    # difflib / fuzzywuzzy scores are on a 0-100 scale while threshold_ is on 0-1,
    # so the cutoff is rescaled before comparing. Pairs missing either score (NaN) are dropped.
    score_cutoff:float = self.threshold_*100
    nearest_matches_df = nearest_matches_df[(nearest_matches_df['difflib']>score_cutoff) & (nearest_matches_df['fuzzywuzzy']>score_cutoff)]
    nearest_matches_df = nearest_matches_df[['doc1_elements','doc2_elements','difflib']].rename(columns={'difflib':'Score'})

    # Express the cosine scores as percentages so they are comparable with the string scores.
    mapped_result_df['Score'] = (mapped_result_df['Score']*100).round(4)
    consolidated_result_df:pd.DataFrame = pd.concat(objs=[mapped_result_df,nearest_matches_df])
    consolidated_result_df = consolidated_result_df.sort_values(by=['doc1_elements','Score'],ascending=[True,False])
    # After the descending sort, drop_duplicates keeps the highest score per pair.
    consolidated_result_df = consolidated_result_df.drop_duplicates(subset=['doc1_elements','doc2_elements'])

    # Elements without any mapping, listed once each in their original input order.
    matched_doc1:set = set(consolidated_result_df['doc1_elements'].dropna().tolist())
    matched_doc2:set = set(consolidated_result_df['doc2_elements'].dropna().tolist())
    doc1_not_found_df:pd.DataFrame = pd.DataFrame(data=[x for x in dict.fromkeys(self.doc1_elements_list) if x not in matched_doc1],columns=['doc1_elements'])
    doc2_not_found_df:pd.DataFrame = pd.DataFrame(data=[x for x in dict.fromkeys(self.doc2_elements_list) if x not in matched_doc2],columns=['doc2_elements'])
    consolidated_result_df.columns = ['Doc1_elements','Doc2_elements','Score']

    # Finalize and write the results to disk
    self.create_final_folder() # create folder

    consolidated_result_df.to_hdf(path_or_buf=self.output_folder+'/Mapping.h5',key='compress',mode='w',encoding='utf-8') # save in hdf file format

    with pd.ExcelWriter(path=self.output_folder+'/Mapping.xlsx',mode='w') as file_writer: # save in excel file format
      consolidated_result_df.to_excel(excel_writer=file_writer,sheet_name='Mapping',index=False)
      doc1_not_found_df.to_excel(excel_writer=file_writer,sheet_name='Doc1_NotFound',index=False)
      doc2_not_found_df.to_excel(excel_writer=file_writer,sheet_name='Doc2_NotFound',index=False)

    self.create_final_zip() # create zip
    print('wrote the final output to local: ',time.ctime())
    return None


### Simulation Run

In [None]:
# Sample input for the simulation: 28 scope-of-work statements for a construction /
# renovation project, kept in alphabetical order. Short one-line items sit next to longer
# paragraph-style descriptions of the same work (e.g. the parking structure, the green roof,
# the solar panels), so the list contains deliberate near-duplicates.
# Passed to DocMapper as `doc1_elements_list` in the runs below.
doc1_elements:list=[
  "As part of our construction project, we will install solar panels on the rooftop to harness renewable energy and reduce our carbon footprint. This initiative aligns with our commitment to sustainability and will contribute significantly to our energy independence.",
  "Construction of a new office building with sustainable materials and energy-efficient systems to minimize environmental impact.",
  "Construction of a parking structure for employees.",
  "Construction of bicycle storage facilities to encourage eco-friendly transportation choices.",
  "Construction of new office building with sustainable materials.",
  "Construction of parking structure for employee convenience.",
  "Creation of outdoor recreational spaces to promote employee well-being and encourage physical activity.",
  "Design and construction of pedestrian-friendly pathways to enhance accessibility and safety.",
  "Design and implementation of a green roof to enhance biodiversity and mitigate urban heat island effects.",
  "Design and implementation of green roof for biodiversity.",
  "Design and installation of green walls to improve air quality and provide natural insulation.",
  "Implementation of energy-efficient HVAC systems to improve indoor air quality and reduce operational costs.",
  "Implementation of rainwater harvesting systems for water reuse and sustainability.",
  "Implementation of waste management solutions to reduce landfill waste and promote recycling efforts.",
  "Installation of electric vehicle charging stations to support sustainable commuting options.",
  "Installation of smart lighting solutions for energy savings and improved lighting quality.",
  "Installation of solar panels on the rooftop to harness renewable energy and reduce carbon footprint.",
  "Integration of digital security systems for enhanced building and data protection.",
  "Integration of fire safety systems to ensure building protection and occupant safety.",
  "Landscaping with native plants to promote environmental sustainability and enhance aesthetic appeal.",
  "Our scope of work involves renovating the existing electrical systems to meet current safety standards and enhance operational efficiency. We will implement advanced technologies to optimize energy consumption while ensuring reliability and compliance with regulatory requirements.",
  "Renovation of existing electrical systems to meet current safety standards and optimize energy consumption.",
  "The construction of a new parking structure for employees is essential to alleviate current parking shortages and improve accessibility. Our design focuses on maximizing space efficiency and incorporating eco-friendly materials to enhance the overall environmental impact of the project.",
  "Upgrade of building insulation to improve energy efficiency and maintain indoor comfort.",
  "Upgrade of elevators to meet accessibility standards and enhance user experience.",
  "Upgrade of plumbing infrastructure to enhance water conservation and ensure sustainable usage.",
  "We are tasked with designing and implementing a green roof atop the new office building to enhance biodiversity, improve air quality, and mitigate urban heat island effects. Our approach integrates innovative planting techniques and sustainable irrigation systems.",
  "We propose to construct a state-of-the-art office building equipped with sustainable materials and advanced energy-efficient systems to minimize environmental impact and enhance employee productivity. The project includes innovative design elements such as a green roof and solar panels integrated into the building's structure.",
]

In [None]:
# Sample input for the simulation: 25 insurance coverage / policy statements, kept in
# alphabetical order. Each one refers to a kind of work that also appears in `doc1_elements`
# (parking structure, green roof, solar panels, electrical renovation, ...), again mixing
# short one-liners with longer paragraph-style wordings of the same cover.
# Passed to DocMapper as `doc2_elements_list` in the runs below.
doc2_elements:list=[
  "Coverage for building insulation upgrades, protecting against energy loss and maintaining thermal efficiency.",
  "Coverage for elevator upgrades and installations, protecting against mechanical failures and ensuring safety compliance.",
  "Coverage for employee parking structure construction, protecting against accidents and liability claims during construction phases.",
  "Coverage for green wall design and installation, protecting against maintenance issues and structural damage.",
  "Coverage for liability during the construction of office buildings, protecting against property damage and third-party claims.",
  "Coverage for pedestrian-friendly pathway projects, protecting against accidents and ensuring safe access.",
  "Coverage for smart lighting solutions, protecting against electrical failures and ensuring continuous operation.",
  "Insurance for digital security system integration, covering data breaches and system failures impacting security measures.",
  "Insurance for electric vehicle charging stations, covering equipment damage and liability related to charging operations.",
  "Insurance for HVAC system installations, covering system failures and performance issues impacting operational efficiency.",
  "Insurance for landscaping projects with native plants, covering damage and loss due to natural disasters and accidents.",
  "Insurance for rainwater harvesting systems, covering equipment failures and water quality issues.",
  "Insurance for solar panel installation projects, covering equipment damage and performance issues due to adverse conditions.",
  "Insuring green roofs against damage from severe weather conditions, vandalism, and structural issues.",
  "Our insurance package includes coverage specifically tailored for the solar panel installation project, offering protection against equipment damage, performance issues, and financial losses due to adverse weather conditions or operational failures.",
  "Our insurance plan includes coverage for the green roof of the new office building, protecting against damage from severe weather conditions, vandalism, and structural issues. We emphasize proactive risk management to safeguard your sustainable investments.",
  "Our insurance policy provides comprehensive coverage for all liability risks during the construction phases of the new office building project. We ensure protection against property damage, third-party injuries, and unforeseen events to safeguard your investment.",
  "Policy for bicycle storage facilities construction, providing coverage for theft, damage, and liability claims.",
  "Policy for construction of outdoor recreational spaces, providing coverage for construction-related risks and liability.",
  "Policy for fire safety system integration, providing coverage for fire incidents and compliance-related liabilities.",
  "Policy for plumbing infrastructure upgrades, providing coverage for leaks, failures, and water damage.",
  "Policy for renovations and upgrades of electrical systems, providing coverage for equipment failures and business interruption.",
  "Policy for waste management solutions, providing coverage for environmental liabilities and waste disposal incidents.",
  "We offer specialized insurance solutions for the renovation of electrical systems, providing coverage for equipment failures, electrical fires, and business interruption. Our policies are designed to minimize financial risks and ensure business continuity.",
  "We provide comprehensive insurance coverage for the construction of employee parking structures, protecting against structural damage, accidents during construction, and liability claims. Our tailored solutions ensure peace of mind and financial security.",
]

##### Vector Embedding using Traditional Methods

In [None]:
# Baseline run: no pre-computed embeddings are supplied (both embedding arguments are None),
# the threshold is 0.7 and results go to the 'TF_IDF' folder.
# NOTE(review): presumably DocMapper vectorises the texts itself when the embeddings are
# None (the folder name suggests TF-IDF) — confirm against the class definition.
tfidf_mapper = DocMapper(
    doc1_elements_list=doc1_elements,
    doc2_elements_list=doc2_elements,
    doc1_elements_embedding=None,
    doc2_elements_embedding=None,
    threshold_=0.7,
    output_folder='TF_IDF',
)
tfidf_mapper.main()
del tfidf_mapper  # drop the reference after the run so no extra name lingers in the kernel

wrote the final output to local:  Sun Jun 23 14:28:38 2024


##### Vector Embedding using Word2Vec Models

In [None]:
# Word-embedding run: embed both documents first via text_to_vector.get_word_embedding,
# then map them with a 0.6 threshold; results go to the 'Word2Vec' folder.
word2vec_doc1_vectors = text_to_vector.get_word_embedding(
    texts=doc1_elements,
    model_name=word_embedding_model,
)
word2vec_doc2_vectors = text_to_vector.get_word_embedding(
    texts=doc2_elements,
    model_name=word_embedding_model,
)
DocMapper(
    doc1_elements_list=doc1_elements,
    doc2_elements_list=doc2_elements,
    doc1_elements_embedding=word2vec_doc1_vectors,
    doc2_elements_embedding=word2vec_doc2_vectors,
    threshold_=0.6,
    output_folder='Word2Vec',
).main()
del word2vec_doc1_vectors, word2vec_doc2_vectors  # release the temporaries once the run is done

wrote the final output to local:  Sun Jun 23 14:28:57 2024


##### Vector Embedding using LASER Embeddings

In [None]:
# LASER run: embed both documents first via text_to_vector.get_laser_embeddings,
# then map them with a 0.6 threshold; results go to the 'Laser' folder.
laser_doc1_vectors = text_to_vector.get_laser_embeddings(
    texts=doc1_elements,
    model_name=laser_embeddings,
)
laser_doc2_vectors = text_to_vector.get_laser_embeddings(
    texts=doc2_elements,
    model_name=laser_embeddings,
)
DocMapper(
    doc1_elements_list=doc1_elements,
    doc2_elements_list=doc2_elements,
    doc1_elements_embedding=laser_doc1_vectors,
    doc2_elements_embedding=laser_doc2_vectors,
    threshold_=0.6,
    output_folder='Laser',
).main()
del laser_doc1_vectors, laser_doc2_vectors  # release the temporaries once the run is done

wrote the final output to local:  Sun Jun 23 14:29:10 2024


##### Vector Embedding using spaCy Model

In [None]:
# spaCy run: embed both documents first via text_to_vector.get_spacy_embedding,
# then map them with a 0.6 threshold; results go to the 'spaCy' folder.
spacy_doc1_vectors = text_to_vector.get_spacy_embedding(
    texts=doc1_elements,
    model_name=nlp_spacy_model,
)
spacy_doc2_vectors = text_to_vector.get_spacy_embedding(
    texts=doc2_elements,
    model_name=nlp_spacy_model,
)
DocMapper(
    doc1_elements_list=doc1_elements,
    doc2_elements_list=doc2_elements,
    doc1_elements_embedding=spacy_doc1_vectors,
    doc2_elements_embedding=spacy_doc2_vectors,
    threshold_=0.6,
    output_folder='spaCy',
).main()
del spacy_doc1_vectors, spacy_doc2_vectors  # release the temporaries once the run is done

wrote the final output to local:  Sun Jun 23 14:29:21 2024


##### Vector Embedding using Sentence Transformer Model

In [None]:
# Sentence-Transformer run: embed both documents first via
# text_to_vector.get_sentence_transformers_embedding, then map them with a 0.6 threshold;
# results go to the 'Sentence_Transformer' folder.
sbert_doc1_vectors = text_to_vector.get_sentence_transformers_embedding(
    texts=doc1_elements,
    model_name=sentence_transformer_model,
)
sbert_doc2_vectors = text_to_vector.get_sentence_transformers_embedding(
    texts=doc2_elements,
    model_name=sentence_transformer_model,
)
DocMapper(
    doc1_elements_list=doc1_elements,
    doc2_elements_list=doc2_elements,
    doc1_elements_embedding=sbert_doc1_vectors,
    doc2_elements_embedding=sbert_doc2_vectors,
    threshold_=0.6,
    output_folder='Sentence_Transformer',
).main()
del sbert_doc1_vectors, sbert_doc2_vectors  # release the temporaries once the run is done

wrote the final output to local:  Sun Jun 23 14:29:24 2024


##### Vector Embedding using Pre-Trained Model

In [None]:
# Pre-trained model run: embed both documents first via
# text_to_vector.get_pre_trained_models_embedding (model + tokenizer passed explicitly),
# then map them with a 0.6 threshold; results go to the 'BERT' folder.
# NOTE(review): custom_max_length=256 is presumably the tokenizer's maximum sequence
# length — confirm in text_to_vector.
bert_doc1_vectors = text_to_vector.get_pre_trained_models_embedding(
    texts=doc1_elements,
    model_name=hf_model,
    model_tokenizer=hf_tokenizer,
    custom_max_length=256,
)
bert_doc2_vectors = text_to_vector.get_pre_trained_models_embedding(
    texts=doc2_elements,
    model_name=hf_model,
    model_tokenizer=hf_tokenizer,
    custom_max_length=256,
)
DocMapper(
    doc1_elements_list=doc1_elements,
    doc2_elements_list=doc2_elements,
    doc1_elements_embedding=bert_doc1_vectors,
    doc2_elements_embedding=bert_doc2_vectors,
    threshold_=0.6,
    output_folder='BERT',
).main()
del bert_doc1_vectors, bert_doc2_vectors  # release the temporaries once the run is done

wrote the final output to local:  Sun Jun 23 14:30:03 2024


### Cleaning RAM

In [None]:
# Call the clean-up helper defined outside this cell (presumably it frees kernel memory,
# going by its name and this section's heading — see its definition to confirm).
custom_ram_cleanup_func()
# Unbind the helper itself afterwards.
# NOTE(review): this makes the cell single-use — running it again without first
# re-running the cell that defines the helper raises NameError.
del custom_ram_cleanup_func

### Dense NumPy Array to CSR Sparse Matrix (Conceptual)

In [17]:
import numpy as np
from scipy.sparse import csr_matrix

# Small, mostly-zero dense array used to illustrate the CSR (Compressed Sparse Row) layout.
zero_data_matrix:np.ndarray = np.array(
    [
        [0, 0, 3, 0, 0],
        [4, 0, 0, 0, 0],
        [0, 0, 0, 7, 0],
        [1, 0, 0, 0, 0],
        [0, 2, 0, 3, 9],
    ]
)

# Convert the dense array to CSR: only the non-zero values are stored, together with
# their column indices (`indices`) and per-row start offsets (`indptr`).
scipy_csr_matrix:csr_matrix = csr_matrix(zero_data_matrix)
print("="*50,"\nOriginal Matrix:\n",zero_data_matrix)
print("="*50,"\nCSR Matrix:\n",scipy_csr_matrix)
print("="*50,"\nCSR Matrix Data (non-zero elements):\n",scipy_csr_matrix.data)
print("="*50,"\nCSR Matrix Indices (column indices of non-zero elements):\n",scipy_csr_matrix.indices)
print("="*50,"\nCSR Matrix Indptr (index pointers to CSR row starts):\n",scipy_csr_matrix.indptr)
# The `.A` / `.H` shorthand attributes were deprecated in SciPy 1.11 and removed from
# sparse matrices in newer releases (1.14+), where they raise AttributeError.
# `.toarray()` and `.conjugate().transpose()` are the equivalent spellings that work on
# every SciPy version; the printed labels are kept unchanged.
print("="*50,"\nA = ",f"{scipy_csr_matrix.toarray()}")
print("="*50,"\nH = ",f"{scipy_csr_matrix.conjugate().transpose()}")
print("="*50,"\nT = ",f"{scipy_csr_matrix.T}")
print("="*50,"\ngetH = ",f"{scipy_csr_matrix.getH()}")
print("="*50,"\nget_shape = ",f"{scipy_csr_matrix.get_shape()}")
print("="*50,"\ngetcol = ",f"{scipy_csr_matrix.getcol(0)}")
print("="*50,"\ngetformat = ",f"{scipy_csr_matrix.getformat()}")
print("="*50,"\ngetmaxprint = ",f"{scipy_csr_matrix.getmaxprint()}")
print("="*50,"\ngetnnz = ",f"{scipy_csr_matrix.getnnz()}")
print("="*50,"\ngetrow = ",f"{scipy_csr_matrix.getrow(0)}")

Original Matrix:
 [[0 0 3 0 0]
 [4 0 0 0 0]
 [0 0 0 7 0]
 [1 0 0 0 0]
 [0 2 0 3 9]]
CSR Matrix:
   (0, 2)	3
  (1, 0)	4
  (2, 3)	7
  (3, 0)	1
  (4, 1)	2
  (4, 3)	3
  (4, 4)	9
CSR Matrix Data (non-zero elements):
 [3 4 7 1 2 3 9]
CSR Matrix Indices (column indices of non-zero elements):
 [2 0 3 0 1 3 4]
CSR Matrix Indptr (index pointers to CSR row starts):
 [0 1 2 3 4 7]
A =  [[0 0 3 0 0]
 [4 0 0 0 0]
 [0 0 0 7 0]
 [1 0 0 0 0]
 [0 2 0 3 9]]
H =    (2, 0)	3
  (0, 1)	4
  (3, 2)	7
  (0, 3)	1
  (1, 4)	2
  (3, 4)	3
  (4, 4)	9
T =    (2, 0)	3
  (0, 1)	4
  (3, 2)	7
  (0, 3)	1
  (1, 4)	2
  (3, 4)	3
  (4, 4)	9
getH =    (2, 0)	3
  (0, 1)	4
  (3, 2)	7
  (0, 3)	1
  (1, 4)	2
  (3, 4)	3
  (4, 4)	9
get_shape =  (5, 5)
getcol =    (1, 0)	4
  (3, 0)	1
getformat =  csr
getmaxprint =  50
getnnz =  7
getrow =    (0, 2)	3
