### Installing Dependency Modules

In [None]:
!pip install sentence-transformers spacy-transformers nltk laserembeddings

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (313 kB)
[2K

In [None]:
# Download the model files used by the `laserembeddings` package (run once per environment).
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.10/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [None]:
# Download the small English spaCy pipeline loaded later via spacy.load("en_core_web_sm").
# The commented-out lines are the larger English pipelines, kept as alternatives.
!python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_trf

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Loading Libraries

In [None]:
from typing import List,Dict
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# For Hugging Face Model Fetcher Class
import json
import requests
from bs4 import BeautifulSoup

# For Pre-Trained Model
import torch
from transformers import AutoTokenizer,AutoModel
from sentence_transformers import SentenceTransformer

# For Laser Embedding Model
from laserembeddings import Laser

# For Word2Vec Model
import gensim.downloader as api

# For spaCy Model
import spacy

# Embedding using Traditional Methods
from sklearn.decomposition import TruncatedSVD,PCA,KernelPCA,SparsePCA,MiniBatchSparsePCA,NMF,MiniBatchNMF,FactorAnalysis,FastICA
from sklearn.random_projection import GaussianRandomProjection,SparseRandomProjection
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.manifold import LocallyLinearEmbedding,Isomap
from sklearn.pipeline import Pipeline

# For Text Processing
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer data
# and the 'stopwords' corpus.
nltk.download(['punkt', 'stopwords'])
# English stop words materialised once as a set for fast membership checks.
nltk_stopwords_set = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Creating `requirements.txt`

In [None]:
!pip freeze

absl-py==1.4.0
aiohttp==3.9.5
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.15.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.8.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.6.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.18.1
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.3
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==42.0.8
cuda-python==12.2.1
cudf-cu12 @ https://pypi.nvidia.c

### Loading Laser Embedding

In [None]:
# Instantiate the LASER encoder with its default arguments; it is used below
# through `laser_embeddings.embed_sentences(...)`.
laser_embeddings = Laser()

### Loading [spaCy](https://spacy.io/models) Model

In [None]:
# Load the small English spaCy pipeline, enabling only its `tok2vec` component.
nlp_spacy_model = spacy.load("en_core_web_sm",enable = ["tok2vec"])

### Loading [gensim](https://radimrehurek.com/gensim/models/word2vec.html) Model

In [None]:
# Load the "glove-wiki-gigaword-50" word vectors through gensim's downloader API
# (presumably 50-dimensional, going by the model name - confirm via `.vector_size`).
word_embedding_model = api.load("glove-wiki-gigaword-50")



### Loading [Sentence Transformer](https://huggingface.co/sentence-transformers) Model

In [None]:
# Load the 'all-MiniLM-L6-v2' Sentence Transformers checkpoint.
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Loading a [Pre-Trained](https://huggingface.co/models) Model

In [None]:
# Name of the Hugging Face checkpoint; the tokenizer and the model are both
# loaded from this same identifier so that they match.
hf_model_name:str = "xlnet-base-cased"
hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
hf_model = AutoModel.from_pretrained(hf_model_name)

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

### Case Sensitivity in Embedding Model: 'A' vs 'a'

In [None]:
# Some of the characters are not present in this model
for i in [65,70,80,90,95,105,110,120]:
  stxt:str = chr(i)
  if stxt in word_embedding_model:
    # print(f'TRUE {i} -> {stxt}')
    continue
  else:
    print(f'FALSE {i} -> {stxt}')

FALSE 65 -> A
FALSE 70 -> F
FALSE 80 -> P
FALSE 90 -> Z


In [None]:
# Vocabulary lookups are case-sensitive: compare upper- and lower-case 'a'.
print(('A' in word_embedding_model),('a' in word_embedding_model))

False True


In [None]:
# Report the sample characters for which spaCy does not return a 96-dimensional vector.
for i in (65,70,80,90,95,105,110,120):
  stxt = chr(i)
  if nlp_spacy_model(stxt).vector.shape[0] != 96:
    print(f'FALSE {i} -> {stxt}')

In [None]:
# Report the sample characters for which LASER does not return a 1024-dimensional vector.
for i in (65,70,80,90,95,105,110,120):
  stxt = chr(i)
  if laser_embeddings.embed_sentences([stxt],lang = 'en').shape[1] != 1024:
    print(f'FALSE {i} -> {stxt}')

### Custom Function to clear RAM

In [None]:
def custom_ram_cleanup_func()->None:
    """
    Delete user-defined global variables so the memory they hold can be reclaimed.

    Every name in the global namespace of the module that defines this function
    is removed unless it is:
    - bound to a module object (e.g. aliases such as ``np`` or ``pd``), so that
      imports keep working after the cleanup;
    - spelled like a key of ``sys.modules`` (original rule, kept for compatibility);
    - one of the IPython / interpreter bookkeeping names listed below
      (``In``, ``Out``, ``get_ipython``, ...), ``sys`` or ``os``;
    - prefixed with an underscore ('_');
    - ``custom_ram_cleanup_func`` itself.

    Note that classes, functions and model objects defined in the notebook are
    NOT protected: they are deleted too and must be re-created afterwards.

    Returns:
    None
    """

    import gc
    import sys
    import types

    # A set gives O(1) membership checks; sys.modules can hold thousands of keys.
    protected_names = set(sys.modules.keys())
    protected_names.update(['In','Out','_','__','___','__builtin__','__builtins__','__doc__','__loader__','__name__','__package__','__spec__','_dh','_i','_i1','_ih','_ii','_iii','_oh','exit','get_ipython','quit','sys','os','custom_ram_cleanup_func',])

    namespace = globals()
    # Iterate over a snapshot of the names because the namespace is mutated in the loop.
    for var in list(namespace.keys()):
        if var.startswith('_') or var in protected_names:
            continue
        # Import aliases (`np`, `pd`, ...) never match a sys.modules key by name,
        # so modules are recognised by the type of the bound object instead.
        if isinstance(namespace[var],types.ModuleType):
            continue
        del namespace[var]

    # Also release objects that were only kept alive by reference cycles.
    gc.collect()
    return None


### Defining Hugging Face Model Fetcher

The Hugging Face Model Fetching and Sorting Class provides a streamlined way to explore and access a wide array of pre-trained models available on the Hugging Face Model Hub.

- **`Fetch Models`**: Retrieve every model listed on a given organization or user page, together with its pipeline tag so that the results can be filtered by task type.
  
- **`Sort by Popularity`**: Automatically sort models based on community ratings, including likes and downloads, to prioritize widely recognized and well-received models.

- **`Flexible Integration`**: Seamlessly integrates with your Python projects or Jupyter notebooks, offering direct access to model details and configurations without the need for a manual search.


**Widely Used Hugging Face Models**:

- [OpenAI Community](https://huggingface.co/openai-community) - `https://huggingface.co/openai-community`
- [Google BERT](https://huggingface.co/google-bert) - `https://huggingface.co/google-bert`
- [Facebook AI](https://huggingface.co/FacebookAI) - `https://huggingface.co/FacebookAI`
- [DistilBERT](https://huggingface.co/distilbert) - `https://huggingface.co/distilbert`
- [ALBERT](https://huggingface.co/albert) - `https://huggingface.co/albert`
- [Google](https://huggingface.co/google) - `https://huggingface.co/google`
- [Microsoft](https://huggingface.co/microsoft) - `https://huggingface.co/microsoft`
- [AllenAI](https://huggingface.co/allenai) - `https://huggingface.co/allenai`
- [XLNet](https://huggingface.co/xlnet) - `https://huggingface.co/xlnet`
- [Flair](https://huggingface.co/flair) - `https://huggingface.co/flair`
- [Nguyễn Xuân Huy](https://huggingface.co/nghuyong) - `https://huggingface.co/nghuyong`
- [Sentence Transformers](https://huggingface.co/sentence-transformers) - `https://huggingface.co/sentence-transformers`


In [None]:
class HuggingFaceModelFetcher():
    """
    A class to fetch model details from Hugging Face URLs and return as a Pandas DataFrame.

    The page behind `url_to_parse` is downloaded, the JSON stored in the
    ``data-props`` attribute of its ``div`` elements is decoded, and every entry
    found under a payload's ``repos`` key is tabulated.

    Attributes:
    - url_to_parse (str): URL to parse for model details.
    - close_time (int), default = 10: Timeout duration in seconds for the HTTP request.

    Methods:
    - fetch_model_details(): Fetches model details from the specified URL.
    - show_help(): Prints helpful notes and links related to various embeddings and models.
    """

    def __init__(self,url_to_parse:str,close_time:int = 10):
        """
        Initialize the HuggingFaceModelFetcher with the URL and timeout duration.

        Args:
        - url_to_parse (str): URL to parse for model details.
        - close_time (int, optional), default = 10: Timeout duration in seconds for the HTTP request. Default is 10 seconds.
        """
        self.url_to_parse = url_to_parse
        self.close_time = close_time

    def __repr__(self):
        # Show the configuration so that two fetchers can be told apart when printed.
        return f"HuggingFaceModelFetcher(url_to_parse={self.url_to_parse!r},close_time={self.close_time!r})"

    def __str__(self):
        return "Class to fetch huggingface models and sort based on downloads and likes."

    def show_help(self)->None:
        """
        Prints helpful notes and links related to various embeddings and models.

        The text is printed directly; the method itself returns None, so call it
        without wrapping it in `print(...)`.
        """
        help_content = """
        Notes:

        Gensim API Models
        # https://radimrehurek.com/gensim/models/word2vec.html

        ERNIE Embedding
        https://huggingface.co/docs/transformers/en/model_doc/ernie

        FLAIR Embedding
        https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings

        LASER Embedding
        https://pypi.org/project/laserembeddings/

        HuggingFace Models
        https://huggingface.co/openai-community
        https://huggingface.co/google-bert
        https://huggingface.co/FacebookAI
        https://huggingface.co/distilbert
        https://huggingface.co/albert
        https://huggingface.co/google
        https://huggingface.co/microsoft
        https://huggingface.co/allenai
        https://huggingface.co/xlnet
        https://huggingface.co/flair
        https://huggingface.co/nghuyong
        https://huggingface.co/sentence-transformers
        """

        print(help_content.strip())
        return None

    def get_model_details(self,parsed_response:Dict)->pd.DataFrame:
        """
        Extracts model details from parsed JSON response and returns a Pandas DataFrame.

        Entries without an 'id' or without any downloads are dropped. Missing or
        null 'likes' / 'downloads' values are treated as 0.

        Args:
        - parsed_response (dict): Parsed JSON response containing model information
          under its 'repos' key.

        Returns:
        - pd.DataFrame: DataFrame with columns Model_Name,Likes,Downloads,and Pipeline_Tag,
          sorted by Downloads and then Likes (both descending).
        """
        model_records = [
            {
                'Model_Name':item.get('id','NAME NOT FOUND'),
                # `or 0` also covers an explicit null, which int() cannot convert.
                'Likes':int(item.get('likes') or 0),
                'Downloads':int(item.get('downloads') or 0),
                'Pipeline_Tag':item.get('pipeline_tag',''),
            }
            for item in (parsed_response.get('repos') or [])
        ]

        # Passing `columns` keeps the schema stable even when there are no records.
        model_df = pd.DataFrame(model_records,columns=['Model_Name','Likes','Downloads','Pipeline_Tag'])

        keep_rows = (model_df['Downloads'] > 0) & (model_df['Model_Name'] != 'NAME NOT FOUND')
        model_df = model_df[keep_rows]
        model_df = model_df.sort_values(by=['Downloads','Likes'],ascending=[False,False])

        return model_df

    def get_model_information(self,parsed_response:"BeautifulSoup")->pd.DataFrame:
        """
        Parses BeautifulSoup object to extract model details.

        Each matching ``div`` is handled independently: a payload that cannot be
        decoded or tabulated is reported and skipped, and the remaining ones are
        still processed.

        Args:
        - parsed_response (BeautifulSoup): Parsed HTML content.

        Returns:
        - pd.DataFrame: DataFrame with aggregated model details (empty when nothing was found).
          Rows are sorted within each payload; the original row indexes are kept.
        """
        try:
            div_elements = parsed_response.find_all('div',class_='SVELTE_HYDRATER contents')
        except Exception as e:
            print(f'Error in get_model_information:{e}')
            div_elements = []

        model_frames = []

        for div_element in div_elements:
            data_props = div_element.get('data-props')

            if not data_props:
                continue

            try:
                props_dict = json.loads(data_props)
            except (TypeError,ValueError) as e:
                # json.JSONDecodeError is a ValueError; skip only this element.
                print(f'Error in get_model_information:{e}')
                continue

            if not isinstance(props_dict,dict) or not props_dict.get('repos'):
                continue

            try:
                model_frames.append(self.get_model_details(parsed_response=props_dict))
            except Exception as e:
                print(f'Error in get_model_information:{e}')

        # pd.concat raises on an empty list, so handle the "nothing found" case explicitly.
        if not model_frames:
            return pd.DataFrame()

        # Concatenate once instead of growing a DataFrame inside the loop.
        return pd.concat(model_frames)

    def fetch_model_details(self)->pd.DataFrame:
        """
        Fetches model details from the specified URL and returns as a Pandas DataFrame.

        Network problems and non-200 responses are reported with a printed message
        and result in an empty DataFrame instead of an exception.

        Returns:
        - pd.DataFrame: DataFrame with aggregated model details (Model_Name, Likes, Downloads, Pipeline_Tag).
        """
        try:
            url_response = requests.get(url=self.url_to_parse,timeout=self.close_time)

            if url_response.status_code == 200:
                parsed_response = BeautifulSoup(url_response.content,'html.parser')
                return self.get_model_information(parsed_response)

            else:
                print(f'Failed to retrieve content. Status code:{url_response.status_code}')
                return pd.DataFrame()

        except requests.exceptions.Timeout:
            print(f'Timeout error:The request timed out after {self.close_time} seconds.')
            return pd.DataFrame()

        except requests.exceptions.RequestException as e:
            print(f'Request error:{e}')
            return pd.DataFrame()


### Example Usage for Hugging Face Model Fetcher

In [None]:
# Point the fetcher at the AllenAI organization page; the HTTP request uses a 10-second timeout.
hf_model_fetcher = HuggingFaceModelFetcher(url_to_parse='https://huggingface.co/allenai',close_time=10)

### Fetch widely used Hugging Face Models

In [None]:
# show_help() prints the notes itself and returns None, so it must not be wrapped in print().
hf_model_fetcher.show_help()

Notes:

        Gensim API Models
        # https://radimrehurek.com/gensim/models/word2vec.html

        ERNIE Embedding
        https://huggingface.co/docs/transformers/en/model_doc/ernie

        FLAIR Embedding
        https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings

        LASER Embedding
        https://pypi.org/project/laserembeddings/

        HuggingFace Models
        https://huggingface.co/openai-community
        https://huggingface.co/google-bert
        https://huggingface.co/FacebookAI
        https://huggingface.co/distilbert
        https://huggingface.co/albert
        https://huggingface.co/google
        https://huggingface.co/microsoft
        https://huggingface.co/allenai
        https://huggingface.co/xlnet
        https://huggingface.co/flair
        https://huggingface.co/nghuyong
        https://huggingface.co/sentence-transformers
None


### Fetch Models based on supplied URL

In [None]:
# Download and parse the page; an empty DataFrame is returned if the request fails.
model_results_df = hf_model_fetcher.fetch_model_details()

In [None]:
# Display the fetched models (sorted by downloads, then likes).
model_results_df

Unnamed: 0,Model_Name,Likes,Downloads,Pipeline_Tag
163,allenai/longformer-base-4096,148,5081078,
245,allenai/scibert_scivocab_uncased,106,2227262,
213,allenai/unifiedqa-v2-t5-large-1363200,3,157719,text2text-generation
227,allenai/t5-small-squad2-question-generation,38,68319,text2text-generation
54,allenai/specter2_aug2023refresh_base,2,40287,feature-extraction
...,...,...,...,...
102,allenai/ms2_dense_max,0,2,
7,allenai/SciRIFF-train-mix,3,1,
81,allenai/multixscience_sparse_oracle,1,1,
9,allenai/coconot,0,1,


### Defining Text to Vector Embedding

The **`Text to Vector Embedding`** class provides a versatile framework for generating embeddings from text data using various methods and models, facilitating optimal performance for various natural language processing tasks. Whether you're exploring semantic similarities, document classification, or recommendation systems, this class serves as a powerful asset for embedding text data effectively. This class supports multiple approaches including:

- **`Traditional Method`**: Uses scikit-learn vectorizers such as TF-IDF (Term Frequency - Inverse Document Frequency) combined with a dimensionality-reduction step such as Principal Component Analysis (PCA).
  
- **`Word Embedding Models`**: Integrates with Gensim's API to leverage pre-trained word embedding models for transforming text into dense vector representations.
  
- **`Sentence Transformers`**: Incorporates Hugging Face's Sentence Transformers for creating embeddings that capture semantic meanings and context from sentences.
  
- **`Tokenization and Embedding`**: Uses Hugging Face's AutoTokenizer to tokenize text and generate embeddings based on transformer models.

- **`Multi-Model Flexibility`**: Enables users to apply multiple embedding methods and models to the same text data, facilitating comparative analysis to determine the most effective embedding for their specific use case.
  
- **`Ease of Integration`**: Seamless integration with popular NLP libraries and APIs makes it straightforward to implement and experiment with different embedding techniques.
  
- **`Performance Evaluation`**: Empowers users to evaluate and compare embeddings generated by different models, optimizing performance for downstream NLP tasks such as classification, clustering, or information retrieval.

---

##### Example of Vector Embedding

Vector embedding turns words into lists of numbers. Imagine:

- **Word 1**: "cat" → `[0.5, 0.2, -0.1]`
- **Word 2**: "dog" → `[0.3, 0.6, 0.2]`
- **Word 3**: "rabbit" → `[-0.2, 0.4, -0.3]`

Each word is represented by a list of numbers (vectors). These numbers capture meanings like how similar words are or their relationships. Algorithms use these vectors to understand and compare words, making it easier to work with text in computer programs.

Vector embedding helps computers handle language tasks like finding similar words, classifying text, and understanding context. It's like giving words a numerical "fingerprint" that computers can analyze and use for different purposes in language processing.

Imagine words as people at a party. Each person (word) has unique traits (meanings). Word embedding gives each person a nametag with numbers (vectors) that describe their traits. Computers use these nametags to understand who's similar, who's related, and how they interact to understand language better!


---

##### Resources for Understanding Vector Embedding

- [The Illustrated Word2Vec](https://jalammar.github.io/illustrated-word2vec/) - `https://jalammar.github.io/illustrated-word2vec/`

In [None]:
class TextEmbedding():
    """
    Helpers that turn text into numeric vectors using several back ends.

    Methods:
    - get_sklearn_embedding(): bag-of-words vectorizer + dimensionality reduction (scikit-learn).
    - processing_word_embedding() / get_word_embedding(): averaged word vectors from a
      word-embedding model that supports `word in model` and `model[word]`.
    - get_laser_embeddings(): LASER sentence embeddings.
    - get_spacy_embedding(): spaCy document vectors.
    - get_sentence_transformers_embedding(): Sentence Transformers embeddings.
    - get_pre_trained_models_embedding(): mean-pooled transformer hidden states.
    - remove_punctuation_strings() / pre_processing_text(): text clean-up utilities.
    """

    # Cache for the NLTK English stop words; filled on first use by `pre_processing_text`.
    _default_stop_words = None

    def __init__(self):
        pass

    def __repr__(self):
        return "TextEmbedding()"

    def __str__(self):
        return "Class to embed text using various methods."

    def get_sklearn_embedding(self,texts:List[str] = [],custom_max_features:int = 5_000,custom_dtype:np.dtype = np.float64,is_lower:bool = True,use_stop_words:bool = True,custom_ngram_range:tuple = (1,1),vectorizer:str = 'tfidf',reduction_method:str = 'svd',embed_size:int = 50)->np.ndarray:
        """
        Convert a list of texts into embeddings using a selected vectorization method and dimensionality reduction method.

        Args:
        - texts (list): List of strings representing documents.
        - custom_max_features (int), default = 5000: Maximum number of features (terms) to be used in vectorization.
        - custom_dtype (np.dtype), default = np.float64: Data type of the matrix created by vectorization.
        - is_lower (bool), default = True: Convert text to lowercase.
        - use_stop_words (bool), default = True: Whether to remove English stop words during vectorization.
        - custom_ngram_range (tuple), default = (1,1): Range for n-grams to be extracted.
          For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
        - vectorizer (str), default = 'tfidf': Vectorization method to use.
          Options: 'tfidf','count','binary'.
        - reduction_method (str), default = 'svd': Dimensionality reduction method to use.
          Options: 'svd','pca','kernel_pca','sparse_pca','mini_batch_sparse_pca','nmf','mini_batch_nmf',
          'factor_analysis','fast_ica','isomap','locally_linear_embedding','gaussian_random_projection',
          'sparse_random_projection'.
        - embed_size (int), default = 50: Number of components to keep in the reduced embedding space.

        Returns:
        - np.ndarray: Embeddings of the input texts.

        Raises:
        - ValueError: If `vectorizer` or `reduction_method` is not one of the listed options.
        - RuntimeError: If fitting / transforming the pipeline fails (the original error is chained).
        """

        # Options shared by every vectorizer; `stop_words = None` is the scikit-learn default.
        shared_options = dict(decode_error = 'strict',lowercase = is_lower,max_features = custom_max_features,dtype = custom_dtype,ngram_range = custom_ngram_range,stop_words = 'english' if use_stop_words else None)

        if vectorizer == 'tfidf':
            vectorizer_step = TfidfVectorizer(use_idf = True,smooth_idf = True,binary = False,**shared_options)
        elif vectorizer == 'count':
            vectorizer_step = CountVectorizer(binary = False,**shared_options)
        elif vectorizer == 'binary':
            vectorizer_step = CountVectorizer(binary = True,**shared_options)
        else:
            raise ValueError("Vectorizer must be 'tfidf', 'count' or 'binary'")

        # Name -> estimator class; every estimator is built with `n_components = embed_size`.
        reducer_classes = {
            'svd':TruncatedSVD,
            'pca':PCA,
            'kernel_pca':KernelPCA,
            'sparse_pca':SparsePCA,
            'mini_batch_sparse_pca':MiniBatchSparsePCA,
            'nmf':NMF,
            'mini_batch_nmf':MiniBatchNMF,
            'factor_analysis':FactorAnalysis,
            'fast_ica':FastICA,
            'isomap':Isomap,
            'locally_linear_embedding':LocallyLinearEmbedding,
            'gaussian_random_projection':GaussianRandomProjection,
            'sparse_random_projection':SparseRandomProjection,
        }
        reducer_class = reducer_classes.get(reduction_method) if isinstance(reduction_method,str) else None
        if reducer_class is None:
            raise ValueError("Reduction Method must be 'svd', 'pca', 'kernel_pca', 'sparse_pca', 'mini_batch_sparse_pca', 'nmf', 'mini_batch_nmf', 'factor_analysis', 'fast_ica', 'isomap', 'locally_linear_embedding', 'gaussian_random_projection' or 'sparse_random_projection'")

        custom_pipeline = Pipeline([('vectorizer',vectorizer_step),('reduction',reducer_class(n_components = embed_size))])

        try:
            sklearn_embeddings = custom_pipeline.fit_transform(texts)
        except Exception as e:
            raise RuntimeError(f"Pipeline not properly fitted. {e}") from e

        return sklearn_embeddings

    def _lookup_word_vector(self,word:str,model_):
        """
        Return the vector for `word`, or the mean of its characters' vectors, or None.

        A character missing from the model is retried with its case swapped, so
        'A' can fall back to 'a' and vice versa. None is returned when neither the
        word nor any of its characters is known to the model.
        """
        if word in model_:
            return model_[word]

        char_vectors:list = []
        for char in word:
            for candidate in (char,char.swapcase()):
                if candidate in model_:
                    char_vectors.append(model_[candidate])
                    break

        if not char_vectors:
            return None
        return np.mean(np.array(char_vectors),axis = 0)

    def processing_word_embedding(self,stxt:str = '',model_ = None)->np.ndarray:
        """
        Retrieve the word embedding vector for a given word or calculate an average embedding for out-of-vocabulary words.

        Args:
        - stxt (str): Input word or character.
        - model_: Pre-trained word embedding model supporting `word in model_` and `model_[word]`.

        Returns:
        - np.ndarray: Embedding of the input word. When nothing can be looked up, a zero
          vector of length `model_.vector_size` is returned; if the model does not expose
          `vector_size`, NaN is returned (the previous behaviour).
        """

        word_vector = self._lookup_word_vector(stxt,model_)
        if word_vector is not None:
            return word_vector

        vector_size = getattr(model_,'vector_size',None)
        if vector_size is None:
            return np.float64(np.nan)
        return np.zeros(vector_size)

    def get_word_embedding(self,texts:List[str] = [],model_name = None)->np.ndarray:
        """
        Compute word embeddings using a pre-trained word embedding model.

        Each document is tokenized with `word_tokenize` and represented by the mean of
        its token vectors. Tokens that cannot be embedded are skipped, and a document
        without any embeddable token becomes a zero row, so that all rows have the
        same length.

        Args:
        - texts (list): List of strings representing documents.
        - model_name: Pre-trained word embedding model.

        Returns:
        - np.ndarray: Embeddings of the input texts.
        """

        document_vectors:list = []
        for doc in texts:
            token_vectors = [self._lookup_word_vector(word,model_name) for word in word_tokenize(doc)]
            token_vectors = [vector for vector in token_vectors if vector is not None]
            document_vectors.append(np.mean(token_vectors,axis = 0) if token_vectors else None)

        # Size the zero row after a real document vector when one exists,
        # otherwise after the model's declared dimensionality (if any).
        template_vector = next((vector for vector in document_vectors if vector is not None),None)
        if template_vector is not None:
            zero_vector = np.zeros_like(template_vector)
        else:
            zero_vector = np.zeros(getattr(model_name,'vector_size',0) or 0)

        return np.array([zero_vector if vector is None else vector for vector in document_vectors])

    def get_laser_embeddings(self,texts:List[str] = [],laser_model = None)->np.ndarray:
        """
        Compute LASER (Language-Agnostic SEntence Representations) embeddings for a list of texts.

        Args:
        - texts (list): List of strings representing documents.
        - laser_model, default = None: Object exposing `embed_sentences(texts,lang = ...)`.
          When omitted, the notebook-level `laser_embeddings` instance is used.

        Returns:
        - np.ndarray: LASER Embeddings of the input texts.
        """

        encoder = laser_embeddings if laser_model is None else laser_model
        return encoder.embed_sentences(texts,lang = 'en')

    def get_spacy_embedding(self,texts:List[str] = [],nlp_spacy = None)->np.ndarray:
        """
        Compute spaCy word embeddings for a list of texts.

        Args:
        - texts (list): List of strings representing documents.
        - nlp_spacy: Loaded spaCy pipeline; it is called on each document and the
          resulting `.vector` is collected.

        Returns:
        - np.ndarray: spaCy embeddings of the input texts.
        """

        return np.array([nlp_spacy(doc).vector for doc in texts])

    def get_sentence_transformers_embedding(self,texts:List[str] = [],model_name = None)->np.ndarray:
        """
        Compute Sentence Transformers embeddings for a list of texts.

        Args:
        - texts (list): List of strings representing documents.
        - model_name, default = None: Object exposing `encode(texts)`.
          When omitted, the notebook-level `sentence_transformer_model` is used.

        Returns:
        - np.ndarray: Sentence Transformers embeddings of the input texts.
        """

        encoder = sentence_transformer_model if model_name is None else model_name
        return encoder.encode(texts)

    def get_pre_trained_models_embedding(self,texts:List[str] = [],model_name = None,model_tokenizer = None,custom_max_length:int = 256)->np.ndarray:
        """
        Compute embeddings using a pre-trained transformer model (e.g.,BERT) and tokenizer.

        The embedding of a text is the mean of its token vectors in `last_hidden_state`.
        Padding positions (attention mask 0) are excluded from the mean, so the result
        for a text does not depend on the other texts in the batch.

        Args:
        - texts (list): List of strings representing documents.
        - model_name: Pre-trained transformer model instance.
        - model_tokenizer: Pre-trained tokenizer instance.
        - custom_max_length (int), default = 256: Maximum Length before truncation.

        Returns:
        - np.ndarray: Embeddings of the input texts using the pre-trained model.
        """

        model_encoded_inputs = model_tokenizer(texts,padding = True,truncation = True,max_length = custom_max_length,return_tensors = "pt")
        with torch.no_grad():
            model_encoded_outputs = model_name(**model_encoded_inputs)
        token_embeddings = model_encoded_outputs.last_hidden_state

        if 'attention_mask' in model_encoded_inputs:
            # Add a trailing axis to the mask so it broadcasts over the hidden dimension.
            padding_mask = model_encoded_inputs['attention_mask'].unsqueeze(-1).to(device = token_embeddings.device,dtype = token_embeddings.dtype)
            token_counts = padding_mask.sum(dim = 1).clamp(min = 1.0)  # avoid division by zero
            model_embeddings = (token_embeddings * padding_mask).sum(dim = 1) / token_counts
        else:
            # No mask available: fall back to the plain mean over all positions.
            model_embeddings = token_embeddings.mean(dim = 1)

        # .cpu() makes the conversion work when the model runs on an accelerator.
        return model_embeddings.detach().cpu().numpy()

    def remove_punctuation_strings(self,txt:str = '')->str:
        """
        Remove punctuation characters from a given string.

        Args:
        - txt (str): Input text to remove punctuation from.

        Returns:
        - str: Text with punctuation characters removed.
        """

        return txt.translate(str.maketrans('','',string.punctuation))

    def _get_default_stop_words(self)->set:
        """
        Return the NLTK English stop words, loading them from the corpus only once.
        """
        if TextEmbedding._default_stop_words is None:
            TextEmbedding._default_stop_words = set(stopwords.words('english'))
        return TextEmbedding._default_stop_words

    def pre_processing_text(self,txt:str = '',is_lower:bool = True,stop_words_set:set = None,remove_stop_words:bool = True)->str:
        """
        Preprocesses the input text based on the specified options.

        The text is tokenized with `word_tokenize`; only alphanumeric tokens are kept.

        Args:
        - txt (str): Input text to preprocess.
        - is_lower (bool), default = True: Flag indicating whether to convert text to lowercase.
        - stop_words_set (set), default = None: Set of stop words to remove from text.
          None means the NLTK English stop words, which are loaded on first use
          rather than when the class is defined.
        - remove_stop_words (bool), default = True: Flag indicating whether to remove stop words.

        Returns:
        - str: Processed text after tokenization and preprocessing steps.
        """

        if is_lower:
            txt:str = str(txt).lower().strip()
        else:
            txt:str = str(txt).strip()

        kept_tokens = [x for x in word_tokenize(txt) if x.isalnum()]

        if remove_stop_words:
            if stop_words_set is None:
                stop_words_set = self._get_default_stop_words()
            kept_tokens = [x for x in kept_tokens if x not in stop_words_set]

        return ' '.join(kept_tokens)


In [None]:
text_to_vector = TextEmbedding()

### Loading a Test Dataset

In [None]:
# Read every column as text so that no value is coerced to a numeric dtype.
test_file = pd.read_csv('Test1.csv', dtype='str', encoding='utf-8')
print(test_file.shape)

(12, 1)


In [None]:
test_file.head()

Unnamed: 0,Title
0,NATIONAL SAFETY APPAREL Aluminized Chaps: OPF/...
1,NATIONAL SAFETY APPAREL Aluminized Chaps: OPF/...
2,STEEL GRIP Pbi/Rayon Half Face Open Hood
3,"QUANTUM Wire Shelf: 36 in x 30 in, 1 Shelves, ..."
4,"QUANTUM Wire Shelf: 36 in x 30 in, 1 Shelves, ..."


In [None]:
# Use the first title as the single-document example for the cells below.
test_sample = test_file['Title'].iloc[0]
print(f'{test_sample = }')

test_sample = 'NATIONAL SAFETY APPAREL Aluminized Chaps: OPF/Para-Aramid, Universal, 40 in Overall Lg'


### Removing Punctuation & Pre-Processing Text

In [None]:
print(text_to_vector.remove_punctuation_strings.__doc__)


        Remove punctuation characters from a given string.

        Args:
        - txt (str): Input text to remove punctuation from.

        Returns:
        - str: Text with punctuation characters removed.
        


In [None]:
text_to_vector.remove_punctuation_strings(txt = test_sample)

'NATIONAL SAFETY APPAREL Aluminized Chaps OPFParaAramid Universal 40 in Overall Lg'

In [None]:
# Apply the punctuation stripper to every title; each value is passed as `txt`.
test_file['Remove_Punctuation'] = test_file['Title'].apply(text_to_vector.remove_punctuation_strings)

In [None]:
print(text_to_vector.pre_processing_text.__doc__)


        Preprocesses the input text based on the specified options.

        Args:
        - txt (str): Input text to preprocess.
        - is_lower (bool), default = True: Flag indicating whether to convert text to lowercase (default = True).
        - stop_words_set (set), default = "nltk english stopwords": Set of stop words to remove from text (default = set()).
        - remove_stop_words (bool), default = True: Flag indicating whether to remove stop words (default = True).

        Returns:
        - str: Processed text after tokenization and preprocessing steps.
        


In [None]:
text_to_vector.pre_processing_text(txt = test_sample)

'national safety apparel aluminized chaps universal 40 overall lg'

In [None]:
# Run the default preprocessing (lowercase + stop-word removal) on every title; each value is passed as `txt`.
test_file['Pre_Processing'] = test_file['Title'].apply(text_to_vector.pre_processing_text)

In [None]:
test_file.head(5).to_dict(orient='records')

[{'Title': 'NATIONAL SAFETY APPAREL Aluminized Chaps: OPF/Para-Aramid, Universal, 40 in Overall Lg',
  'Remove_Punctuation': 'NATIONAL SAFETY APPAREL Aluminized Chaps OPFParaAramid Universal 40 in Overall Lg',
  'Pre_Processing': 'national safety apparel aluminized chaps universal 40 overall lg'},
 {'Title': 'NATIONAL SAFETY APPAREL Aluminized Chaps: OPF/Para-Aramid, Universal, 38 in Overall Lg',
  'Remove_Punctuation': 'NATIONAL SAFETY APPAREL Aluminized Chaps OPFParaAramid Universal 38 in Overall Lg',
  'Pre_Processing': 'national safety apparel aluminized chaps universal 38 overall lg'},
 {'Title': 'STEEL GRIP Pbi/Rayon Half Face Open Hood',
  'Remove_Punctuation': 'STEEL GRIP PbiRayon Half Face Open Hood',
  'Pre_Processing': 'steel grip half face open hood'},
 {'Title': 'QUANTUM Wire Shelf: 36 in x 30 in, 1 Shelves, Quick-Adjust, 800 lb Load Capacity, Dry/Wet, Black',
  'Remove_Punctuation': 'QUANTUM Wire Shelf 36 in x 30 in 1 Shelves QuickAdjust 800 lb Load Capacity DryWet Black'

### Vector Embedding using Traditional Methods

In [None]:
print(text_to_vector.get_sklearn_embedding.__doc__)


        Convert a list of texts into embeddings using a selected vectorization method and dimensionality reduction method.

        Args:
        - texts (list): List of strings representing documents.
        - custom_max_features (int), default = 5000: Maximum number of features (terms) to be used in vectorization.
        - custom_dtype (np.dtype), default = np.float64: Data type of the matrix created by vectorization.
        - is_lower (bool), default = True: Convert text to lowercase.
        - use_stop_words (bool), default = True: Whether to use English stop words during vectorization.
        - custom_ngram_range (tuple), default = (1,1): Range for n-grams to be extracted.
          For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
        - vectorizer (str), default = 'tfidf': Vectorization method to use.
          Options: 'tfidf','count','binary'.
        - reduction_method (str), default = 'svd': Di

In [None]:
text_to_vector.get_sklearn_embedding(texts = [test_sample],embed_size = 1).shape

(1, 1)

In [None]:
text_to_vector.get_sklearn_embedding(texts = test_file['Title'].tolist(),embed_size = 5).shape

(12, 5)

### Vector Embedding using Word2Vec Models

In [None]:
print(text_to_vector.get_word_embedding.__doc__)


        Compute word embeddings using a pre-trained word embedding model.

        Args:
        - texts (list): List of strings representing documents.
        - model_name: Pre-trained word embedding model.

        Returns:
        - np.ndarray: Embeddings of the input texts.
        


In [None]:
text_to_vector.get_word_embedding(texts = [test_sample],model_name = word_embedding_model).shape

(1, 50)

In [None]:
text_to_vector.get_word_embedding(texts = test_file['Title'].tolist(),model_name = word_embedding_model).shape

(12, 50)

### Vector Embedding using LASER Embeddings

In [None]:
print(text_to_vector.get_laser_embeddings.__doc__)


        Compute LASER (Language-Agnostic SEntence Representations) embeddings for a list of texts.

        Args:
        - texts (list): List of strings representing documents.

        Returns:
        - np.ndarray: LASER Embeddings of the input texts.
        


In [None]:
text_to_vector.get_laser_embeddings(texts = [test_sample]).shape

(1, 1024)

In [None]:
text_to_vector.get_laser_embeddings(texts = test_file['Title'].tolist()).shape

(12, 1024)

### Vector Embedding using spaCy Model

In [None]:
print(text_to_vector.get_spacy_embedding.__doc__)


        Compute spaCy word embeddings for a list of texts.

        Args:
        - texts (list): List of strings representing documents.

        Returns:
        - np.ndarray: spaCy embeddings of the input texts.
        


In [None]:
text_to_vector.get_spacy_embedding(texts = [test_sample],nlp_spacy = nlp_spacy_model).shape

(1, 96)

In [None]:
text_to_vector.get_spacy_embedding(texts = test_file['Title'].tolist(),nlp_spacy = nlp_spacy_model).shape

(12, 96)

### Vector Embedding using Sentence Transformer Model

In [None]:
print(text_to_vector.get_sentence_transformers_embedding.__doc__)


        Compute Sentence Transformers embeddings for a list of texts.

        Args:
        - texts (list): List of strings representing documents.

        Returns:
        - np.ndarray: Sentence Transformers embeddings of the input texts.
        


In [None]:
text_to_vector.get_sentence_transformers_embedding(texts = [test_sample]).shape

(1, 384)

In [None]:
text_to_vector.get_sentence_transformers_embedding(texts = test_file['Title'].tolist()).shape

(12, 384)

### Vector Embedding using Pre-Trained Model

In [None]:
print(text_to_vector.get_pre_trained_models_embedding.__doc__)


        Compute embeddings using a pre-trained transformer model (e.g.,BERT) and tokenizer.

        Args:
        - texts (list): List of strings representing documents.
        - model_name: Pre-trained transformer model instance.
        - model_tokenizer: Pre-trained tokenizer instance.
        - custom_max_length (int), default = 256: Maximum Length before truncation.

        Returns:
        - np.ndarray: Embeddings of the input texts using the pre-trained model.
        


In [None]:
text_to_vector.get_pre_trained_models_embedding(texts = [test_sample],model_name = hf_model,model_tokenizer = hf_tokenizer,custom_max_length = 256).shape

(1, 768)

In [None]:
text_to_vector.get_pre_trained_models_embedding(texts = test_file['Title'].tolist(),model_name = hf_model,model_tokenizer = hf_tokenizer,custom_max_length = 256).shape

(12, 768)

### Cleaning RAM

In [None]:
# Flushes all the variables and clears the memory
# NOTE(review): custom_ram_cleanup_func is defined outside this section; confirm
# there exactly which variables it removes and what memory it releases.
custom_ram_cleanup_func()
# The helper itself is removed from the namespace afterwards, so this cell cannot
# be re-run without first re-executing the cell that defines the helper.
del custom_ram_cleanup_func