<a href="https://colab.research.google.com/github/prashanth-acsq/Colab-Notebooks/blob/main/HuggingFace_(all_MiniLM_L6_v2)_My_Inference_Archive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Setup**

In [1]:
%%bash
# Install the third-party libraries this notebook imports; -q keeps pip's output quiet
pip install transformers -q
pip install gensim -q
pip install nltk -q

### **Imports**

In [2]:
import re
import torch
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel
from gensim.parsing.preprocessing import remove_stopwords

### **Helpers**

In [3]:
def breaker(num: int=50, char: str="*") -> None:
    '''
        Line Breaker - prints a separator line surrounded by blank lines

        1. num  : Number of times `char` is repeated in the separator
        2. char : Character (or string) used to build the separator
    '''
    # Honour the arguments; the separator used to be hard-coded to 50 asterisks
    print("\n" + num*char + "\n")


def get_model(model_id: str) -> tuple:
    '''
        Fetch a tokenizer and its matching model via the `from_pretrained` loaders

        1. model_id : Name of the model repository

        Returns the pair (tokenizer, model), in that order
    '''

    # The tokenizer is loaded first, then the model, both from the same repository id
    return AutoTokenizer.from_pretrained(model_id), AutoModel.from_pretrained(model_id)


def mean_pooling(model_output, attention_mask):
    '''
        Mean Pooling - average the token embeddings, weighting each token by its attention mask value

        1. model_output   : Model output whose first element holds the token embeddings
        2. attention_mask : Mask tensor; tokens with a 0 entry contribute nothing to the average
    '''

    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]

    # Broadcast the mask across the embedding dimension so it can be multiplied element-wise
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    masked_sum = torch.sum(token_embeddings * mask, 1)

    # Clamp the token count so a fully masked row does not divide by zero
    token_count = torch.clamp(mask.sum(1), min=1e-9)

    return masked_sum / token_count


def get_sentence_embeddings(model, tokenizer, query: list) -> torch.Tensor:
    '''
        Compute L2-normalized sentence embeddings (text is lowercased and stripped of stop words first)

        1. model     : Sentence Transformer Model
        2. tokenizer : Sentence Tokenizer
        3. query     : List of Sentences (Knowledge Base) or Single Sentence (Query)

        Returns one embedding row per input sentence
    '''

    # Lowercase and drop stop words, for either a list of sentences or a single sentence
    if isinstance(query, list):
        query = [remove_stopwords(sentence.lower()) for sentence in query]
    else:
        query = remove_stopwords(query.lower())

    # Tokenize into padded / truncated PyTorch tensors
    encoded = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    # Forward pass without gradient tracking
    with torch.no_grad():
        output = model(**encoded)

    # Pool token embeddings into one vector per sentence, then scale each to unit L2 norm
    pooled = mean_pooling(output, encoded['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)


def chat(model, tokenizer, kb_embeddings: torch.Tensor, answer: str, min_confidence: float=0.9) -> None:
    '''
        Continuously chat until the user types anything other than 'y' when asked to continue

        1. model          : Sentence Transformer Model
        2. tokenizer      : Sentence Tokenizer
        3. kb_embeddings  : Knowledge Base Embeddings (iterated one embedding at a time)
        4. answer         : Answer to be given in case the user query matches the knowledge base
        5. min_confidence : Minimum cosine similarity required for the user query embedding to match a knowledge base embedding
    '''

    # Build the similarity module once instead of once per comparison
    cosine_similarity = torch.nn.CosineSimilarity(dim=0)

    while True:
        breaker()
        my_query = input("Enter Query : ")
        query_vector = get_sentence_embeddings(model, tokenizer, my_query).squeeze()

        # Check the query against EVERY knowledge base embedding. The previous loop
        # broke out after the first comparison, so only the first entry was ever tested.
        # any() stops at the first match and is False for an empty knowledge base.
        matched = any(
            cosine_similarity(embedding.squeeze(), query_vector).item() > min_confidence
            for embedding in kb_embeddings
        )

        if matched:
            print(f"\n{answer}")
        else:
            print("\nSorry, didn't understand what you meant")

        if input("\nContinue (y or n) : ") != "y":
            break
    breaker()


def match_key(query_embedding: torch.Tensor, kb_embeddings: dict, min_confidence: float=0.9) -> str:
    '''
        Identify the intent of the user query

        1. query_embedding : Embedding of the user input
        2. kb_embeddings   : Knowledge Base Embeddings (a dictionary mapping each key to its embeddings)
        3. min_confidence  : Minimum cosine similarity required for the query to match a knowledge base embedding

        Returns the first key (in dictionary order) owning an embedding whose similarity to the
        query exceeds min_confidence, or None when nothing matches
    '''

    similarity = torch.nn.CosineSimilarity(dim=0)
    query_vector = query_embedding.squeeze()

    for key, vectors in kb_embeddings.items():
        if any(similarity(vector, query_vector).item() > min_confidence for vector in vectors):
            return key

    # No knowledge base entry was close enough
    return None

### **Load Tokenizer & Model**

In [4]:
# Load the tokenizer and model for the all-MiniLM-L6-v2 sentence-transformers repository
tokenizer, model = get_model('sentence-transformers/all-MiniLM-L6-v2')

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [7]:
# Sanity check: embed a short paragraph piece-by-piece and inspect the shape of the result
query = "The book tells the stories of two men born worlds apart. They have nothing in common except the same date of birth (18 April 1906) and a zeal to succeed in life. William Lowell Kane is a wealthy and powerful elite class while Abel Rosnovski (originally named Władek Koskiewicz) is a Pole who was born in a situation of great poverty and eventually migrated to the United States."
# NOTE(review): split(".") leaves a trailing empty string after the final period, and that empty
# piece is embedded too — presumably why 4 rows come back for 3 sentences; filter blanks if unwanted
embedding = get_sentence_embeddings(model, tokenizer, query.split("."))
embedding.shape

torch.Size([4, 384])

### **Open Account**

In [None]:
# # Knowledge Base
# open_account_kb =[
#     "How do I open an account?",
#     "How do you open an account?",
#     "How to open an account",
#     "How to open an account?",
#     "How open an account",
#     "How open an account?",  
#     "Account Opening",
#     "Open Savings Account",
#     "Open Checking Account",
#     "How do I open a savings account",
#     "How to open a savings account",
#     "How do I open a checking account",
#     "How to open a checking account",
# ]              

# # Get Sentence Embeddings
# open_account_kb_embeddings = get_sentence_embeddings(model, tokenizer, open_account_kb)

In [None]:
# answer = "To open an account, go to your nearest branch and submit the required documents"
# chat(model, tokenizer, open_account_kb_embeddings, answer, 0.7)


**************************************************

Enter Query : 

Sorry, didn't understand what you meant

Continue (y or n) : 

**************************************************



### **Close Account**

In [None]:
# # Knowledge Base
# close_account_kb = [
#     "How do I close an account?",
#     "How do you close an account?",
#     "How to close an account",
#     "How to close an account?",
#     "How close an account",
#     "How close an account?",  
#     "Account Closing",
#     "Close Savings Account",
#     "Close Checking Account",
#     "How do I close a checking account",
#     "How to close a checking account",
# ]              

# # Get Sentence Embeddings
# close_account_kb_embeddings = get_sentence_embeddings(model, tokenizer, close_account_kb)

In [None]:
# answer = "To close an account, go to your nearest branch and submit the required documents"
# chat(model, tokenizer, close_account_kb_embeddings, answer, 0.7)


**************************************************

Enter Query : 

Sorry, didn't understand what you meant

Continue (y or n) : 

**************************************************



### **Combined**

In [None]:
# kb = {
#     "acc_open" : [
#         "How do I open an account?",
#         "How do you open an account?",
#         "How to open an account",
#         "How to open an account?",
#         "How open an account",
#         "How open an account?",  
#         "Account Opening",
#         "Open Savings Account",
#         "Open Checking Account",
#         "How do I open a savings account",
#         "How to open a savings account",
#         "How do I open a checking account",
#         "How to open a checking account",
#         ],                
#     "acc_close" : [
#         "How do I close an account?",
#         "How do you close an account?",
#         "How to close an account",
#         "How to close an account?",
#         "How close an account",
#         "How close an account?",  
#         "Account Closing",
#         "Close Savings Account",
#         "Close Checking Account",
#         "How do I close a checking account",
#         "How to close a checking account",
#         ]                
# }

# answers = {
#     "acc_open"  : "To open an account, go to your nearest branch and submit the required documents",
#     "acc_close" : "To close an account, go to your nearest branch and submit the required documents",
# }


# kb_embeddings = dict()
# for k, v in kb.items(): kb_embeddings[k] = get_sentence_embeddings(model, tokenizer, v)

# while True:
#     breaker()
#     my_query = input("Enter Query : ")
#     query_embedding = get_sentence_embeddings(model, tokenizer, my_query)

#     key = match_key(query_embedding, kb_embeddings, 0.8)
#     print(f"\n{answers[key]}") if key is not None else print(f"\nSorry, didn't understand what you meant")

#     if input("\nContinue (y or n) : ") == "y": pass
#     else: break

# breaker()


**************************************************

Enter Query : Acc Open

Sorry, didn't understand what you meant

Continue (y or n) : n

**************************************************



### **Train**

In [None]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Knowledge base: intent key -> example phrasings that should map to that intent.
# Every entry ends with a comma: without one, Python silently concatenates adjacent
# string literals, which previously merged the last four phrasings of each intent
# into a single long sentence.
kb = {
    "acc_open" : [
        "How do I open an account?",
        "How do you open an account?",
        "How to open an account",
        "How to open an account?",
        "How open an account",
        "How open an account?",
        "Account Opening",
        "Open Savings Account",
        "Open Checking Account",
        "How do I open a savings account",
        "How to open a savings account",
        "How do I open a checking account",
        "How to open a checking account",
    ],
    "acc_close" : [
        "How do I close an account?",
        "How do you close an account?",
        "How to close an account",
        "How to close an account?",
        "How close an account",
        "How close an account?",
        "Account Closing",
        "Close Savings Account",
        "Close Checking Account",
        # These two previously read "open a savings account" (copy-paste slip), which
        # labelled account-opening phrasings as account-closing training examples
        "How do I close a savings account",
        "How to close a savings account",
        "How do I close a checking account",
        "How to close a checking account",
    ],
}

# Reply shown to the user for each intent key
answers = {
    "acc_open"  : "To open an account, go to your nearest branch and submit the required documents",
    "acc_close" : "To close an account, go to your nearest branch and submit the required documents",
}

# Embed every example phrasing once, grouped by intent key
kb_embeddings = dict()
for k, v in kb.items():
    kb_embeddings[k] = get_sentence_embeddings(model, tokenizer, v)

In [None]:
# Stack both intents' embeddings into one NumPy feature matrix (acc_open rows first)
kb_embeddings_np = (
    torch.cat([kb_embeddings["acc_open"], kb_embeddings["acc_close"]], dim=0)
    .detach()
    .cpu()
    .numpy()
)

# One label per row, in the same order: 0 -> acc_open, 1 -> acc_close
labels = np.concatenate(
    [
        np.zeros(len(kb_embeddings["acc_open"])),
        np.ones(len(kb_embeddings["acc_close"])),
    ],
    axis=0,
)

# Fixed random_state keeps the fitted forest reproducible between runs
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(kb_embeddings_np, labels)

In [None]:
# Interactive loop: classify each typed query and print the answer for the predicted intent
while True:
    breaker()
    my_query = input("Enter Query : ")

    # predict() returns one label per input row; a single query yields a single label
    prediction = rfc_model.predict(get_sentence_embeddings(model, tokenizer, my_query))[0]

    # Labels follow the training order: 0 -> acc_open, 1 -> acc_close
    if prediction in (0, 1):
        print(answers["acc_open" if prediction == 0 else "acc_close"])

    # Anything other than "y" ends the session
    if input("\nContinue (y or n) : ") != "y":
        break

breaker()


**************************************************

Enter Query : Start
To open an account, go to your nearest branch and submit the required documents

Continue (y or n) : y

**************************************************

Enter Query : Stop
To close an account, go to your nearest branch and submit the required documents

Continue (y or n) : y


### **Train with NLTK corpus**

In [None]:
import nltk

nltk.download('brown')

from nltk.corpus import brown

In [None]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Knowledge base: intent key -> example phrasings that should map to that intent.
# Every entry ends with a comma: without one, Python silently concatenates adjacent
# string literals, which previously merged the last four phrasings of each intent
# into a single long sentence.
kb = {
    "acc_open" : [
        "How do I open an account?",
        "How do you open an account?",
        "How to open an account",
        "How to open an account?",
        "How open an account",
        "How open an account?",
        "Account Opening",
        "Open Savings Account",
        "Open Checking Account",
        "How do I open a savings account",
        "How to open a savings account",
        "How do I open a checking account",
        "How to open a checking account",
    ],
    "acc_close" : [
        "How do I close an account?",
        "How do you close an account?",
        "How to close an account",
        "How to close an account?",
        "How close an account",
        "How close an account?",
        "Account Closing",
        "Close Savings Account",
        "Close Checking Account",
        # These two previously read "open a savings account" (copy-paste slip), which
        # labelled account-opening phrasings as account-closing training examples
        "How do I close a savings account",
        "How to close a savings account",
        "How do I close a checking account",
        "How to close a checking account",
    ],
    # Catch-all class: the first 100 Brown corpus sentences (tokens re-joined with spaces)
    # plus one hand-written entry. The corpus view is sliced once instead of calling
    # brown.sents() again for every index.
    "others" : [
        " ".join(sentence) for sentence in brown.sents()[:100]
    ] + ["account account"],
}

# Reply shown to the user for each intent key
answers = {
    "acc_open"  : "To open an account, go to your nearest branch and submit the required documents",
    "acc_close" : "To close an account, go to your nearest branch and submit the required documents",
    "others" : "Sorry, could not understand what you typed"
}

# Embed every example phrasing once, grouped by intent key
kb_embeddings = dict()
for k, v in kb.items():
    kb_embeddings[k] = get_sentence_embeddings(model, tokenizer, v)

In [None]:
# Stack all three classes' embeddings into one NumPy feature matrix
# (row order: acc_open, acc_close, others)
kb_embeddings_np = (
    torch.cat(
        [kb_embeddings["acc_open"], kb_embeddings["acc_close"], kb_embeddings["others"]],
        dim=0,
    )
    .detach()
    .cpu()
    .numpy()
)

# One label per row, in the same order: 0 -> acc_open, 1 -> acc_close, 2 -> others
labels = np.concatenate(
    [
        np.zeros(len(kb_embeddings["acc_open"])),
        np.ones(len(kb_embeddings["acc_close"])),
        np.full(len(kb_embeddings["others"]), 2.0),
    ],
    axis=0,
)

# Fixed random_state keeps the fitted forest reproducible between runs
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(kb_embeddings_np, labels)

In [None]:
# Interactive loop: classify each typed query and print the answer for the predicted class
while True:
    breaker()
    my_query = input("Enter Query : ")

    # predict() returns one label per input row; a single query yields a single label
    prediction = rfc_model.predict(get_sentence_embeddings(model, tokenizer, my_query))[0]

    # Labels follow the training order: 0 -> acc_open, 1 -> acc_close, 2 -> others
    if prediction in (0, 1, 2):
        print(answers[("acc_open", "acc_close", "others")[int(prediction)]])

    # Anything other than "y" ends the session
    if input("\nContinue (y or n) : ") != "y":
        break

breaker()


**************************************************

Enter Query : account account
Sorry, could not understand what you typed

Continue (y or n) : y

**************************************************

Enter Query : Account 
Sorry, could not understand what you typed

Continue (y or n) : y

**************************************************

Enter Query : Account Open
To open an account, go to your nearest branch and submit the required documents

Continue (y or n) : y

**************************************************

Enter Query : Account Start
Sorry, could not understand what you typed

Continue (y or n) : y

**************************************************

Enter Query : Start
Sorry, could not understand what you typed

Continue (y or n) : n

**************************************************



In [None]:
import re

In [None]:
# Sample telephone-banking FAQ questions used to preview the text clean-up
data_list = [
    "When is Telephone Banking available?",
    "How can I register for Telephone Banking and receive my TPIN?",
    "Do I need to identify and authenticate myself every time I call Commercial Bank?",
    "What is the benefit of TPIN other than the IVR services?",
    "If I call from overseas, will I be able to speak to a customer service representative immediately?",
    "Is activation of cards through IVR fast?",
    "How can I retrieve my TPIN if I forgot it?",
    "What is the maximum amount that I can pay for my credit card through IVR?",
    "How long will it take to avail the funds when I make a credit card payment through IVR?",
]

# Clean each question in place: lowercase, drop question marks, then remove stop words
for i, question in enumerate(data_list):
    data_list[i] = remove_stopwords(question.lower().replace("?", ""))
data_list

['telephone banking available',
 'register telephone banking receive tpin',
 'need identify authenticate time commercial bank',
 'benefit tpin ivr services',
 'overseas, able speak customer service representative immediately',
 'activation cards ivr fast',
 'retrieve tpin forgot',
 'maximum pay credit card ivr',
 'long avail funds credit card payment ivr']