> Reference: https://pub.towardsai.net/knn-k-nearest-neighbors-is-dead-fc16507eb3e

In [1]:

import _pickle as cPickle
import re
import string
import time
import unicodedata
from datetime import datetime

import hnswlib
import numpy as np
import pandas as pd


In [2]:
def clean_text(text):
    """
    Normalise a piece of text before embedding lookup.

    Steps, in order:
    1. Remove links (any run of non-whitespace characters starting with ``http``)
    2. Remove ASCII punctuation
    3. Convert text to lowercase
    4. Remove digits and remaining symbols (combining marks are kept)
    5. Collapse runs of whitespace and strip both ends

    Parameters
    ----------
    text : str, bytes or any other object
        ``bytes`` are decoded as UTF-8 (undecodable bytes are dropped) and then
        cleaned like ``str``. Any other type (e.g. ``None`` or a float NaN) is
        returned unchanged.

    Returns
    -------
    str
        The cleaned text, or the original object when it is neither ``str``
        nor ``bytes``.
    """
    if isinstance(text, bytes):
        # The str patterns below raise TypeError on bytes, so decode first.
        text = text.decode('utf-8', errors='ignore')
    elif not isinstance(text, str):
        return text

    # Remove links
    text = re.sub(r'http\S+', '', text)

    # Remove ASCII punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Remove remaining symbols. Python's ``\w`` does not match combining marks
    # (Unicode categories Mn/Mc/Me), so a plain ``[^\w\s]`` substitution would
    # delete the vowel signs and viramas of Indic scripts and corrupt the
    # words. Keep every character whose category starts with 'M'.
    text = re.sub(
        r'[^\w\s]',
        lambda m: m.group() if unicodedata.category(m.group()).startswith('M') else '',
        text,
    )

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


## Reference function to build the index for HNSW




In [3]:
def fit_hnsw_index(features, ef=100, M=16):
    """Build an hnswlib index over ``features`` and return it.

    Each row of ``features`` is inserted under its row position
    (``0 .. len(features) - 1``) as its label. The index is created with
    hnswlib's ``'l2'`` space and a dimensionality taken from the first row.

    Parameters
    ----------
    features : sequence of equal-length vectors
    ef : int
        Used twice: as ``ef_construction`` when the index is initialised and
        as the query-time ``ef`` set on the finished index.
    M : int
        Passed unchanged to ``init_index``.

    Returns
    -------
    hnswlib.Index
        The populated index.
    """
    n_items = len(features)
    dim = len(features[0])

    index = hnswlib.Index(space='l2', dim=dim)
    index.init_index(max_elements=n_items, ef_construction=ef, M=M)

    # Labels are simply the row positions.
    index.add_items(features, np.arange(n_items))

    index.set_ef(ef)
    return index


In [4]:
# Load the pre-computed word embeddings used by generate_embedding, which
# indexes this object as embeddings_dict[lang_code][token].
# NOTE(security): unpickling can execute arbitrary code stored in the file,
# so "embeddings_dict.pkl" must come from a trusted source.
with open("embeddings_dict.pkl", "rb") as f:
    embeddings_dict = cPickle.load(f)

In [5]:


def _split_tokens(sentence):
    """Split text into word tokens and single non-space symbols.

    Equivalent to ``re.findall(r"[\\w']+|[^\\w\\s]", sentence)`` except that
    combining marks (Unicode categories Mn/Mc/Me) count as word characters.
    Python's ``\\w`` does not match them, so the plain regex breaks words in
    Indic scripts apart at every vowel sign or virama.
    """
    tokens = []
    word = []
    for ch in sentence:
        if ch.isalnum() or ch in "_'" or unicodedata.category(ch).startswith('M'):
            word.append(ch)
            continue
        if word:
            tokens.append(''.join(word))
            word = []
        if not ch.isspace():
            # Any other visible symbol is a token of its own.
            tokens.append(ch)
    if word:
        tokens.append(''.join(word))
    return tokens


def generate_embedding(sentence, embeddings_dict = embeddings_dict, embedding_size=300):
    """Return the mean embedding of the known tokens in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to embed.
    embeddings_dict : dict
        Mapping ``lang_code -> {token: vector}``. Defaults to the notebook-level
        ``embeddings_dict`` as it exists when this function is defined.
    embedding_size : int
        Length of the all-zero vector returned when no token is found in any
        table. Must match the dimensionality of the stored vectors.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of every matching vector. A token present in
        several language tables contributes one vector per table. When
        nothing matches, ``np.zeros(embedding_size)`` is returned.
    """
    tokens = _split_tokens(sentence)

    # Token-major, language-minor order, one entry per (token, table) hit.
    vectors = [
        table[token]
        for token in tokens
        for table in embeddings_dict.values()
        if token in table
    ]

    if not vectors:
        return np.zeros(embedding_size)

    return np.mean(np.array(vectors), axis=0)


In [6]:
import math

In [43]:
# Cosine similarity from which the L2 cut-off used by bulkify is derived.
cosine_threshold = 0.995
# For unit-length vectors a and b: ||a - b||^2 = 2 * (1 - cos(a, b)), hence the
# square root below. NOTE(review): the identity only holds for L2-normalised
# vectors, and generate_embedding returns an unnormalised mean — confirm this
# conversion is what is intended.
l2_threshold = math.sqrt(2 * (1 - cosine_threshold))
l2_threshold

0.10000000000000005

In [37]:
def _group_into_bulks(neighbors, min_docs):
    """Greedily group row positions into bulks of at least ``min_docs`` members.

    ``neighbors`` maps every row position ``0 .. n-1`` to the positions that
    are similar to it. Positions are visited in ascending order. A position
    that is not yet assigned seeds a candidate made of itself followed by its
    not-yet-assigned neighbours in ascending order. Members are marked as
    assigned only when the candidate is kept, so a rejected candidate does not
    stop its members from joining a later bulk, and the seed of a kept bulk
    cannot be pulled into a second one.

    Returns a list of lists of row positions.
    """
    assigned = set()
    groups = []
    for seed in range(len(neighbors)):
        if seed in assigned:
            continue
        members = [seed] + [int(k) for k in sorted(neighbors[seed]) if int(k) not in assigned]
        if len(members) >= min_docs:
            groups.append(members)
            assigned.update(members)
    return groups


def bulkify(df, l2_threshold=l2_threshold, min_docs=5):
    """Group near-duplicate documents into "bulks".

    Parameters
    ----------
    df : pandas.DataFrame
        The first column holds the document id and the second the text. The
        frame is modified IN PLACE: the first two columns are renamed to
        ``reg_no`` and ``subject_content``, a ``subject_content_cleaned``
        column is added, rows whose cleaned text is empty/NaN are dropped and
        the index is reset. Later cells of this notebook read ``df.reg_no``
        from the same object, so this side effect is part of the contract.
    l2_threshold : float
        Maximum Euclidean distance between two embeddings for the documents
        to count as similar. Defaults to the notebook-level ``l2_threshold``.
    min_docs : int
        Minimum number of documents (seed included) for a group to be kept.

    Returns
    -------
    out : dict
        ``reg_no -> ""`` for documents outside every bulk, otherwise
        ``reg_no -> (smallest reg_no in the bulk, bulk size)``.
    bulk_all : list of list
        One list of ``reg_no`` values per bulk: the seed first, then its
        neighbours in ascending row order.

    Notes
    -----
    Documents for which ``generate_embedding`` finds no known token all get
    the same all-zero vector and are therefore at distance 0 from each other.
    NOTE(review): decide whether such documents should be excluded.
    """
    # Record the start time
    start = time.time()

    # Name the first two columns; keep any extra columns so the function can
    # be called again on a frame it has already processed.
    df.columns = ['reg_no', 'subject_content'] + list(df.columns[2:])

    # Clean the text and turn empty results into NaN. Assign the result back:
    # a chained ``df[col].replace(..., inplace=True)`` acts on a temporary
    # object under pandas Copy-on-Write and leaves ``df`` untouched.
    df['subject_content_cleaned'] = df['subject_content'].apply(clean_text).replace('', np.nan)

    # Drop rows without usable text and renumber the rest 0..n-1 so that row
    # positions match the labels stored in the HNSW index.
    df.dropna(subset=['subject_content_cleaned'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    n_docs = len(df)
    if n_docs >= 2:
        emb = np.vstack([generate_embedding(s) for s in df['subject_content_cleaned']])
        print(f'Embedding Time: {time.time() - start}')

        # K must be derived from the rows that survived cleaning; using the
        # original length can ask the index for more neighbours than it holds.
        K = min(n_docs - 1, 1000)

        p = fit_hnsw_index(emb, ef=K * 10)
        ann_neighbor_indices, ann_distances = p.knn_query(emb, K)
        print(f'HNSW Time: {time.time() - start}')

        # hnswlib's 'l2' space reports SQUARED Euclidean distances, so they
        # have to be compared against the squared threshold.
        squared_threshold = l2_threshold ** 2
        primary_dict = {
            i: [k for k, v in zip(indices, distances) if k != i and v < squared_threshold]
            for i, (indices, distances) in enumerate(zip(ann_neighbor_indices, ann_distances))
        }
    else:
        # With fewer than two documents there is nothing to compare.
        primary_dict = {i: [] for i in range(n_docs)}

    # Translate row positions into reg_no values.
    reg_nos = df['reg_no'].to_numpy()
    bulk_all = [
        [reg_nos[pos] for pos in group]
        for group in _group_into_bulks(primary_dict, min_docs)
    ]

    # Bulk status of every document; "" means "not part of any bulk".
    out = {reg: "" for reg in reg_nos}
    print(len(out))

    for members in bulk_all:
        ordered = sorted(members)
        # Every member is labelled with the smallest reg_no and the bulk size.
        label = (ordered[0], len(ordered))
        for reg in ordered:
            out[reg] = label

    # Print the total time elapsed during processing
    print(f'Total Time: {time.time() - start}')

    return out, bulk_all


In [58]:
# Number of rows to sample and the seed that makes the sample reproducible
# across "Restart & Run All".
SAMPLE_SIZE = 10
SAMPLE_SEED = 42

# Only the id column (0) and the text column (7) are needed.
df_tmp = pd.read_csv(r"bulk-campaign.csv", usecols=[0, 7])
# Cap the sample size so a file with fewer rows does not raise.
df_tmp = df_tmp.sample(n=min(SAMPLE_SIZE, len(df_tmp)), random_state=SAMPLE_SEED)

out, bulk = bulkify(df_tmp)
len(bulk)

Embedding Time: 0.030653715133666992
HNSW Time: 0.031068086624145508
10
Total Time: 0.03168296813964844


1

In [35]:
# sen = "ਮੈਂ ਆਜ office ਜਾਣਾ ਹੈ, ਸੋ let's ਸਟਾਰਟ ਕਰੀਏ the ਕੰਮ!"
# print(f"Original sentence: {sen}")
# sen = clean_text(sen)
# print(f"Cleaned sentence: {sen}")
# emb = generate_embedding(sen)
# print(f"Length of embedding generated: {len(emb)}")
# print(f"Preview of embedding:\n {emb}")

In [11]:
import imgkit

In [60]:
# Export the documents of the first detected bulk for manual inspection.
curr = bulk[0]
print(len(curr))
first_bulk_rows = df_tmp.loc[df_tmp.reg_no.isin(curr), ["reg_no", "subject_content"]]
first_bulk_rows.to_csv("Bulk_example1.csv", index=False)

5


In [61]:
# Picture 1: only the rows that belong to the first bulk.
bulk_rows = df_tmp.loc[df_tmp.reg_no.isin(curr), ["reg_no", "subject_content"]]
html_table = bulk_rows.to_html(index=False)
imgkit.from_string(html_table, "example1.png")

# Picture 2: the whole sample, rendered the same way.
html_table = df_tmp.loc[:, ["reg_no", "subject_content"]].to_html(index=False)
imgkit.from_string(html_table, "data.png")

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [None]:
# len(Counter(out.keys())), Counter(out.keys())

In [None]:
# tmp = Counter(out.values())

In [None]:
# tmp.most_common(10)

### Issues

1. Some grievances are code-mixed, i.e. written in more than one script. The code above can only match grievances that are written in the same script.
2. Using the IndicBERT model improves accuracy, but it is very slow.
3. Using fastText is much faster, but accuracy drops.
4. Could we translate everything into a single language first? That would again be slow.