# README!
- Runs contextual learning to find similar records, then uses those matches to run iterative clustering and produce a final list of similar vectors.
- Uses 'all-distilroberta-v1' for vectorization
- Uses spacy for cleaning and lemmatization
- Uses sparse matrix multiplication for generating similar vectors
- Uses a heuristic for the number of clusters
- **Just run the flow with your data set, keeping ::config:: updated everywhere!**

# File structure

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input (sub-directories included) and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [None]:
!pip install xlrd
!pip install xlsxwriter
!pip install ftfy
!pip install pyspellchecker
!pip install sparse_dot_topn==0.2.9

In [None]:
#!python/3.7.5/bin/python

import os
import sys
import json
import re
import io
from io import StringIO
import inspect
import shutil
import ast
import string
import time
import pickle
import glob
import traceback
import multiprocessing
import requests
import logging
import math
import pytz
from retrying import retry
from itertools import chain
from string import Template
from datetime import datetime, timedelta
from collections import defaultdict, OrderedDict
from contextlib import contextmanager
from math import pi
import unicodedata
from collections import defaultdict, Counter
import ntpath
import tqdm
from functools import reduce
import itertools
from spellchecker import SpellChecker

# graph
import networkx as nx

# Standard
import numpy as np
from numpy import array
from numpy import argmax
import pandas as pd
import xlrd
import xlsxwriter
import jsonschema
from fuzzywuzzy import fuzz
from wordcloud import WordCloud

# scikit-learn
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, silhouette_score, homogeneity_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering

# scipy
from scipy import spatial, sparse
from scipy.sparse import coo_matrix, vstack, hstack
from scipy.spatial.distance import euclidean, jensenshannon, cosine, cdist
from scipy.io import mmwrite, mmread
from scipy.stats import entropy
from scipy.cluster.hierarchy import dendrogram, ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components
from typing import Tuple, NamedTuple, List, Optional, Union
from functools import wraps
from sparse_dot_topn import awesome_cossim_topn
import sparse_dot_topn.sparse_dot_topn as ct

# fit text for you (ftfy)
import ftfy

# Gensim
import gensim
from gensim.models import Phrases, Word2Vec, KeyedVectors, FastText, LdaModel
from gensim import utils
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim import models, corpora, similarities

# NLTK
import nltk
from nltk import FreqDist
from nltk import tokenize, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.translate.bleu_score import sentence_bleu
print("nltk loaded.")

# Spacy
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')
from spacy.lang.en import English
print("spacy loaded.")

# torch
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from transformers import pipeline
from transformers import AutoModel
from typing import Any, Dict, List, Callable, Optional, Tuple, Union
from sklearn.base import BaseEstimator, TransformerMixin
print("pyTorch loaded.")

import warnings
warnings.filterwarnings('ignore')

# Input Data

In [None]:
## Path
resources_dir_path = "/kaggle/input/zenbird-resources/"

In [None]:
## input 

# Read the two raw CSV exports from the resources directory; the "_org" frames are kept untouched
# and cleaned copies are derived from them later.
df_parent_org = pd.read_csv(os.path.join(resources_dir_path, "df_parent_common.csv"))
df_adult_org = pd.read_csv(os.path.join(resources_dir_path, "df_adult_common.csv"))

print("data loaded. Shapes = ", df_parent_org.shape, df_adult_org.shape)

In [None]:
def get_cleaned_data(data):
    """
    Build the free-text ``Corpus`` column and drop the rows that have no text at all.

    Missing cells are temporarily filled with the placeholder "<>", the four
    ``specifypositive_*`` columns are chained together with ". " as separator, and the
    placeholder is stripped out again. Rows whose resulting corpus is blank are removed
    and the index is renumbered. Cells that still hold only the placeholder are emptied.

    Params  : data - DataFrame holding the four ``specifypositive_*`` columns
    Return  : new DataFrame with the extra ``Corpus`` column (the input is not modified)
    """
    wave_columns = [
        'specifypositive_April2020',
        'specifypositive_May2020',
        'specifypositive_Nov2020',
        'specifypositive_April2021',
    ]

    def strip_placeholders(value):
        # order matters: remove "<>." first so that no stray "." is left behind
        return str(value).replace("<>.", "").replace("<>", "").strip()

    filled = data.fillna("<>")

    # "<wave 1>. <wave 2>. <wave 3>. <wave 4>"
    joined = filled[wave_columns[0]]
    for column in wave_columns[1:]:
        joined = joined + ". " + filled[column]
    filled['Corpus'] = joined.apply(strip_placeholders)

    # keep only the rows that still contain some text
    is_blank = filled['Corpus'].apply(lambda value: str(value).strip() == "")
    kept = filled[is_blank.eq(False)].reset_index(drop=True)

    # empty the cells (any column) that consist solely of the placeholder
    kept = kept.replace("<>.", "")
    kept = kept.replace("<>", "")
    return kept

In [None]:
df_parent = get_cleaned_data(df_parent_org)
df_adult = get_cleaned_data(df_adult_org)

In [None]:
df_parent.head()

In [None]:
df_adult.head()

# Select a dataframe

In [192]:
# Drop rows without a corpus FIRST and renumber afterwards, so the working frame always has a
# contiguous 0..n-1 index (resetting before dropna would leave gaps wherever a row was removed).
df = df_adult.dropna(subset=['Corpus']).reset_index(drop=True)
print("Dataset length: ", df.shape)

In [None]:
## CONFIG :: column settings

# chwat-history (master)
# presumably the record-identifier column of the selected dataframe - confirm against the data
unique_id_col = "ID"
# raw text column; "Corpus" is the column created by get_cleaned_data()
text_col = "Corpus"
# name of the column that receives the pre-processed text
clean_text_col = "clean_text"
print("Settings set into config.")

# Preprocessing

In [None]:
# question words
wh_words = ["who", "what", "where", "when", "would", "which", "how", "why", "can", "may", "will", "won't", "does", "does not",
            "doesn't", "do", "do i", "do you", "is it", "would you", "is there", "are there", "is it so", "is this true" ,
            "to know", "is that true", "are we", "am i", "question is", "can i", "can we", "tell me", "can you explain",
            "how ain't", "question", "answer", "questions", "answers", "ask", "can you tell"]

# greetings
# Each non-blank line must be a Python literal sequence whose first item is the phrase.
# ast.literal_eval only accepts literals, so a malformed or hostile line cannot execute code (eval could).
# The path is built from resources_dir_path: the previous absolute second argument made os.path.join ignore it.
greeting_file = os.path.join(resources_dir_path, "greeting_words.txt")
with open(greeting_file, 'r', encoding='utf-8', errors='ignore') as f:
    greeting_words = [ast.literal_eval(x.strip())[0] for x in f if x.strip()]

# signature (same line format as the greetings file)
signature_file = os.path.join(resources_dir_path, "signature_words.txt")
with open(signature_file, 'r', encoding='utf-8', errors='ignore') as f:
    signature_words = [ast.literal_eval(x.strip())[0] for x in f if x.strip()]

# for marking keywords to retain during pre-processing step
vocab_list = wh_words + greeting_words + signature_words
vocab_list = list(map(str.lower, vocab_list))
print("custom vocab loaded. total words =", len(vocab_list))


In [None]:
## Preprocessing
class basic_preprocessing:
    """
    Perform basic pre-processing steps on any textual data. To add customization, pass a list of keywords which need to
    be excluded from lemmatization or stop-word removal. Use the 'vocab_list' param to pass a customized list of keywords!

    All resource files (stop-words, special characters, contractions, chat-words, emoticons, greeting and signature
    words) are read from `resources_dir_path` once, at construction time. `clean()` additionally relies on the
    module-level spaCy pipeline `nlp`.
    """

    def __init__(self, resources_dir_path, vocab_list=None, custom=False):
        """
        Params  : resources_dir_path - directory holding the resource files listed below
                  vocab_list         - optional keywords/phrases to protect from stop-word removal
                  custom             - when True, clean() protects the vocab phrases via custom_cleaning()
        """
        self.stopwords_file = os.path.join(resources_dir_path, "stopwords.txt")
        self.special_characters_file = os.path.join(resources_dir_path, "special_characters.txt")
        self.contractions_file = os.path.join(resources_dir_path, "contractions.json")
        self.chatwords_file = os.path.join(resources_dir_path, "chatwords.txt")
        self.emoticons_file = os.path.join(resources_dir_path, "emoticons.json")
        self.greeting_file = os.path.join(resources_dir_path, "greeting_words.txt")
        self.signature_file = os.path.join(resources_dir_path, "signature_words.txt")
        # Private copy: load_resources() extends this list, and that must neither leak into the caller's
        # list nor accumulate in a shared default argument.
        self.vocab_list = list(vocab_list) if vocab_list is not None else []
        self.custom = custom
        self.load_resources()

    def load_resources(self):
        """Read every resource file and build the lookup tables / compiled patterns used by clean()."""
        # stopwords, special_stopwords, special_characters
        with open(self.stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords = [x.rstrip() for x in f.readlines()]
        with open(self.special_characters_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        self.stopwords = sorted(set(self.stopwords))

        # Vocabulary Dictionary (**custom**)
        # - for marking keywords to retain during pre-processing step
        # - modify spacy's stopword checker
        self.vocab_list = self.vocab_list + ["CMI", "Child Mind Institute"]
        self.vocab_list = list(map(str.lower, self.vocab_list))
        self.vocab_list = sorted(set(self.vocab_list))
        # modify stop_words list: protected keywords must never be treated as stop-words
        self.stopwords = set(self.stopwords).difference(self.vocab_list)
        # custom regex using these vocab: each phrase maps to itself with spaces replaced by "_"
        self.vocab_dict = {w: "_".join(w.split()) for w in self.vocab_list}
        # longest phrases first so that e.g. "can you tell" wins over "can"
        self.regex_custom = re.compile('|'.join(sorted(map(re.escape, self.vocab_dict), key=len, reverse=True)))
        # for not_stopword in self.vocab_dict.values():
        #     nlp.vocab[not_stopword].is_stop = False

        # contractions
        with open(self.contractions_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.contractions = dict(json.load(f))
        # chat-words: one "ABBREVIATION=expansion" entry per line
        with open(self.chatwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.chat_words_map_dict, self.chat_words_list = {}, []
            chat_words = [x.rstrip() for x in f.readlines()]
            for line in chat_words:
                parts = line.split("=")
                if len(parts) < 2:
                    # blank or malformed line (no "="): nothing to map, skip instead of crashing
                    continue
                cw, cw_expanded = parts[0], parts[1]
                self.chat_words_list.append(cw)
                self.chat_words_map_dict[cw] = cw_expanded
            self.chat_words_list = set(self.chat_words_list)
        # emoticons: the JSON keys are joined verbatim into one alternation
        # NOTE(review): keys are used as regex fragments as-is - presumably already escaped in the file; confirm
        with open(self.emoticons_file, "r", encoding='utf-8') as f:
            self.emoticons = re.compile(u'(' + u'|'.join(k for k in json.load(f)) + u')')
        # emojis
        self.emojis = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        # greetings
        with open(self.greeting_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.greeting_words = [x.rstrip() for x in f.readlines()]
        # signature
        with open(self.signature_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.signature_words = [x.rstrip() for x in f.readlines()]
        # spell-corrector
        self.spell_checker = SpellChecker()

    def custom_cleaning(self, text, reset=False):
        """
        *** Perform custom cleaning here... ***

        Params  : text  - text to process (lower-cased and stripped first)
                  reset - False: drop the "directed share program" phrase when followed by "dsp" and join every
                                 vocab phrase with "_" so it survives tokenization as one unit
                          True : undo the joining by turning every "_" back into a space
        Return  : processed text with whitespace collapsed
        """
        key = "_"
        text = str(text).strip().lower()
        if reset is False:
            # custom replacement
            # flags must be passed by keyword: the 4th positional argument of re.sub is `count`, so the
            # previous call silently limited the substitution to the first two occurrences.
            text = re.sub(r"directed[\s\,]*share[\s\,]*program(?=[\s\(\[]*dsp[\s\)\]]*)", "", text, flags=re.IGNORECASE)
            # compile using a dict of words and their expansions, and sub them if found!
            match_and_sub = self.regex_custom.sub(lambda x: self.vocab_dict[x.group()], text)
            return re.sub(r"([\s\n\t\r]+)", " ", match_and_sub).strip()
        else:
            # reverse the change! - use this at the end of preprocessing
            text = text.replace(key, " ")
            return re.sub(r"([\s\n\t\r]+)", " ", text).strip()

    def clean(self, input_sentences):
        """
        Clean every sentence of `input_sentences` and return the cleaned strings as a list (same order).

        Steps per sentence: ASCII folding, lower-casing, optional vocab protection, emoji/emoticon removal,
        chat-word expansion, ftfy fix-up, tag/URL/timestamp removal, contraction expansion, punctuation
        handling, stop-word removal, lemmatization and whitespace normalization.
        """
        cleaned_sentences = []
        for sent in input_sentences:

            # fold to plain ASCII (accents are decomposed, non-ASCII characters dropped)
            sent = unicodedata.normalize('NFKD', str(sent)).encode('ascii', 'ignore').decode('utf-8', 'ignore')

            # lowercasing
            sent = str(sent).strip().lower()

            # <---------- CUSTOM CLEANING -------------->
            #
            # Mark imp keywords such as: Domain specific, Question words(wh-words), etc, using "vocabulary".
            # Create a "vocab_dict" first.  This dict shall be used to join these keywords (i.e. join them using "_" ),
            # during pre-processing step, and later un-joined.
            if self.custom:
                sent = self.custom_cleaning(sent, reset=False)
            #
            # <---------- CUSTOM CLEANING -------------->

            # remove Emojis
            sent = self.emojis.sub(r'', sent)

            # remove emoticons
            sent = self.emoticons.sub(r'', sent)

            # expand common chat-words (the lookup table is keyed by the upper-cased abbreviation)
            sent = " ".join([self.chat_words_map_dict[w.upper()] if w.upper() in self.chat_words_list else w for w in sent.split()])

            # FIX text
            sent = ftfy.fix_text(sent)

            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')

            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)

            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)

            # Removing web urls
            # (character class fixed to "0-9": it previously contained an en dash, so digits 1-8 were not matched)
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)

            # Removing date formats ("YYYY-MM-DD hh:mm:ss :" prefixes)
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)

            # removing punctuations
            # - disable them, when sentence structure needs to be retained.
            sent = re.sub(r"[\$|\#\@\*\%]+\d+[\$|\#\@\*\%]+", " ", sent)
            sent = re.sub(r"\'s", " \'s", sent)
            sent = re.sub(r"\'ve", " \'ve", sent)
            sent = re.sub(r"n\'t", " n\'t", sent)
            sent = re.sub(r"\'re", " \'re", sent)
            sent = re.sub(r"\'d", " \'d", sent)
            sent = re.sub(r"\'ll", " \'ll", sent)
            sent = re.sub(r"[\/,\@,\#,\\,\{,\},\(,\),\[,\],\$,\%,\^,\&,\*,\<,\>]", " ", sent)
            sent = re.sub(r"[\,,\;,\:,\-]", " ", sent)      # main puncts
            # sent = re.sub(r"[\!,\?,\.]", " ", sent)       # sentence delimitters

            # keep only text & numbers
            # - enable them, when only text and numbers matter
            # sent = re.sub(r"\s+", " ", re.sub(r"[\\|\/|\||\{|\}|\[|\]\(|\)]+", " ", re.sub(r"[^A-z0-9]", " ", str(sent))))

            # correct spelling mistakes
            # - enable them when english spelling mistakes matter
            # sent = " ".join([self.spell_checker.correction(w) if w in self.spell_checker.unknown(sent.split()) else w for w in sent.split()])

            # remove 'modified' self.stopwords
            sent = " ".join(token.text for token in nlp(sent) if token.text not in self.stopwords)

            # lemmatize (second pass on the stop-word-free text)
            sent = " ".join(token.lemma_ for token in nlp(sent) if token.text not in self.stopwords)

            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).strip()

            # <---------- CUSTOM CLEANING -------------->
            #
            # reverse the custom cleaning step!
            if self.custom:
                sent = self.custom_cleaning(sent, reset=True)
            #
            # <---------- CUSTOM CLEANING -------------->

            cleaned_sentences.append(sent.strip().lower())
        return cleaned_sentences

In [None]:
## data pre-processing
# custom=True makes clean() protect the phrases of vocab_list (joined with "_" during cleaning, un-joined at the end)
preprocessText = basic_preprocessing(resources_dir_path, vocab_list=vocab_list, custom=True)

# Master data
# clean() returns a plain list in row order, which is assigned as the new clean-text column
df[clean_text_col] = preprocessText.clean(df[text_col])
# df[unique_id_col] = ["ch_{}".format(i) for i in range(0, len(df))]
df[clean_text_col] = df[clean_text_col].fillna(value="NONE")
print("pre-processing completed.")

In [None]:
## Generic engineering

# basic size features of the raw text
df['char_len'] = df[text_col].apply(lambda x: len(str(x)))
df['word_len'] = df[text_col].apply(lambda x: len(str(x).split()))
# Junk phrases are literal text: lower-case them (the text is matched lower-cased) and escape them so
# characters such as "." or "(" are not interpreted as regex syntax.
regunk = re.compile(r"\b({})\b".format("|".join(re.escape(w.lower()) for w in greeting_words + signature_words)))
# str(x) mirrors the two features above so a non-string cell cannot raise; search() stops at the first hit.
df['containsJunkWord'] = df[text_col].apply(lambda x: 1 if regunk.search(str(x).lower().strip()) else 0)

# Vectorization

In [None]:
from pathlib import Path
import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    TFAutoModel
)

WORKING_DIR = Path("/kaggle/working")
print('Transformers version',transformers.__version__) # Current version: 2.3.0

In [None]:
## Vectorization using Fine-tuned Sentence BERT (3dr corpus 11k documents)

class BertTransformer(BaseEstimator, TransformerMixin):
    """
    scikit-learn style transformer that turns a list of texts into sentence embeddings.

    The texts are tokenized with `tokenizer`, passed through `model` without gradient tracking, and the
    token embeddings are mean-pooled using the attention mask.

    Params  : tokenizer      - callable tokenizer (used with padding/truncation, returns 'pt' tensors)
              model          - model called with the encoded input; put into eval mode here
              max_length     - truncation length handed to the tokenizer
              embedding_func - stored for API compatibility; NOTE: transform() does not call it,
                               the embeddings always come from mean pooling
    """

    def __init__(self, tokenizer, model, max_length=128, embedding_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,):
        self.tokenizer = tokenizer
        self.model = model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func
        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Average the token embeddings, taking the attention mask into account so padding is ignored."""
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # clamp avoids a division by zero for a row whose mask is entirely zero
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _tokenize(self, text):
        """Encode `text` (a list of strings) and return one mean-pooled embedding row per input string."""
        # Tokenize with the tokenizer given to THIS instance (previously a global `tokenizer` was used,
        # which ignored the constructor argument and failed when no such global existed).
        encoded_input = self.tokenizer(text, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')

        # Compute token embeddings
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform mean pooling
        return self._mean_pooling(model_output, encoded_input['attention_mask'])

    def transform(self, text: List[str]):
        """Return the sentence embeddings for `text` (a list of strings or a pandas Series)."""
        if isinstance(text, pd.Series):
            text = text.tolist()
        return self._tokenize(text)

    def fit(self, X, y=None):
        """No fitting required so we just return ourselves. For fine-tuning, refer to shared gpu-code!"""
        return self

In [None]:
# BERT VECTORIZATION
# model identifier handed to from_pretrained() below
m_fp = 'sentence-transformers/all-distilroberta-v1'

# load tokenizer, model classes
# NOTE: `tokenizer` and `model_bert` are also read as globals by Generate_Similarity_Matrix (vectorizer='bert')
tokenizer = AutoTokenizer.from_pretrained(m_fp)
model_bert = AutoModel.from_pretrained(m_fp)

# load vectorizer
bert_vectorizer = BertTransformer(tokenizer, model_bert, embedding_func=lambda x: x[0][:, 0, :].squeeze())
print("Vectorization class loaded.")

In [None]:
bert_vectorizer

# Contextual similarity

In [None]:
# Textual + Semantic Similarity comparison analysis
# Textual + Semantic Similarity comparison analysis
class Generate_Similarity_Matrix(object):
    """
    Pair-wise cosine similarity between a "master" Series of strings and either itself (default) or a
    second "duplicates" Series.

    Usage: construct, call fit() to vectorize and compute the sparse similarity matrix, then get_matches()
    to obtain the matches as a DataFrame (with the original ids when id Series were passed).

    Params  : master, duplicates       - pandas.Series containing only strings (duplicates optional)
              master_id, duplicates_id - optional pandas.Series of ids, same length as their text Series
              min_similarity           - lower bound handed to awesome_cossim_topn
              vectorizer               - 'tfidf' (character n-grams) or 'bert' (uses the module-level
                                         `tokenizer` and `model_bert`)
    Raises  : TypeError  - a text input is not a Series of strings
              ValueError - unsupported vectorizer name
              Exception  - inconsistent combination / lengths of the Series
    """

    SUPPORTED_VECTORIZERS = ('tfidf', 'bert')

    def __init__(self, master, duplicates=None, master_id=None, duplicates_id=None, min_similarity=0.80, vectorizer='tfidf'):
        # UTILITY FUNCTIONS
        def _is_series_of_strings(series_to_test: pd.Series):
            if not isinstance(series_to_test, pd.Series):
                return False
            # Series.map instead of the deprecated DataFrame.applymap round-trip; same verdict
            return bool(series_to_test.map(lambda x: isinstance(x, str)).all())

        def _is_input_data_combination_valid(duplicates, master_id, duplicates_id):
            # ids for duplicates without duplicates make no sense
            if duplicates is None and duplicates_id is not None:
                return False
            # with duplicates, ids must be given for both sides or for neither
            if duplicates is not None and ((master_id is None) ^ (duplicates_id is None)):
                return False
            return True

        # VALIDATE INPUT ARGS
        if not _is_series_of_strings(master) or (duplicates is not None and not _is_series_of_strings(duplicates)):
            raise TypeError('Input does not consist of pandas.Series containing only Strings')
        if not _is_input_data_combination_valid(duplicates, master_id, duplicates_id):
            raise Exception('List of data Series options is invalid')
        if master_id is not None and len(master) != len(master_id):
            raise Exception('Both master and master_id must be pandas.Series of the same length.')
        if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id):
            raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.')
        # Fail fast: an unknown name used to leave `_vectorizer` unset and only blew up later inside fit()
        if vectorizer not in self.SUPPORTED_VECTORIZERS:
            raise ValueError("Unsupported vectorizer {!r}; expected one of {}".format(vectorizer, list(self.SUPPORTED_VECTORIZERS)))

        # SAVE INPUT ARGS
        self._master = master
        self._duplicates = duplicates
        self._master_id = master_id
        self._duplicates_id = duplicates_id
        self.min_similarity = min_similarity
        self.vectorizer_name = vectorizer     # tfidf, bert

        # CONFIG
        self._true_max_n_matches = None
        # top-n is set to the size of the "duplicates" side, i.e. no match is ever cut off by the top-n limit
        self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
        self.ngram_size = 3
        self.regex = r'[,-./]|\s'
        # leave one core free, but never report fewer than one worker (single-core machines gave 0)
        self.number_of_processes = max(1, multiprocessing.cpu_count() - 1)
        self.DEFAULT_COLUMN_NAME = 'side'
        self.DEFAULT_ID_NAME = 'id'
        self.LEFT_PREFIX = 'left_'
        self.RIGHT_PREFIX = 'right_'
        self._matches_list = pd.DataFrame()
        self.is_build = False  # indicates if fit has been called or not

        # -- INIT VECTORIZER --
        if self.vectorizer_name == "tfidf":
            def get_n_grams(string):
                # lower-case, drop separators (self.regex), then slide a window of ngram_size characters
                string = re.sub(self.regex, r'', string.lower())
                n_grams = zip(*[string[i:] for i in range(self.ngram_size)])
                return [''.join(n_gram) for n_gram in n_grams]
            # - fitted later in get_vectorized_matrices()
            self._vectorizer = TfidfVectorizer(min_df=1, analyzer=get_n_grams, dtype=np.float64)
        else:
            # "bert": relies on the module-level `tokenizer` and `model_bert`
            self._vectorizer = BertTransformer(tokenizer, model_bert, embedding_func=lambda x: x[0][:, 0, :].squeeze())
        # -- INIT VECTORIZER --

    def fit(self):
        """
        Vectorize Master & Duplicates and calculate the cosine similarities (without original ids).
        Params  : Master, Duplicates (taken from the instance)
        Return  : self; the matches are stored as a dataframe { master_side, dupe_side, similarity } holding the
                  positional row numbers of both sides and their cosine similarity
        """
        # UTILITY FUNCTIONS
        def fix_diagonal(m: lil_matrix):
            r = np.arange(m.shape[0])
            m[r, r] = 1
            return m

        # Vectorize the matrices
        # - if duplicate matrix is present use it, else utilize master itself
        master_matrix, duplicate_matrix = self.get_vectorized_matrices()

        # Calculate cosine similarity b/w master & duplicates (if passed, else use master itself)
        matches = self.build_matches(master_matrix, duplicate_matrix)
        self._true_max_n_matches = self._max_n_matches - 1

        # Correct sparse matrix multiplication
        if self._duplicates is None:
            # convert to lil format for best efficiency when setting matrix-elements
            # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by floating-point
            # computations in awesome_cossim_topn sometimes lead to unexpected results)
            matches = matches.tolil()
            matches = fix_diagonal(matches)
            # The former "symmetrize" step was removed: its guard (_max_n_matches < _max_n_matches - 1) could
            # never be true, so it never ran.
            matches = matches.tocsr()

        # Create the basic "matches" dataframe with "Master, Duplicate and Similarity" cols only
        r, c = matches.nonzero()
        self._matches_list = pd.DataFrame({'master_side': r.astype(np.int64), 'dupe_side': c.astype(np.int64), 'similarity': matches.data})
        self.is_build = True
        return self

    def get_vectorized_matrices(self):
        """
        Vectorize matrices using the configured vectorizer.
        Params    : Master, Duplicates, Vectorizer_name("tfidf", "bert")
        Return    : vectorized_master, vectorized_duplicates (the master matrix itself when no duplicates were given)
        """
        def fit_vectorizer():
            # if both master & duplicates series are set - concat them to fit the vectorizer on all strings at once!
            if self._duplicates is not None:
                strings = pd.concat([self._master, self._duplicates])
            else:
                strings = self._master
            self._vectorizer.fit(strings)
            return self._vectorizer

        def bert_to_csr(series):
            # L2-normalize the embeddings, then convert the tensor to a CSR matrix (np.float64)
            embeddings = self._vectorizer.transform(series)
            return csr_matrix(F.normalize(embeddings).numpy().astype(np.float64))

        if self.vectorizer_name == "tfidf":
            print("tfidf")
            self._vectorizer = fit_vectorizer()
            to_matrix = self._vectorizer.transform
        else:
            print("bert")
            to_matrix = bert_to_csr

        master_matrix = to_matrix(self._master)
        if self._duplicates is not None:
            duplicate_matrix = to_matrix(self._duplicates)
        else:
            # IF there is no duplicate matrix, match on the master matrix itself!
            duplicate_matrix = master_matrix

        return master_matrix, duplicate_matrix

    def build_matches(self, master_matrix, duplicate_matrix):
        """
        Builds the cosine similarity matrix of two CSR matrices.
        Params   : vectorized_master, vectorized_duplicates
        Return   : sparse matrix returned by awesome_cossim_topn(master, duplicates^T, top-n, min_similarity)
        """
        # Matrix A, B
        tf_idf_matrix_1 = master_matrix
        tf_idf_matrix_2 = duplicate_matrix.transpose()

        # Calculate cosine similarity
        optional_kwargs = {'use_threads': self.number_of_processes > 1, 'n_jobs': self.number_of_processes}
        cosine_sim_matrix = awesome_cossim_topn(tf_idf_matrix_1,
                                                tf_idf_matrix_2,
                                                self._max_n_matches,
                                                self.min_similarity,
                                                **optional_kwargs)
        return cosine_sim_matrix

    def get_matches(self):
        """
        Creates the complete dataframe with index matching(ids) if passed.
        Params  : matches computed by fit()
        Return  : dataframe{ left text (+ its original index), [left ids], similarity, [right ids], right text
                  (+ its original index) }; the column names carry the "left_" / "right_" prefixes
        Raises  : Exception when min_similarity <= 0
        """
        # UTILITY FUNCTIONS
        def get_both_sides(master, duplicates, generic_name=(self.DEFAULT_COLUMN_NAME, self.DEFAULT_COLUMN_NAME), drop_index=False):
            lname, rname = generic_name
            left = master if master.name else master.rename(lname)
            left = left.iloc[matches_list.master_side].reset_index(drop=drop_index)
            if self._duplicates is None:
                right = master if master.name else master.rename(rname)
            else:
                right = duplicates if duplicates.name else duplicates.rename(rname)
            right = right.iloc[matches_list.dupe_side].reset_index(drop=drop_index)
            # with drop_index=False both sides are DataFrames (index + text); the right one is mirrored
            return left, (right if isinstance(right, pd.Series) else right[right.columns[::-1]])

        def prefix_column_names(data, prefix):
            if isinstance(data, pd.DataFrame):
                return data.rename(columns={c: f"{prefix}{c}" for c in data.columns})
            else:
                return data.rename(f"{prefix}{data.name}")

        if self.min_similarity > 0:
            matches_list = self._matches_list
        else:
            raise Exception("min_similarity cannot be set to less than or equal to 0!")

        # ID Retrieval
        left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=False)
        similarity = matches_list.similarity.reset_index(drop=True)
        if self._master_id is None:
            # if ids are not passed
            return pd.concat([prefix_column_names(left_side, self.LEFT_PREFIX),
                              similarity,
                              prefix_column_names(right_side, self.RIGHT_PREFIX)], axis=1)

        else:
            # if ids are passed, retrieve ids
            left_side_id, right_side_id = get_both_sides(self._master_id, self._duplicates_id, (self.DEFAULT_ID_NAME, self.DEFAULT_ID_NAME), drop_index=True)
            return pd.concat([prefix_column_names(left_side, self.LEFT_PREFIX),
                              prefix_column_names(left_side_id, self.LEFT_PREFIX),
                              similarity,
                              prefix_column_names(right_side_id, self.RIGHT_PREFIX),
                              prefix_column_names(right_side, self.RIGHT_PREFIX)], axis=1)

def match_strings(master, duplicates=None, master_id=None, duplicates_id=None, min_similarity=0.80, vectorizer=None):
    """
    Find pair-wise similarity b/w Master & Duplicate Matrices.

    Params  : master, duplicates, master_id, duplicates_id, min_similarity - forwarded to Generate_Similarity_Matrix
              vectorizer - 'tfidf' or 'bert' (case and surrounding whitespace are ignored)
    Return  : dataframe produced by Generate_Similarity_Matrix.get_matches()
    Raises  : Exception when no (or an unknown) vectorizer name is passed
    """
    chosen = vectorizer.lower().strip() if vectorizer else None
    if chosen not in ('tfidf', 'bert'):
        raise Exception("Vectorizer is not passed or incorrect! Please select one: [tfidf, bert]")

    similarity_builder = Generate_Similarity_Matrix(
        master,
        duplicates=duplicates,
        master_id=master_id,
        duplicates_id=duplicates_id,
        min_similarity=min_similarity,
        vectorizer=chosen,
    )
    # run vectorizer & generate basic pair-wise cosine sim matrix
    similarity_builder.fit()
    # add ids (if passed) to the sim matrix and hand the frame back
    return similarity_builder.get_matches()

#### Run Similarity Computation on 1 master file

In [None]:
def run_Similarity_on_MasterFile(df, master_min_similarity=0.75):
    """
    Run similarity computation on pre-processed Master file involving Dup identification, collection & removal,
    finally running similarity analysis on remaining unique rows.

    Column names are read from the module-level ::config:: variables (master_id_col, master_clean_text_col,
    master_dup_id_col, master_similar_id_col, master_dup_similar_id_col); they must be set before calling.

    param    : df - dataframe (single file containing the id column and the cleaned-text column)
               master_min_similarity - minimum similarity forwarded to match_strings
    return   : copy of the input dataframe (no rows dropped) with the columns 'dup_idx', 'similar_idx',
               'dup_similar_idx' and 'cluster_id' added; a value is -1 when the id has no entry.
               The input dataframe itself is left untouched.
    """

    ## Utility: alphanumeric sort
    _nsre = re.compile('([0-9]+)')

    def natural_sort_key(s):
        # splitting on a capturing group alternates text / digit-run pieces,
        # so every odd position is a run of ASCII digits and every even position is text
        return [int(text) if pos % 2 else text.lower()
                for pos, text in enumerate(_nsre.split(s))]

    def sort_ids(ids):
        # custom sorting (to handle alphanumeric ids): string ids are ordered naturally ("id2" < "id10");
        # the raw string is the tie-breaker so that the order is deterministic
        ids = list(ids)
        if ids and all(isinstance(_id, str) for _id in ids):
            return sorted(ids, key=lambda s: (natural_sort_key(s), s))
        return sorted(ids)

    # 1. collect duplicate ids based on "text" col
    def collect_dups(data, id_col, dup_col, output_col_name):
        dup_dict = data.groupby(data[dup_col].tolist())[id_col].agg(list).to_dict()
        data[output_col_name] = data[dup_col].apply(lambda txt: dup_dict[txt])
        return data

    # 2. drop dup ids, keep first
    def drop_dups(data, col):
        return data.drop_duplicates(subset=[col]).reset_index(drop=True)

    # 3. collect similar pairs
    def pairwise_similarity_matrix(data, id_col, text_col, similar_id_col, min_similarity=0.75):
        # TEXTUAL SIMILARITY

        # MODULE 1 :: pair-wise textual similarity
        matches = match_strings(master=data[text_col], master_id=data[id_col], min_similarity=min_similarity, vectorizer='bert')

        # group similar-pairs together (left-join)
        left_col_name, left_unique_id, right_unique_id = "left_%s" % text_col, "left_%s" % id_col, "right_%s" % id_col

        # the aggregated column is named after similar_id_col so that it matches the later merge
        match_df = (matches.groupby([left_col_name, left_unique_id])[right_unique_id]
                    .agg(**{similar_id_col: lambda x: sorted(set(x))})
                    .reset_index()
                    .sort_values(by=[left_unique_id], ascending=True)
                    .reset_index(drop=True))

        # asthestic: drop dummy added left/right names
        matches = matches.drop(columns=['left_index', "right_index"])
        match_df = match_df.rename(columns={left_unique_id: id_col, left_col_name: text_col})

        print(match_df.columns)
        print(match_df.head(3))

        return matches, match_df

    # 4. create "dup_similar_idx" col - merge dup_id data with similar_id data
    def combine_dup_similar(data, match_df, id_col, dup_id_col, similar_id_col, dup_sim_id_col):

        # merge df_duplicates with df_similar
        data = data.merge(match_df[[id_col, similar_id_col]], on=id_col, how='outer')

        # a row without any similar match comes out of the merge as NaN: treat it as "no similar ids"
        data[similar_id_col] = [ids if isinstance(ids, list) else [] for ids in data[similar_id_col]]

        # create combined list: dup_ids + similar_ids
        # --> "dup_similar_id_col" == "duplicated_pairs_idx" + "similar_pairs_idx
        data[dup_sim_id_col] = [sort_ids(set(dups) | set(sims))
                                for dups, sims in zip(data[dup_id_col], data[similar_id_col])]
        return data

    # 5. merged all nested lists containing common sub-elements in "dup_similar_id" cols
    def collect_similar_ids(data, id_col, dup_id_col, similar_id_col, dup_sim_id_col):

        # merge all nested lists with common elements: chain the members of every list with edges,
        # then every connected component of the graph is one merged group
        g = nx.Graph()
        for ids in data[dup_sim_id_col]:
            ids = list(ids)
            g.add_nodes_from(ids)                 # keeps single-member lists in the graph
            g.add_edges_from(zip(ids, ids[1:]))
        merged_similar_idx = [sort_ids(c) for c in nx.connected_components(g)]

        # create two mappings, one for storing cluster_id: list of ids, and one inverted dict
        # --> "id_clus_dict" is the cluster id mapping for each 'unique_id'
        clus_id_dict = {}  # cluster_1: merged([id1, id2,..., idn])
        id_clus_dict = {}  # id1: cluster_1; id2: cluster_1; ..; idn: cluster_1
        for temp_id, lst in enumerate(merged_similar_idx, start=1):
            key = "cluster_%s" % temp_id
            clus_id_dict[key] = lst
            for value in lst:
                id_clus_dict[value] = key

        # assign dup_similar_idx based on two mappings above
        # (works on the `data` argument, not on a variable of the enclosing function)
        data[dup_sim_id_col] = data[id_col].apply(lambda uid: clus_id_dict[id_clus_dict[uid]])

        # create duplicate id mapping and similar id mapping files
        dup_id_dict = {_id: ids for ids in data[dup_id_col].tolist() for _id in ids}
        sim_id_dict = {_id: ids for ids in data[similar_id_col].tolist() for _id in ids}
        dup_sim_id_dict = {_id: ids for ids in data[dup_sim_id_col].tolist() for _id in ids}
        return data, clus_id_dict, id_clus_dict, dup_id_dict, sim_id_dict, dup_sim_id_dict

    # 6. Drop duplicates based on dup_similar_id_col, i.e. duplicated_id + similar_ids
    def create_final_single_matrix(data, dup_sim_id_col):
        # lists are not hashable: switch to tuples for the de-duplication, then back to lists
        data[dup_sim_id_col] = tuple(map(tuple, data[dup_sim_id_col]))
        data = data.drop_duplicates(subset=[dup_sim_id_col]).reset_index(drop=True)
        data[dup_sim_id_col] = list(map(list, data[dup_sim_id_col]))
        return data

    # 7. Expand each id to assign clusters
    def create_clusters(data, id_col, idx_cluster_map):
        data['cluster_id'] = data[id_col].apply(lambda uid: idx_cluster_map.get(uid, -1))
        return data

    # 8. Display Analytics
    def run_stats(original, reduced):
        print("Stats:\n"
          "\nOrignal number of records = {}"
          "\nTotal dup count = {}"
          "\nTotal similar pairs found = {}"
          "\nFinal number of records post dup-similar rows removal = {}".format(len(original),
                                                                              sum(len(ids) for ids in reduced[master_dup_id_col]),
                                                                              sum(len(ids) for ids in reduced[master_similar_id_col]),
                                                                              len(reduced)))

    ## EXECUTE ##
    original_df = df.copy()
    work = df.copy()   # the helpers add columns in place: keep the caller's frame untouched
    work = collect_dups(work, master_id_col, master_clean_text_col, master_dup_id_col)
    work = drop_dups(work, master_clean_text_col)
    matches, match_df = pairwise_similarity_matrix(work, master_id_col, master_clean_text_col, master_similar_id_col, min_similarity=master_min_similarity)
    work = combine_dup_similar(work, match_df, master_id_col, master_dup_id_col, master_similar_id_col, master_dup_similar_id_col)
    work, cluster_id_map, idx_cluster_map, dup_id_dict, sim_id_dict, dup_sim_id_dict = collect_similar_ids(work, master_id_col, master_dup_id_col, master_similar_id_col, master_dup_similar_id_col)
    work = create_final_single_matrix(work, master_dup_similar_id_col)
    work = create_clusters(work, master_id_col, idx_cluster_map)
    # save back in original df (without dups or similar dropped!)
    original_df['dup_idx'] = original_df[master_id_col].apply(lambda uid: dup_id_dict.get(uid, -1))
    original_df['similar_idx'] = original_df[master_id_col].apply(lambda uid: sim_id_dict.get(uid, -1))
    original_df['dup_similar_idx'] = original_df[master_id_col].apply(lambda uid: dup_sim_id_dict.get(uid, -1))
    original_df['cluster_id'] = original_df[master_id_col].apply(lambda uid: idx_cluster_map.get(uid, -1))
    run_stats(original_df, work)
    return original_df

In [None]:
# :: config ::
# Column names used by run_Similarity_on_MasterFile, which reads them as module-level names.

master_id_col = 'ID'
master_text_col = 'Corpus'
master_clean_text_col = 'clean_text'
master_dup_id_col = "dup_idx"
master_similar_id_col = "similar_idx"
master_dup_similar_id_col = "dup_similar_idx"

# replace missing clean text with the placeholder "NONE" (rows are kept, not dropped)
df[master_clean_text_col] = df[master_clean_text_col].fillna(value="NONE")
print("Settings set into config.")

In [None]:
# get contextually similar records
# (df is replaced by the returned frame, which carries the dup / similar / cluster id columns)

df = run_Similarity_on_MasterFile(df, master_min_similarity=0.75)

# Iterative Clustering

In [None]:
### Clustering Algorithm

class doc_clustering():
    """
    KMeans document clustering helper.

    Holds the clustering configuration (read from the module-level KMEANS_* / CLUSTER_* / COHESION_THRESHOLD
    settings), the stopword list loaded from "<resource_dir>/stopwords.txt", and the vectorizer choice
    ('tfidf' or 'bert').
    """

    # lemmas that are kept even when they appear in the stopword file
    _KEEP_LEMMAS = frozenset(['want', 'against', 'further', 'online', 'same', 'under',
                              'what', 'when', 'own', ''])
    # bracket / pronoun / number placeholders and punctuation that are always dropped
    _SKIP_LEMMAS = frozenset([":", "-lrb-", "-rrb-", "-lsb-", "-rsb-", "\\", '-pron-', '_', 'card num', '"'])
    # lemma made only of non-word characters (punctuation, symbols) or empty
    _NON_WORD_RE = re.compile(r'^\W*$')

    def __init__(self, resource_dir, spacy_model, vectorizer):
        self.stopwd_file = os.path.join(resource_dir, "stopwords.txt")
        self.spacy_model = spacy_model
        self.vectorizer = vectorizer.strip().lower()
        # :: clustering config ::
        self.clustering_type = "noun"
        self.kmeans_rate = KMEANS_RATE
        self.cluster_k = CLUSTER_K
        self.kmeans_seed_init = KMEANS_SEED_INIT
        self.kmeans_maxiter = KMEANS_MAXITER
        self.cluster_length = CLUSTER_LEN
        self.cohesion_threshold = COHESION_THRESHOLD
        # :: resource config ::
        self.stopwords = []
        self.load_resources()
        self.test_mode = "no"

    def load_resources(self):
        """Load the stopword file (one stopword per line) into self.stopwords."""
        with io.open(self.stopwd_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords = [x.rstrip() for x in f.readlines()]

    def get_pos_list(self, results):
        """
        Build the token string and the filtered lemma string of one annotated text.

        param    : results - dict with a 'sentences' list; every sentence has a 'tokens' list whose items
                   carry at least 'pos', 'word' and 'lemma'
        return   : (all words joined by a space, kept lemmas joined by a space or "NO_NP_KEYS" when none
                   is kept, " ", " ")
        """
        stopwords = set(self.stopwords)   # one set per call: O(1) membership test per token
        word_list = []
        lemma_list = []
        for line in results['sentences']:
            for token in line['tokens']:
                pos = token['pos'].lower()
                word_list.append(token['word'])
                lemma = token['lemma'].lower()
                is_stopword = lemma in stopwords and lemma not in self._KEEP_LEMMAS
                if (is_stopword
                        or lemma in self._SKIP_LEMMAS
                        or pos == ":" or pos == "."
                        or self._NON_WORD_RE.search(lemma)
                        or len(lemma) >= 30):
                    continue
                lemma_list.append(lemma)

        sent_lemma = " ".join(lemma_list) if lemma_list else "NO_NP_KEYS"
        return " ".join(word_list), sent_lemma, " ", " "

    def get_sent_lemma(self, sent):
        """Annotate `sent` with run_pipeline (using self.spacy_model) and return get_pos_list of the result."""
        # allowed spacy attributes
        props_str = [{'value': 'false', 'key': 'enforceRequirements'},
                     {'value': 'json', 'key': 'outputFormat'},
                     {'value': 'normalizeSpace=false, strictTreebank3=true', 'key': 'tokenize.options'},
                     {'value': 'tokenize,ssplit,pos,lemma,ner', 'key': 'annotators'},
                     {'value': 'true', 'key': 'ssplit.eolonly'}]
        input_json = {'text': sent, 'props': props_str}
        results = run_pipeline(input_json, self.spacy_model)
        return self.get_pos_list(results)

    def run_doc_clustering(self, documents, k_val, run_analysis=False):
        """
        Vectorize `documents` and cluster them with KMeans.

        param    : documents - list of texts
                   k_val - number of clusters; 0 means "use the heuristic". A positive value is only
                   honoured when clustering_type is "noun".
                   run_analysis - when True (and k_val is 0 and the heuristic k is >= 25), search
                   k in [heuristic-20, heuristic+20) for the best silhouette score
        return   : (fitted KMeans model, cluster labels, cluster centers, vectorized documents X)
        raises   : Exception for an unknown vectorizer, or when k_val is larger than half the number
                   of documents (clustering_type "noun")
        """
        # NOTE(review): self.kmeans_maxiter is not applied here, both KMeans calls use max_iter=2000
        ninit = self.kmeans_seed_init
        r_num = len(documents) - 1   # random_state depends on the input size

        # Vectorization
        if self.vectorizer == "tfidf":
            vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1))
            X = vectorizer.fit_transform(documents)
            print("Vectorizing using...tfidf")
        elif self.vectorizer == "bert":
            X = F.normalize(bert_vectorizer.transform(documents), p=2, dim=1)
            X = csr_matrix(X.numpy().astype(np.float64))
            print("Vectorizing using...bert")
        else:
            raise Exception("ERROR: Please specify a vectorizer to use: options: ['tfidf', 'bert']")
        print("Vectorization complete!")

        # Value of "K" clusters
        true_k = k_val = int(k_val)

        ## --> Heusristics <--
        heuristic_k = int(np.sqrt(len(documents) / 2))
        if self.clustering_type == "fixed":
            true_k = 2000
        elif self.clustering_type == "noun":
            true_k = heuristic_k * self.kmeans_rate
        elif self.clustering_type == "verb":
            true_k = heuristic_k
        else:
            print("Please set up clustering type! Options=['fixed', 'noun', 'verb']")
        print("Heusristic value of K: true_k=", true_k)

        ## --> Elbow method (silhouette) <--
        if run_analysis and k_val == 0 and true_k >= 25:
            search_range = range(true_k - 20, true_k + 20, 1)
            sil_score_max = -1  # minimum possible score
            for n_clusters in search_range:
                model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=2000, random_state=r_num, n_init=ninit)
                labels = model.fit_predict(X)
                sil_score = silhouette_score(X, labels)
                print("The average silhouette score for %i clusters is %0.2f" % (n_clusters, sil_score))
                if sil_score > sil_score_max:
                    true_k, sil_score_max = n_clusters, sil_score
            print("Silhouette Analysis completed! Best_n_clusters K=", true_k)

        if k_val > 0 and self.clustering_type == "noun":
            if len(documents) / 2 < k_val:
                # the whole message is one literal, so that .format fills all three placeholders
                debug_msg = ("Number of input lines: {}\n"
                             "K value provided: {}\n"
                             "Suggested value of K for the 1st level clustering is less than input-lines divided by 2\n"
                             "Please try again!  Heusristic value of K (if line number > 100): {}"
                             ).format(len(documents), k_val, heuristic_k * self.kmeans_rate)
                raise Exception(debug_msg)
            true_k = k_val

        # Final Clustering
        if true_k == 0:
            true_k = 1
        model = KMeans(n_clusters=true_k, init='k-means++', max_iter=2000, random_state=r_num, n_init=ninit)
        model.fit(X)

        cluster_labels = model.labels_
        cluster_centers = model.cluster_centers_
        print("Clustering is complete.")

        return model, cluster_labels, cluster_centers, X

#### Clustering Utility

In [None]:
def cluster_cohesion_cosine(clustering, cluster_set_by_sid, data):
    '''
    :: COHESION SCORE ::
    Calculates the cohesion score of one cluster. Accepts 1 cluster with all members and re-runs clustering
    on them with k=1, e.g. KMeans(cluster_id_1_documents, k=1), then averages the cosine similarity between
    every member and that single center.

    param    : clustering - object exposing run_doc_clustering(documents, k)
               cluster_set_by_sid - ids of the cluster members (keys of `data`)
               data - dict: id -> info dict holding at least 'sent_lemma'
    return   : average similarity of the members to the center; 0 for an empty cluster
    '''
    document_center = [data[sid]['sent_lemma'] for sid in cluster_set_by_sid]

    # clustering one cluster members with k=1 to find relative distances
    _, _, cluster_centers_center, X_center = clustering.run_doc_clustering(document_center, 1)

    values = []
    for i in range(len(document_center)):
        # SciPy's distance functions expect 1-D vectors: flatten the (1, n_features) row
        d_vec = X_center[i].toarray().ravel()
        simval = spatial.distance.cosine(d_vec, cluster_centers_center[0, :])
        # the distance is undefined (nan) for an all-zero vector: count it as "no similarity"
        values.append(0 if math.isnan(simval) else 1 - simval)

    if not values:
        return 0
    # avg similarity of all members
    return sum(values) / len(values)

def cluster_center_sent(clustering, cluster_set_by_sid, data):
    '''
    :: CLUSTER CENTRE ::
    Re-clusters the members of one cluster with k=1, measures the cosine similarity of every member to the
    single center and picks the two closest members as seeds.

    param    : clustering - object exposing run_doc_clustering(documents, k)
               cluster_set_by_sid - ids of the cluster members (keys of `data`)
               data - dict: id -> info dict holding at least 'sent_lemma' and 'count'
    return   : (seedid1, seedid2, values, values_sid_set) where values[i] is the similarity to the center of
               member values_sid_set[i]; only members with count > 0 are listed. seedid2 equals seedid1
               when there is a single candidate.
    raises   : ValueError when no member has count > 0
    '''
    members = list(cluster_set_by_sid)
    non_empty = [sid for sid in members if len(data[sid]['sent_lemma']) > 0]

    if len(non_empty) == 1:
        # only one member carries text: it is the seed. The same 4-tuple shape as the normal path is
        # returned so that callers can always unpack four values; the text-less members are reported
        # with similarity 0 (i.e. maximum distance to the center).
        seed = non_empty[0]
        values_sid_set = [sid for sid in members if data[sid]["count"] > 0]
        values = [1 if sid == seed else 0 for sid in values_sid_set]
        return seed, seed, values, values_sid_set

    # clustering one cluster members with k=1 to find relative distances
    document_center = [data[sid]['sent_lemma'] for sid in members]
    _, _, cluster_centers_center, X_center = clustering.run_doc_clustering(document_center, 1)
    center = cluster_centers_center[0, :]

    values = []
    values_sid_set = []
    for row, sid in enumerate(members):
        if data[sid]["count"] > 0:
            # SciPy's distance functions expect 1-D vectors: flatten the (1, n_features) row
            distance = spatial.distance.cosine(X_center[row].toarray().ravel(), center)
            # an undefined distance (nan) is scored 1 here
            values.append(1 if math.isnan(distance) else 1 - distance)
            values_sid_set.append(sid)

    if not values:
        raise ValueError("cluster_center_sent: no cluster member with count > 0")

    # positions ordered by decreasing similarity; the sort is stable, so ties keep the member order
    ranked = sorted(range(len(values)), key=lambda pos: values[pos], reverse=True)
    seedid1 = values_sid_set[ranked[0]]
    seedid2 = values_sid_set[ranked[1]] if len(ranked) > 1 else seedid1

    return seedid1, seedid2, values, values_sid_set

#### Clustering Pipeline

In [None]:
def run_clustering(df_cluster, spacy_model, preprocess=False, vectorizer='bert'):
    """
    Two-level KMeans clustering of the texts in `df_cluster`.

    Steps: build a per-row info dict, cluster all texts (k from the heuristic), re-cluster every first-level
    cluster that is larger than the configured length and less cohesive than the configured threshold, then
    report for every row its final cluster, the two seed texts of the cluster, its distance to the cluster
    center and the cluster cohesion.

    Column names and `resources_dir_path` are read from the module-level config.

    param    : df_cluster - dataframe with the id, clean-text and dup_similar_idx columns
               spacy_model - model handed to doc_clustering (used when preprocess=True)
               preprocess - when True, the text is replaced by its lemmatized form before clustering
               vectorizer - 'tfidf' or 'bert'
    return   : dataframe with the columns [master_clustering_col, master_id_col, 'Text', 'Seed_Q1',
               'Seed_Q2', 'Dist_to_Center', 'Cohesion'], one row per input row with count > 0
    raises   : Exception when fewer than 10 texts are available
    """

    # init clustering object (alogrithm)
    clustering = doc_clustering(resources_dir_path, spacy_model, vectorizer)

    start = time.time()
    ################################################################################################
    # 1. PREPARE DATA
    ################################################################################################

    print("Uploading input file and lemmatizing...")
    # create 'data' dict to store every info regarding each sentence
    data = OrderedDict()

    for index, row in df_cluster.iterrows():
        uid = row[master_id_col]
        input_text = row[master_clean_text_col]
        dup_similar_idx = row[master_dup_similar_id_col]
        if len(input_text.strip()) == 0:
            input_text = 'NONE'
        original_text = input_text.strip()

        if preprocess:
            # run spacy complete pipeline
            _, sent_lemma, _, _ = clustering.get_sent_lemma(original_text)
            input_text = str(sent_lemma)

        # features
        upper_cnt = sum(1 for c in input_text if c.isupper())
        # guard: a lemmatized text may come back empty
        case_ratio = float(upper_cnt) / float(len(input_text)) if input_text else 0.0
        data[index] = {
            "sent_lemma": str(input_text),
            "count": len(dup_similar_idx),
            "case_ratio": case_ratio,
            "clusterid": "no",
            "keep": "yes",
            "select": "no",
            "index": index,
            "unique_id": uid,
            "sent_raw": original_text,
        }

    ################################################################################################
    # 2. PREPARE CORPUS
    ################################################################################################
    print("Collecting text corpus")
    documents = []
    main_docid_id_map = {}  # id in documents vs real id in input
    for key, info in data.items():
        if info["count"] >= 0:
            main_docid_id_map[len(documents)] = key
            documents.append(info['sent_lemma'])

    if len(documents) < 10:
        raise Exception("The file contains sentences less than 10! Please check  and run again!")
    print('>> Total input sentences = ', len(documents))

    ################################################################################################
    # 3. 1st LEVEL CLUSTERING
    ################################################################################################
    print("\nPerforming 1st level clustering")

    # init, using heursitics to find value of k
    cluster_k = 0

    clustering.clustering_type = "noun"
    model, cluster_labels, cluster_centers, X = clustering.run_doc_clustering(documents, cluster_k, run_analysis=False)

    ################################################################################################
    # 4. COLLECT CLUSTERED IDs
    ################################################################################################
    print("\nCollecting clustered ids")

    # add first level cluster-id to data dict
    # --> cluster_id : [Q_id1, Q_id2, ..., Q_idn]
    cid_sid_counter = {}
    for doc_pos, cid in enumerate(cluster_labels):
        main_sid = main_docid_id_map[doc_pos]
        data[main_sid]["clusterid"] = str(format(cid, "05d"))
        cid_sid_counter.setdefault(cid, []).append(main_sid)

    ################################################################################################
    # 5. CHECK REQUIREMENT FOR 2nd LEVEL CLUSTERING
    ################################################################################################
    print("\nFine-tuning clusters: Checking if 2nd level clustering is required...")

    clustering.clustering_type = "noun"
    cohesionThreshold = clustering.cohesion_threshold
    clusteringMaxLen = clustering.cluster_length
    print('cohesionThreshold==', cohesionThreshold)

    for cid in sorted(cid_sid_counter.keys()):
        members = cid_sid_counter[cid]

        # only clusters that are both large and loose are re-clustered
        if len(members) <= clusteringMaxLen:
            continue
        cohesion_score = cluster_cohesion_cosine(clustering, members, data)
        print('cid -- cohesion_score = ', cid, cohesion_score)
        if not (cohesion_score < cohesionThreshold):
            continue

        # RE-CLUSTERING   (to use cohesion value for basis of reclustering)
        # --> documents under cluster_id = 'cid'
        documents_sub = [data[sid]['sent_lemma'] for sid in members]
        print('**re-clustering: cluster_id={}; member count={}; cohesion={}'.format(cid, len(documents_sub), cohesion_score))
        _, cluster_labels_sub, _, _ = clustering.run_doc_clustering(documents_sub, cluster_k)

        # add 2nd level cluster-id to data dict (appended as a suffix of the 1st level id)
        for member_pos, cid_sub in enumerate(cluster_labels_sub):
            sid_index = members[member_pos]
            data[sid_index]['clusterid'] = data[sid_index]['clusterid'] + "_" + str(format(cid_sub, "03d"))

    # reset to first level
    clustering.clustering_type = "noun"
    cluster_set_by_sid = {}
    for key, info in data.items():
        if info['clusterid'] != "no":
            cluster_set_by_sid.setdefault(info['clusterid'], []).append(key)

    ################################################################################################
    # 6. FILTER, FIND INSIGHTS & SAVE
    ################################################################################################
    print("\nFiltering, getting insights and saving output")

    T = 0.0  # recommended: 0.4
    cid_index = 0
    cluster_set_by_sid_filtered = {}
    cluster_set_by_sid_filtered_coh_score = {}
    for members in cluster_set_by_sid.values():
        if len(members) > 1:
            cohesion_score = cluster_cohesion_cosine(clustering, members, data)
            if cohesion_score < T:
                # cluster is not cohesive enough: every member becomes its own cluster.
                # A single-member cluster gets cohesion 0, like the ones handled below.
                for s in members:
                    cluster_set_by_sid_filtered[cid_index] = [s]
                    cluster_set_by_sid_filtered_coh_score[cid_index] = 0
                    cid_index += 1
                continue
        else:
            cohesion_score = 0
        cluster_set_by_sid_filtered[cid_index] = members
        cluster_set_by_sid_filtered_coh_score[cid_index] = cohesion_score
        cid_index += 1

    # one record per row: cid, unique_id, sent, seed_q1, seed_q2, dist_to_center, cohesion_score
    records = []
    for cid, members in cluster_set_by_sid_filtered.items():
        if len(members) > 1:
            seedid1, seedid2, values, values_sid = cluster_center_sent(clustering, members, data)
        else:
            seedid1 = seedid2 = members[0]
            values, values_sid = [1], [seedid1]

        seed_q1 = data[seedid1]["sent_raw"]
        seed_q2 = data[seedid2]["sent_raw"]
        ch_score = cluster_set_by_sid_filtered_coh_score[cid]

        # similarity to the center per member (first entry wins, like list.index)
        similarity_by_sid = {}
        for member_sid, similarity in zip(values_sid, values):
            similarity_by_sid.setdefault(member_sid, similarity)

        for sid in members:
            if data[sid]['count'] == 0:
                continue
            sent = data[data[sid]["index"]]['sent_raw']
            dist_to_center = 1.0 - similarity_by_sid[sid]
            # distances and cohesion are reported with 3 decimals
            records.append((int(cid), data[sid]["unique_id"], sent, seed_q1, seed_q2,
                            float("%5.3f" % dist_to_center), float("%5.3f" % ch_score)))

    # the frame is built from the records directly: texts containing tabs, quotes or line breaks
    # stay intact and the ids keep their original type
    columns = [master_clustering_col, master_id_col, "Text", "Seed_Q1", "Seed_Q2", "Dist_to_Center", "Cohesion"]
    clustered_output = pd.DataFrame(records, columns=columns)
    print("\nClustering process finished! Time taken(s) =", (time.time()-start))

    return clustered_output

#### Clustering Config

In [None]:
## :: dataset config :: ##
# Column names read as module-level names by run_clustering and by the output cell.

master_id_col = 'ID'                               # input column: unique id of a record
master_clean_text_col = 'clean_text'                      # input column: cleaned text that gets clustered
master_dup_id_col = "dup_idx"                             # column name for the ids of exact duplicates
master_similar_id_col = "similar_idx"                     # column name for the ids of similar records
master_dup_similar_id_col = "dup_similar_idx"             # input column of run_clustering: dup ids + similar ids
master_freq_col = "FREQ"                                  # output column created in the output cell: rows per cluster
master_coverage_col = "COVERAGE"                          # output column created in the output cell: share of rows (%) per cluster
master_clustering_col = 'FINAL_CLUSTER_ID'         # output column: final cluster id

# :: clustering config ::
# The values below are copied into the doc_clustering object by its __init__
# (KMEANS_RATE, CLUSTER_K, KMEANS_SEED_INIT, KMEANS_MAXITER, CLUSTER_LEN, COHESION_THRESHOLD).
KMEANS_RATE = 5                 # multiplier of the heuristic number of clusters ("noun" clustering type)
CLUSTER_K = 0
KMEANS_SEED_INIT = 50           # passed to KMeans as n_init
KMEANS_MAXITER = 1000           # NOTE(review): stored on the object, but the visible KMeans calls pass max_iter=2000
CLUSTER_LEN = 20                # a cluster is checked for 2nd level clustering only when it has more members than this
COHESION_THRESHOLD = 0.60       # clusters with a cohesion below this value are re-clustered
# NOTE(review): the three settings below are not referenced by the code visible in this section
CLUSTER_LEN_2nd_LEVEL = 100
CLUSTER_LEN_threshold = 2000
OUTPUT_DUP = "yes"

In [None]:
# quick look at the first rows before clustering
df.head(3)

#### Clustering

In [None]:
## clustering

# clustering df ---> one row per distinct "dup_similar_idx" group (duplicates / very similar records dropped)
# the id lists are turned into tuples first, because lists cannot be used as a de-duplication key
df[master_dup_similar_id_col] = [tuple(ids) for ids in df[master_dup_similar_id_col]]
df_cluster = df.drop_duplicates(subset=[master_dup_similar_id_col])
df_cluster = df_cluster.reset_index(drop=True)
print('\nTotal Datapoints :: df.shape = ', df.shape)
print('\nDatapoitns with contextually similar records :: df_cluster.shape = ', df_cluster.shape)

# EXECUTE CLUSTERING
clustered_output = run_clustering(df_cluster, vectorizer='bert', preprocess=False, spacy_model=nlp)

# Output

In [None]:
### :: FINAL DATAFRAME :: ###

# 1. MERGE 'clustered_output' WITH 'df_cluster' on UNIQUE_ID TO CREATE a 'temp'
cluster_cols = [master_clustering_col, 'Seed_Q1', 'Seed_Q2', 'Dist_to_Center', 'Cohesion']
temp = df_cluster[[master_id_col, master_dup_similar_id_col]].merge(clustered_output[[master_id_col] + cluster_cols], on = master_id_col, how = 'left')

# 2. MERGE 'temp' WITH ORIGNAL DATA (without any drops): every row inherits the cluster of its dup/similar group
df = df.merge(temp[[master_dup_similar_id_col] + cluster_cols], on = master_dup_similar_id_col, how = 'left')

# 3. GET INSIGHTS (Frequency, Coverage)
# rows without a cluster get the id -1 (and are counted as one group below)
df[master_clustering_col] = df[master_clustering_col].fillna(-1).astype(int)
cid_count = Counter(df[master_clustering_col])
total_rows = sum(cid_count.values())   # computed once instead of once per row
df[master_freq_col] = df[master_clustering_col].apply(lambda cid: cid_count.get(cid, 0))
df[master_coverage_col] = df[master_freq_col].apply(lambda x: int(x)*100.0/total_rows)
df = df.drop(columns=['cluster_id'])

print("Clusters generated:", len(cid_count))
print("Process finished!")

In [None]:
# save the final dataframe to the working directory (row index is not written)
df.to_csv("df_parent_clusteredResults_26112021.csv", index=False)