# Fast Text Clustering

A modern approach to clustering textual data using two passes of the K-Means algorithm.

----

## Contents:

`Total Read Time ~ 20 mins.`

`Total Execution Time ~ 45 mins.`

#### Clustering
1. Imports
2. Directory Setup
3. Load Data
4. Preprocessing
5. Set Configuration
6. Vectorization
7. Find Near-Duplicates using pair-wise similarity
8. Clustering Dataset
9. Clustering
10. Final Output
11. Metrics and Insights

## 1. Imports

In [2]:
## Imports
'''Python 3.8.0'''

# Standard libs
import os
import sys
import json
import warnings
import re
import io
from io import StringIO
import inspect
import shutil
import ast
import string
import time
import pickle
import glob
import traceback
import multiprocessing
import requests
import logging
import math
from ast import literal_eval
import pytz
from itertools import chain
from string import Template
from datetime import datetime, timedelta
from dateutil import parser
import base64
from collections import defaultdict, Counter, OrderedDict
from contextlib import contextmanager
import unicodedata
from functools import reduce
import itertools
import tempfile
from typing import Any, Dict, List, Callable, Optional, Tuple, NamedTuple, Union
from functools import wraps
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

# graph
import networkx as nx

# Required pkgs
import numpy as np
from numpy import array, argmax
import pandas as pd
import ntpath
import tqdm

# General text correction - fit text for you (ftfy) and others
import ftfy
from fuzzywuzzy import fuzz
from wordcloud import WordCloud
from spellchecker import SpellChecker

# scikit-learn
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, jaccard_score, silhouette_score, homogeneity_score, calinski_harabasz_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.base import BaseEstimator, TransformerMixin

# scipy
from scipy import spatial, sparse
from scipy.sparse import coo_matrix, vstack, hstack
from scipy.spatial.distance import euclidean, jensenshannon, cosine, cdist
from scipy.io import mmwrite, mmread
from scipy.stats import entropy
from scipy.cluster.hierarchy import dendrogram, ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components

# sparse_dot_topn: matrix multiplier
from sparse_dot_topn import awesome_cossim_topn
import sparse_dot_topn.sparse_dot_topn as ct

# Gensim
import gensim
from gensim.models import Phrases, Word2Vec, KeyedVectors, FastText, LdaModel
from gensim import utils
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim import models, corpora, similarities

# NLTK
import nltk
#nltk_model_data_path = "/somepath/"
#nltk.data.path.append(nltk_model_data_path)
from nltk import FreqDist, tokenize, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.translate.bleu_score import sentence_bleu
print("NLTK loaded.")

# Spacy
# Spacy
import spacy
from spacy import displacy
from spacy.matcher import Matcher
#from spacy.lang.en import English
from spacy.language import Language
from spacy_language_detection import LanguageDetector
print("Spacy loaded.")

# Pytorch
import torch
from torch import optim, nn
import torch.nn.functional as Functional
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from transformers import pipeline
from transformers import AutoModel
print("PyTorch loaded.")

# Plots
from matplotlib import pyplot as plt, ticker as ticker
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly import offline
%matplotlib inline

# Theme settings
# Show up to 80 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 80)
sns.set_context('talk')
# NOTE(review): `sns.set` is seaborn's general theme setter and also accepts a `context` argument;
# called without one it presumably re-applies its default context over the 'talk' choice made on
# the previous line -- confirm against the installed seaborn version.
sns.set(rc={'figure.figsize':(15,10)})
sns.set_style('whitegrid')
# Silences every Python warning for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

0it [00:00, ?it/s]

NLTK loaded.
Spacy loaded.
PyTorch loaded.


## 2. Directory Setup

In [3]:
# Project directories, resolved from the notebook's current working directory.
root_dir = os.path.abspath(".")
data_dir = os.path.join(root_dir, "data")

# Generic NLP resources
nlp_resources_fp = "../../_Resources_/nlp_resources/"

# Sentence Bert
sbert_model_fp = "../../_Resources_/transformer_models/all-distilroberta-v1/"

# Spacy
spacy_model_data_path = "../../_Resources_/spacy/en_core_web_lg-3.5.0/en_core_web_lg/en_core_web_lg-3.5.0"
nlp = spacy.load(spacy_model_data_path)  # disabling: nlp = spacy.load(spacy_data_path, disable=['ner'])
def create_lang_detector(nlp, name):
    """
    spaCy component factory for the language detector.

    `nlp` and `name` are supplied by spaCy when the component is added to a pipeline; they are
    not used here. A fixed seed is passed so that repeated runs label the same text with the
    same language (the detector is otherwise not guaranteed to be reproducible).
    """
    return LanguageDetector(seed=42)
Language.factory("language_detector", func=create_lang_detector)
# Raise spaCy's per-document character limit.
nlp.max_length = 2000000
# Append the detector as the last pipeline step; the preprocessing code reads its result from
# `doc._.language`.
nlp.add_pipe('language_detector', last=True)

<spacy_language_detection.spacy_language_detector.LanguageDetector at 0x7fd753428bb0>

## 3. Load Data

In [58]:
# Load the tweets from <data_dir>/tweets.csv.
df = pd.read_csv(os.path.join(data_dir, "tweets.csv"))
print(df.shape)
# Preview the first ten rows.
df.head(10)

(3394, 2)


Unnamed: 0,tweet,label
0,"All campus dining locations are closed today, ...",cu_others
1,"What are your campus dining options today, Jan...",cu_others
2,#FPGA Design for #Embedded #Systems\n\n#SoC #V...,cu_online
3,"What are your campus dining options today, Jan...",cu_others
4,As an anthro PhD student I’m frequently asked ...,cu_research
5,True or False: Concussions for college student...,cu_research
6,@michaelgrandner @CUBoulder Very interesting w...,cu_research
7,@ShellyMBoulder Thanks so much @ShellyMBoulder...,cu_research
8,"@LiberalsAreMean @CUBoulder Ha! In my day job,...",cu_research
9,Dr. Hutchison speaks to @5280Magazine about th...,cu_online


In [59]:
# Work on the first 500 rows only. `.copy()` makes the subset an independent frame, so the columns
# added during preprocessing are written to it rather than to a slice of the full frame.
df = df.iloc[:500].copy()

## 4. Preprocessing

In [192]:
class preprocessText:
    
    def __init__(self, resources_dir_path, custom_vocab=[], MIN_TOKENS=5, MAX_TOKENS=1000000, do_lemma=False):
        self.stopwords_file = os.path.join(resources_dir_path, "stopwords.txt")
        self.special_stopwords_file = os.path.join(resources_dir_path, "special_stopwords.txt")
        self.special_characters_file = os.path.join(resources_dir_path, "special_characters.txt")
        self.contractions_file = os.path.join(resources_dir_path, "contractions.json")
        self.chatwords_file = os.path.join(resources_dir_path, "chatwords.txt")
        self.emoticons_file = os.path.join(resources_dir_path, "emoticons.json")
        self.greeting_file = os.path.join(resources_dir_path, "greeting_words.txt")
        self.signature_file = os.path.join(resources_dir_path, "signature_words.txt")
        self.preserve_key = "<$>" # preserve special vocab
        self.vocab_list = custom_vocab
        self.preseve = True if len(custom_vocab) > 0 else False
        self.load_resources()
        self.do_lemma = do_lemma
        self.lang = []
        self.min_tokens = MIN_TOKENS
        self.max_tokens = MAX_TOKENS
        return
    
    def load_resources(self):
        
        ### Build Vocab Model --> Words to keep
        self.vocab_list = set(map(str.lower, self.vocab_list))
        self.vocab_dict = {w: self.preserve_key.join(w.split()) for w in self.vocab_list}
        self.re_retain_words = re.compile('|'.join(sorted(map(re.escape, self.vocab_dict), key=len, reverse=True)))
        
        ### Build Stopwords Model --> Words to drop/delete
        with open(self.stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords = [x.rstrip() for x in f.readlines()]
        with open(self.special_stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        with open(self.special_characters_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        self.stopwords = list(sorted(set(self.stopwords).difference(self.vocab_list)))

        ### Build Contractions
        with open(self.contractions_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.contractions = dict(json.load(f))
        
        ### Build Chat-words
        with open(self.chatwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.chat_words_map_dict, self.chat_words_list = {}, []
            chat_words = [x.rstrip() for x in f.readlines()]
            for line in chat_words:
                cw = line.split("=")[0]
                cw_expanded = line.split("=")[1]
                self.chat_words_list.append(cw)
                self.chat_words_map_dict[cw] = cw_expanded
            self.chat_words_list = set(self.chat_words_list)
        
        ### Bukd social markups
        
        # Emoticons
        with open(self.emoticons_file, "r") as f:
            self.emoticons = re.compile(u'(' + u'|'.join(k for k in json.load(f)) + u')')
        
        # Emojis
        self.emojis = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        
        # Greeting
        with open(self.greeting_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.greeting_words = [x.rstrip() for x in f.readlines()]
        
        # Signature
        with open(self.signature_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.signature_words = [x.rstrip() for x in f.readlines()]
        
        # Spell-corrector (takes too long!)
        self.spell_checker = SpellChecker()   
        return
    
    
    def reserve_keywords_from_cleaning(self, text, reset=False):
        """ 
        Finds common words from a user-provided list of special keywords to preserve them from 
        cleaning steps. Identifies every special keyword and joins them using `self.preserve_key` during the 
        cleaning steps, and later resets it back to original word in the end.
        """
        if reset is False:
            # compile using a dict of words and their expansions, and sub them if found!
            match_and_sub = self.re_retain_words.sub(lambda x: self.vocab_dict[x.string[x.start():x.end()]], text)
            return re.sub(r"([\s\n\t\r]+)", " ", match_and_sub).strip()
        else:
            # reverse the change! - use this at the end of preprocessing
            text = text.replace(self.preserve_key, " ")
            return re.sub(r"([\s\n\t\r]+)", " ", text).strip()


    def basic_clean(self, input_sentences):
        cleaned_sentences = []
        for sent in tqdm.tqdm(input_sentences):
            # lowercasing
            sent = str(sent).strip().lower()
            # FIX text
            sent = ftfy.fix_text(sent)
            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)
            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)
            # Removing web urls
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0–9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)
            # Removing date formats
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)
            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).strip()
            cleaned_sentences.append(sent)
        return cleaned_sentences


    def deep_clean(self, input_sentences):
        cleaned_sentences = []
        for sent in tqdm.tqdm(input_sentences):
            # normalize text to "utf-8" encoding
            sent = unicodedata.normalize('NFKD', str(sent)).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # lowercasing
            sent = str(sent).strip().lower()

            # <--------------------------------------------------------------------------- >
            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            # <--------------------------------------------------------------------------- >
            #
            # *** Mark important keywords such as: Domain specific, Question words(wh-words), etc, using 
            # "self.vocab_list". Words from this list if found in any input sentence shall be joined using 
            # a key (self.preserve_key) during pre-processing step, and later un-joined to retain them.
            #
            # TWITTER SPECIFIC
            # ----------------
            twitter_anchor_re = "(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)"
            sent = re.sub(r"{}".format(twitter_anchor_re), " ", sent)
            
            if self.preseve: 
                sent = self.reserve_keywords_from_cleaning(sent, reset=False)
                
            # Tokenizing with NLTK's TweetTokenizer (limiting repeated characters to 3 with the reduce lens
            # paramater and strips all the @'s. It also splits it into 1-gram tokens
            tweetToknzr = TweetTokenizer(strip_handles = True, reduce_len = True)
            sent = " ".join(tweetToknzr.tokenize(sent))
            
            # Remove prefix of hastags (#) 
            sent = " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", sent).split())
            #
            #
            # <--------------------------------------------------------------------------- >
            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            # <--------------------------------------------------------------------------- >

            # remove Emojis 😜 🔥🔥
            sent = self.emojis.sub(r'', sent)
            # remove Emoticons ( ͡❛ ͜ʖ ͡❛)
            sent = self.emoticons.sub(r'', sent)
            # remove common chat-words
            sent = " ".join([self.chat_words_map_dict[w.upper()] if w.upper() in self.chat_words_list else w for w in sent.split()])
            # FIX text
            sent = ftfy.fix_text(sent)
            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)
            # Removing web-links
            sent = " ".join([re.sub(r'^https?:\/\/.*[\r\n]*', '', token.strip(), flags=re.MULTILINE) for token in sent.split()])
            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)
            # Removing web urls
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0–9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)
            # Removing date formats
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)

            # sentence language detection ('en', 'fr', 'gr', etc)
            lang = nlp(sent)._.language['language']
            self.lang.append(lang)

            # <--------------------------------------------------------------------------- >
            # <----------------------------- OPTIONAL CLEANING ----------------------------- >
            # <--------------------------------------------------------------------------- >
            #
            # removing punctuations 
            # *** disable them, when sentence structure needs to be retained ***
            sent = re.sub(r"[\$|\#\@\*\%]+\d+[\$|\#\@\*\%]+", " ", sent)
            sent = re.sub(r"\'s", " \'s", sent)
            sent = re.sub(r"\'ve", " \'ve", sent)
            sent = re.sub(r"n\'t", " n\'t", sent)
            sent = re.sub(r"\'re", " \'re", sent)
            sent = re.sub(r"\'d", " \'d", sent)
            sent = re.sub(r"\'ll", " \'ll", sent)
            sent = re.sub(r"[\/,\@,\#,\\,\{,\},\(,\),\[,\],\$,\%,\^,\&,\*,\<,\>]", " ", sent)
            sent = re.sub(r"[\,,\;,\:,\-]", " ", sent)      # main puncts
            
            # remove sentence de-limitters 🔥🔥
            # *** disable them, when sentence boundary/ending is important ***
            # sent = re.sub(r"[\!,\?,\.]", " ", sent)

            # keep only text & numbers 🔥🔥
            # *** enable them, when only text and numbers matter! *** 
            # sent = re.sub(r"\s+", " ", re.sub(r"[\\|\/|\||\{|\}|\[|\]\(|\)]+", " ", re.sub(r"[^A-z0-9]", " ", str(sent))))
            
            # correct spelling mistakes 🔥🔥
            # *** TAKES TOO LONG! enable them when english spelling mistakes matter *** 
            # sent = " ".join([self.spell_checker.correction(w) if w in self.spell_checker.unknown(sent.split()) else w for w in sent.split()])
            
            # Limit Min - Max Token Length
            sent = "" if len(sent.split()) > self.max_tokens or len(sent) <= self.min_tokens else sent
            #
            # <--------------------------------------------------------------------------- >
            # <----------------------------- OPTIONAL CLEANING ----------------------------- >
            # <--------------------------------------------------------------------------- >
            
            # Remove stopwords
            sent = " ".join(token.text for token in nlp(sent) if token.text not in self.stopwords and token.lemma_ not in self.stopwords)
            # Lemmatize
            if self.do_lemma:
                sent = " ".join(token.lemma_ for token in nlp(sent))
            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).lower().strip()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            #
            # *** Reverse the custom joining now to un-join the special words found!
            if self.preseve: 
                sent = self.reserve_keywords_from_cleaning(sent, reset=True)
            #
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            cleaned_sentences.append(sent.strip().lower())
        return cleaned_sentences


    def spacy_get_pos_list(self, results):
        word_list, pos_list, lemma_list, ner_list, start_end_list = [], [], [], [], []
        indices = results['sentences']
        for line in indices:
            tokens = line['tokens']
            for token in tokens:
                # (1). save tokens
                word_list.append(token['word'])
                # (2). save pos
                pos_list.append(token['pos'])
                # (3). save lemmas
                lemma = token['lemma'].lower()
                if lemma in self.stopwords: continue
                lemma_list.append(lemma)
                # (4). save NER
                ner_list.append(token['ner'])
                # (5). save start
                start_end_list.append(str(token['characterOffsetBegin']) + "_" + str(token['characterOffsetEnd']))
        output = {"word_list": word_list, 
                  "lemma_list": lemma_list, 
                  "token_start_end_list": start_end_list,
                  "pos_list": pos_list, "ner_list": ner_list}
        return output

    def spacy_generate_features(self, doc, operations='tokenize,ssplit,pos,lemma,ner'):
        """
        Spacy nlp pipeline to generate features such as pos, tokens, ner, dependency. Accepts doc=nlp(text)
        """
        # spacy doc
        doc_json = doc.to_json()  # Includes all operations given by spacy pipeline

        # Get text
        text = doc_json['text']

        # ---------------------------------------- OPERATIONS  ---------------------------------------- #
        # 1. Extract Entity List
        entity_list = doc_json["ents"]

        # 2. Create token lib
        token_lib = {token["id"]: token for token in doc_json["tokens"]}

        # init output json
        output_json = {}
        output_json["sentences"] = []

        # Perform spacy operations on each sent in text
        for i, sentence in enumerate(doc_json["sents"]):
            # init parsers
            parse = ""
            basicDependencies = []
            enhancedDependencies = []
            enhancedPlusPlusDependencies = []

            # init output json
            out_sentence = {"index": i, "line": 1, "tokens": []}
            output_json["sentences"].append(out_sentence)

            # 3. Split sentences by indices(i), add labels (pos, ner, dep, etc.)
            for token in doc_json["tokens"]:

                if sentence["start"] <= token["start"] and token["end"] <= sentence["end"]:
                    
                    # >>> Extract Entity label
                    ner = "O"
                    for entity in entity_list:
                        if entity["start"] <= token["start"] and token["end"] <= entity["end"]:
                            ner = entity["label"]

                    # >>> Extract dependency info
                    dep = token["dep"]
                    governor = 0 if token["head"] == token["id"] else (token["head"] + 1)  # CoreNLP index = pipeline index +1
                    governorGloss = "ROOT" if token["head"] == token["id"] else text[token_lib[token["head"]]["start"]:
                                                                                     token_lib[token["head"]]["end"]]
                    dependent = token["id"] + 1
                    dependentGloss = text[token["start"]:token["end"]]

                    # >>> Extract lemma
                    lemma = doc[token["id"]].lemma_

                    # 4. Add dependencies
                    basicDependencies.append({"dep": dep,
                                              "governor": governor,
                                              "governorGloss": governorGloss,
                                              "dependent": dependent,
                                              "dependentGloss": dependentGloss})
                    # 5. Add tokens
                    out_token = {"index": token["id"] + 1,
                                 "word": dependentGloss,
                                 "originalText": dependentGloss,
                                 "characterOffsetBegin": token["start"],
                                 "characterOffsetEnd": token["end"]}

                    # 6. Add lemmas
                    if "lemma" in operations:
                        out_token["lemma"] = lemma

                    # 7. Add POS tagging
                    if "pos" in operations:
                        out_token["pos"] = token["tag"]

                    # 8. Add NER
                    if "ner" in operations:
                        out_token["ner"] = ner

                    # Update output json
                    out_sentence["tokens"].append(out_token)

            # 9. Add dependencies operation
            if "parse" in operations:
                out_sentence["parse"] = parse
                out_sentence["basicDependencies"] = basicDependencies
                out_sentence["enhancedDependencies"] = out_sentence["basicDependencies"]
                out_sentence["enhancedPlusPlusDependencies"] = out_sentence["basicDependencies"]
        # ---------------------------------------- OPERATIONS  ---------------------------------------- #
        return output_json
    
    def spacy_clean(self, input_sentences):
        batch_size = min(int(np.ceil(len(input_sentences)/100)), 500)
        
        # Part 1: generate spacy textual features (pos, ner, lemma, dependencies)
        sentences = [self.spacy_generate_features(doc) for doc in nlp.pipe(input_sentences, batch_size=batch_size, n_process=-1)]
        
        # Part 2: collect all the features for each sentence
        spacy_sentences = [self.spacy_get_pos_list(sent) for sent in sentences]

        return spacy_sentences


    ## MAIN ##
    def run_pipeline(self, data, text_col, operation=['deep']):
        """Main module to execute pipeline. Accepts list of strings, and desired operation."""

        if operation=="":
            raise Exception("Please pass a cleaning type - `basic`, `deep` or `spacy` !!")
        if not isinstance(operation, list):
            operation = [operation]
        operation = list(map(str.lower, operation))

        # RAW SENTENCES
        sentences, lang_masker = data[text_col], ""

        # run basic cleaning
        if "basic" in operation:
            sentences = self.basic_clean(sentences)

        # run deep cleaning
        if "deep" in operation:
            sentences, lang_masker = self.deep_clean(sentences), self.lang

        # run spacy pipeline
        if "spacy" in operation:
            sentences = self.spacy_clean(sentences)

        data["Processed_%s" % text_col], data['lang_mask'] = sentences, lang_masker
        return data



# :: SAMPLE ::
# preprocessText_obj = preprocessText(nlp_resources_fp, custom_vocab, MIN_TOKENS, MAX_TOKENS, do_lemmatizing)
# data_clean = preprocessText_obj.run_pipeline(dataset, <TEXT_COL>, ["deep"])

#### Execute Preprocessing Module

##### :: configuration ::

In [61]:
"""
CUSTOM VOCABULARY ::
- List of words you wish to mark and retain them across the preprocessing steps - very important!
- Example, task-specific, domain-specific keywords. 
"""

# Preserve: question words for FAQs.
# custom_vocab = ["who", "what", "where", "when", "would", "which", "how", "why", "can", "may", 
#                 "will", "won't", "does", "does not","doesn't", "do", "do i", "do you", "is it", "would you", 
#                 "is there", "are there", "is it so", "is this true", "to know", "is that true", "are we", 
#                 "am i", "question is", "can i", "can we", "tell me", "can you explain", "how ain't", 
#                 "question", "answer", "questions", "answers", "ask", "can you tell"]


# Preserve: CUBoulder related words.
# Passed to preprocessText as its `custom_vocab` argument in the run cell.
custom_vocab = ["sko", "buff", "buffs", "cu", "cub", "uni", "univer", "cubb", "ucb", "skobuff", "skobuffs", 
                "cuboulder", "colorado", "boulder", "footb", "ds", "onl", "res"]


"""
Utilities:
- Truncate words to their root-known-word form, stripping off their adjectives, verbs, etc.
"""

# Passed to preprocessText as MIN_TOKENS / MAX_TOKENS: the lower and upper limits it applies when
# deciding whether a cleaned sentence is kept or blanked.
MIN_TOKENS = 5
MAX_TOKENS = 50
# Passed to preprocessText as `do_lemma`: when True, tokens are replaced by their lemma.
do_lemmatizing = True
# do_chinking = False
# do_chunking = False
# do_dependencyParser = False

##### :: run ::

In [62]:
# Build the cleaner; its constructor also loads all resource files found under nlp_resources_fp.
preprocessText_obj = preprocessText(nlp_resources_fp, custom_vocab, MIN_TOKENS, MAX_TOKENS, do_lemmatizing)

start_time = time.time()
# Deep-clean the "tweet" column. run_pipeline writes into the frame it is given and returns that
# same frame, so `df_clean` and `df` refer to one object that now also has the
# "Processed_tweet" and "lang_mask" columns.
df_clean = preprocessText_obj.run_pipeline(df, "tweet", ["deep"])

print("Cleaned! Total time taken (seconds) = ", time.time() - start_time)

100%|█████████████████████████████████████████| 500/500 [00:18<00:00, 27.66it/s]

Cleaned! Total time taken (seconds) =  18.079297065734863





In [385]:
# Final preprocessed data
# Keep only the rows that still have usable cleaned text. The cleaner marks rejected sentences
# with an empty string rather than NaN, so a plain `dropna` would let those rows through; blank
# strings are filtered out here as well. Non-string values (other than NaN) are kept untouched.
processed_text = df_clean['Processed_tweet']
is_blank_text = processed_text.map(lambda value: isinstance(value, str) and not value.strip())
df = df_clean[processed_text.notna() & ~is_blank_text].reset_index(drop=True)
df.shape

(500, 4)

In [386]:
df.head(10)

Unnamed: 0,tweet,label,Processed_tweet,lang_mask
0,"All campus dining locations are closed today, ...",cu_others,campus dining location close today jan 1 happy...,en
1,"What are your campus dining options today, Jan...",cu_others,what your campus dining option today jan 2 alf...,en
2,#FPGA Design for #Embedded #Systems\n\n#SoC #V...,cu_online,fpga design embed system soc verilog vlsi asic...,en
3,"What are your campus dining options today, Jan...",cu_others,what your campus dining option today jan 3 alf...,en
4,As an anthro PhD student I’m frequently asked ...,cu_research,anthro phd student m frequently ask why study ...,en
5,True or False: Concussions for college student...,cu_research,true false concussion college student signific...,en
6,@michaelgrandner @CUBoulder Very interesting w...,cu_research,interesting work additional tool investigation...,en
7,@ShellyMBoulder Thanks so much @ShellyMBoulder...,cu_research,thank much fun talk research,en
8,"@LiberalsAreMean @CUBoulder Ha! In my day job,...",cu_research,ha day job turbulence model computational flui...,en
9,Dr. Hutchison speaks to @5280Magazine about th...,cu_online,dr hutchison speak cannabis health course cour...,en


---

## 5. Set Configuration Parameters

Ideally, these settings should be stored in `os.environ` or in a `configparser.ConfigParser` object rather than as notebook globals.

##### ::  config  ::    

In [387]:
# define these
# Columns that already exist in the preprocessed frame: the raw text and its cleaned version.
colname_txt = "tweet"
colname_clean_txt = "Processed_tweet"

# these will be created run-time
# `colname_id` is the per-record unique-id column that create_config() adds to the frame.
colname_id = "UID"
# NOTE(review): the remaining column names are only copied into the config dict in this part of
# the notebook; their meaning is presumably what the names suggest (duplicate / near-duplicate
# ids, cluster ids, member counts, coverage) -- confirm against the clustering code.
colname_dup_id_col = "dup_idx"
colname_similar_id_col = "similar_idx"
colname_dup_similar_id_col = "dup_similar_idx"
colname_duplicate_cluster_id = "dup_cluster_id"
colname_freq = "memberCount"
colname_coverage = "coverage"
colname_cluster_id = "cluster_id"
# K-Means settings, also copied into the config dict.
# NOTE(review): their exact semantics (e.g. whether kmeans_k = 0 means "choose k automatically")
# are not visible here -- confirm against the clustering code.
kmeans_k = 0
kmeans_rate = 5
kmeans_seedInit = 50
kmeans_maxIter = 1000
kmeans_clusterLen = 20
kmeans_cohesion_threshold = 0.70
kmeans_len2Level = 100
kmeans_lenThreshold = 2000

In [388]:
def create_config(data):
    """
    Give every record a unique id and collect the notebook settings into one dictionary.

    data : pandas DataFrame. It is modified in place: a `colname_id` column holding
           "uidx_0", "uidx_1", ... (by row position) is inserted as the first column. When the
           column already exists (for example because this cell is run a second time) its values
           are regenerated in place instead of raising on the duplicate insert.

    Returns a dict mapping each setting name (column names, K-Means parameters, resource paths)
    to the value of the module-level variable of the same name.
    """
    # FIRST, create unique id for each text record
    record_ids = ['uidx_%s' % i for i in range(len(data))]
    if colname_id in data.columns:
        data[colname_id] = record_ids
    else:
        data.insert(0, colname_id, record_ids)

    config = {
        'colname_id': colname_id,
        'colname_txt': colname_txt,
        'colname_clean_txt': colname_clean_txt,
        'colname_dup_id_col': colname_dup_id_col,
        'colname_similar_id_col': colname_similar_id_col,
        'colname_dup_similar_id_col': colname_dup_similar_id_col,
        'colname_duplicate_cluster_id': colname_duplicate_cluster_id,
        'colname_freq': colname_freq,
        'colname_coverage': colname_coverage,
        'colname_cluster_id': colname_cluster_id,
        'kmeans_k': kmeans_k,
        'kmeans_rate': kmeans_rate,
        'kmeans_seedInit': kmeans_seedInit,
        'kmeans_maxIter': kmeans_maxIter,
        'kmeans_clusterLen': kmeans_clusterLen,
        'kmeans_cohesion_threshold': kmeans_cohesion_threshold,
        'kmeans_len2Level': kmeans_len2Level,
        'kmeans_lenThreshold': kmeans_lenThreshold,
        'nlp_resources_fp': nlp_resources_fp,
        'spacy_model_data_path': spacy_model_data_path,
    }
    return config


config = create_config(df)

## 6. Vectorization

Context:


- An $embedding$ is a numerical representation of words, phrases, or sentences that captures the meaning and context (both semantic and syntactic) of the text in a high-dimensional space. 


- In the case of a document (one or more sentences), an $embedding$ represents the meaning of these sentences in a way that can be processed by machine learning models.


- A $Transformer$ model is a type of neural network that is commonly used for natural language processing. One of the main features of the Transformer model is its ability to generate word embeddings that capture the meaning and context of a sentence. These embeddings are generated by processing the sentence through the layers of the Transformer model.


Our data:

- In our dataset (tweets), each sentence has been vectorized into a vector of [1 x 768] dimensions.


- When we say that a text sentence is embedded into $768 \space dimensions$ by a pre-trained Transformer model, we mean that the sentence is **now represented as a vector of 768 numbers**, where each number in the vector corresponds to a different dimension in the embedding space. 


- These dimensions were learned by the Transformer model during training and represent different aspects of the meaning and context of the sentence.


- For example, one dimension of the embedding vector might correspond to the presence of a certain topic or concept in the sentence, while another dimension might correspond to the sentiment or emotional tone of the sentence. By combining these different dimensions in a high-dimensional space, the Transformer model is able to capture the complex and nuanced meaning of a sentence.


- In *extremely* simple words, 

            text      = "Coach Prime era has begun!"
                        
        vocabulary    = [go now coach prime have has  era  begin  end ....]
        word_vector   = [0  0   1     1     0    1    1    1      0   ....]     => One-Hot-Encoding
        
        topics        =   start  end   sentiment   complexity  presence_of_Adjective_followed_by_a_Noun ...
        
        embedding     =  [0.38   0      0.56         -0.32       0.87      0   ]
        
                      = a meaningful vector representing meaning & context of the sentence in 768 numbers.

In [389]:
################################################################################################
# List of Vectorizer Structures 
# choose - [ 'tfidf', 'bert', 'word2vec', 'doc2vec' ]
################################################################################################

class Bert_Vectorizer(BaseEstimator, TransformerMixin):
    """
    Sentence vectorizer backed by a pre-trained transformer model.

    Text is tokenized, passed through the model without gradient tracking and
    reduced to one vector per sentence by attention-mask-aware mean pooling.
    """

    def __init__(self, tokenizer, model, max_length=128, embedding_func: Optional[Callable[[torch.Tensor], torch.Tensor]]=None):
        """
        Params  : tokenizer      - callable tokenizer (invoked with padding / truncation / max_length / return_tensors)
                  model          - model invoked as model(**encoded_input); put into eval mode here
                  max_length     - truncation length handed to the tokenizer
                  embedding_func - optional pooling callable; it is stored, but transform() always
                                   uses the mean pooling implemented in this class
        """
        self.tokenizer = tokenizer
        self.model = model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func
        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Average the token embeddings, weighting by the attention mask so padding is ignored."""
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # clamp guards against a division by zero for rows whose mask is all zeros
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _tokenize(self, text):
        """
        Tokenize `text` and return one mean-pooled embedding per input sentence.
        Params  : text - sentence(s) accepted by the tokenizer
        Return  : tensor of sentence embeddings
        """
        # Tokenize the text with the provided tokenizer
        encoded_input = self.tokenizer(text, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')

        # Compute token embeddings (inference only, no gradients)
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform mean pooling
        return self._mean_pooling(model_output, encoded_input['attention_mask'])

    def transform(self, text: List[str], formatt='tensor'):
        """
        Embed `text` and return the embeddings in the requested container.
        Params  : text    - list (or pandas.Series) of sentences
                  formatt - 'tensor' (default), 'numpy' or 'csr' (L2-normalised, float64 CSR matrix);
                            an empty / None value falls back to the raw tensor
        Return  : embeddings in the requested format
        Raises  : ValueError for an unknown `formatt`
        """
        if isinstance(text, pd.Series):
            text = text.tolist()

        embeddings = self._tokenize(text)

        # A falsy format used to fall through and return None implicitly;
        # return the raw embeddings instead (the behaviour before 'formatt' existed).
        if not formatt:
            return embeddings

        formatt = str(formatt).strip().lower()
        if formatt == 'tensor':
            return embeddings
        if formatt == 'numpy':
            return embeddings.numpy()
        if formatt == 'csr':
            embeddings_matrix = Functional.normalize(embeddings, p=2, dim=1)
            return csr_matrix(embeddings_matrix.numpy().astype(np.float64))
        raise ValueError("Invalid input for formatt!")

    def fit(self, X, y=None):
        """No fitting (fine-tuning) is required"""
        return self

class Tfidf_Vectorizer():
    """
    Character n-gram TF-IDF vectorizer.

    Strings are lower-cased, stripped of separators and whitespace, and split
    into overlapping character n-grams that are fed to a TfidfVectorizer.
    """

    def __init__(self, ngram_size=3):
        """
        Params  : ngram_size - number of characters per n-gram
        """
        self.ngram_size = ngram_size
        # ',-.' is a character range covering ',', '-' and '.'; together with '/' and
        # whitespace these are the characters removed before n-grams are built.
        self.regex = r'[,-./]|\s'

    def get_n_grams(self, string):
        """
        Split a string into overlapping character n-grams.
        Params  : string - text to analyse; None is treated as an empty document
        Return  : list of n-gram strings (empty when the cleaned text is shorter than ngram_size)
        """
        if string is None:
            # previously None skipped .lower() but still reached re.sub and raised TypeError
            return []
        string = re.sub(self.regex, r'', string.lower())
        n_grams = zip(*[string[i:] for i in range(self.ngram_size)])
        return [''.join(n_gram) for n_gram in n_grams]

    def fit(self, txt):
        """
        Build and fit the underlying TfidfVectorizer on `txt`.
        Return  : the result of TfidfVectorizer.fit (the fitted vectorizer is also kept in self.vec)
        """
        self.vec = TfidfVectorizer(min_df=1, analyzer=self.get_n_grams, dtype=np.float64)
        return self.vec.fit(txt)

    def transform(self, txt):
        """Vectorize `txt` with the vectorizer created by fit(); fit() must be called first."""
        return self.vec.transform(txt)


class Word2vec_Vecotrizer:
    """Placeholder for a word2vec vectorizer; constructing it only prints a reminder."""

    def __init__(self):
        reminder = "Define a word2vec model here along with fit, transform operations."
        print(reminder)


class Doc2vec_Vecotrizer:
    """Placeholder for a Doc2vec vectorizer; constructing it only prints a reminder."""

    def __init__(self):
        reminder = "Define a Doc2vec model here along with fit, transform operations."
        print(reminder)
        
    


In [390]:
# Model file path per vectorizer engine (set these after training/using vectorizer models).
# Only 'bert' currently points at a saved model; the other engines have none.
config['vectorizer_model_fp'] = dict(tfidf=None,
                                     bert=sbert_model_fp,
                                     word2vec=None,
                                     doc2vec=None)

## 7. Computing Duplicates and Near-Duplicates

One aspect of textual clustering is to avoid unwanted data redundancy. Since we have already preprocessed our sentences (stripped them of various adulterations and common words), multiple sentences may now be reduced to a very similar generic form and can be counted as duplicates or near-duplicates. 

- Performing clustering on such a sparse matrix of [n documents x 768] dimensions, which might contain near-duplicates, can significantly affect memory and space requirements. Hence, we drop the near-duplicate processed tweets in this section.

This is a two-part process.
1. Part 1 involves computing pair-wise sentence-sentence similarity.
2. Part 2 involves ranking similar sentences and consolidating them together as duplicates or near-duplicates.

### Part 1: Computing pair-wise similarity

In [391]:
################################################################################################
# Pair-Wise Similarity Computation Structure 
################################################################################################
class Generate_Similarity_Matrix(object):
    
    def __init__(self, 
                 master, 
                 master_id=None, 
                 duplicates=None, 
                 duplicates_id=None, 
                 min_similarity=0.80, 
                 do_vectorize=True, 
                 vectorizer=None, 
                 vectorizer_pre_model_fp=None, 
                 vectorized_master_fp=None, 
                 vectorized_duplicates_fp=None):

        # UTILITY FUNCTIONS
        def _is_series_of_strings(series_to_test: pd.Series):
            if not isinstance(series_to_test, pd.Series):
                return False
            elif series_to_test.to_frame().applymap(lambda x: not isinstance(x, str)).squeeze(axis=1).any():
                return False
            return True

        def _is_input_data_combination_valid(duplicates, master_id, duplicates_id):
            if duplicates is None and (duplicates_id is not None) or duplicates is not None and (
                    (master_id is None) ^ (duplicates_id is None)):
                return False
            else:
                return True

        # VALIDATE INPUT ARGS
        if not _is_series_of_strings(master) or (duplicates is not None and not _is_series_of_strings(duplicates)):
            raise TypeError('Input does not consist of pandas.Series containing only Strings')
        if not _is_input_data_combination_valid(duplicates, master_id, duplicates_id):
            raise Exception('List of data Series options is invalid')
        if master_id is not None and len(master) != len(master_id):
            raise Exception('Both master and master_id must be pandas.Series of the same length.')
        if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id):
            raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.')
        if do_vectorize and vectorizer is None:
            raise Exception("Define a vectorizer engine using 'vectorizer=' first!")
        if do_vectorize and vectorizer.lower().strip() not in ['tfidf', 'word2vec', 'doc2vec', 'bert']:
            raise Exception("Use a vectorizer from available list of engines defined!")
        if do_vectorize and vectorizer.lower() == 'bert' and vectorizer_pre_model_fp is None:
            raise Exception("Using Transformers Model, define sBert fp using 'vectorizer_pre_model_fp=' first!")
        if do_vectorize is False and vectorized_master_fp is None:
            raise Exception("Provide path of master_file vectorized pickle obj, using 'vectorized_master_fp='")
        if do_vectorize is False and duplicates is not None and vectorized_duplicates_fp is None:
            raise Exception("Provide path of dup_file vectorized pickle obj, using 'vectorized_duplicates_fp='")

        # SAVE INPUT ARGS
        self._master = master
        self._duplicates = duplicates if duplicates is not None else None
        self._master_id = master_id if master_id is not None else None
        self._duplicates_id = duplicates_id if duplicates_id is not None else None
        self.min_similarity = min_similarity
        self.do_vectorize = do_vectorize
        self.vectorizer_name = vectorizer.lower().strip() if vectorizer else None
        self.vectorizer_pre_model_fp = vectorizer_pre_model_fp
        self.master_vector_fp = vectorized_master_fp
        self.duplicates_vector_fp = vectorized_duplicates_fp

        # CONFIG
        self._true_max_n_matches = None
        self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
        self.number_of_processes = multiprocessing.cpu_count() - 1
        self.DEFAULT_COLUMN_NAME = 'side'
        self.DEFAULT_ID_NAME = 'id'
        self.LEFT_PREFIX = 'left_'
        self.RIGHT_PREFIX = 'right_'
        self._matches_list = pd.DataFrame()
        self.is_build = False  # indicates if fit has been called or not


        # ----------------------------------------------------------------------------- #
        # INIT VECTORIZER ENGINE
        # ----------------------------------------------------------------------------- #
        ##########################################
        # Option 1: Vectorize during run-time!
        ##########################################
        if self.do_vectorize:
            if self.vectorizer_name == "tfidf":
                self.ngram_size = 3
                self._vectorizer = Tfidf_Vectorizer(self.ngram_size)
            if self.vectorizer_name == "word2vec":
                raise Exception("Word2vec is not yet defined! Define first in 'Word2vec_Vecotrizer' !")
            if self.vectorizer_name == "doc2vec":
                raise Exception("Doc2vec is not yet defined! Define first in 'Doc2vec_Vecotrizer' !")
            if self.vectorizer_name == "bert":
                tokenizer = AutoTokenizer.from_pretrained(self.vectorizer_pre_model_fp)
                model_bert = AutoModel.from_pretrained(self.vectorizer_pre_model_fp)
                self._vectorizer = Bert_Vectorizer(tokenizer, model_bert, embedding_func=lambda x: x[0][:, 0, :].squeeze())

        ##########################################
        # Option 2: Load vectorized file!
        ##########################################
        else:
            # load master file's vectorized object
            with open(self.master_vector_fp, 'rb') as f:  
                self.master_matrix_loaded_embed = pickle.load(f)
            # load dup file's vectorized object
            if self._duplicates is not None:              
                with open(self.duplicates_vector_fp, 'rb') as f:
                    self.duplicate_matrix_loaded_embed = pickle.load(f)
            else:
                # IF there is no duplicate matrix, match on the master matrix itself!
                self.duplicate_matrix_loaded_embed = self.master_matrix_loaded_embed
        # ----------------------------------------------------------------------------- #
        return

    def fit(self):
        """
        Fit a vectorizer (already init) with Master & Duplicates matrix and calculate cosine-sim without original-ids.
        Params  : Master, Duplicates
        Return  : dataframe{ Master_Text, Duplicates_Text, cosine_sim(vectorizer_master, vectorized_duplicates) }

        """
        # UTILITY FUNCTIONS
        def fix_diagonal(m: lil_matrix):
            r = np.arange(m.shape[0])
            m[r, r] = 1
            return m

        def symmetrize_matrix(m_symmetric: lil_matrix):
            r, c = m_symmetric.nonzero()
            m_symmetric[c, r] = m_symmetric[r, c]
            return m_symmetric

        # Vectorize the matrices
        # - if duplicate matrix is present use it, else utilize master itself
        master_matrix, duplicate_matrix = self.get_vectorized_matrices()

        # Calculate cosine similarity b/w master & duplicates (if passed, else use master itself)
        matches = self.build_matches(master_matrix, duplicate_matrix)
        self._true_max_n_matches = self._max_n_matches - 1

        # Correct sparse matrix multiplcation
        if self._duplicates is None:
            # convert to lil format for best efficiency when setting matrix-elements
            # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by floating-point computations
            #                                             in awesome_cossim_topn sometimes lead to unexpected results)
            matches = matches.tolil()
            matches = fix_diagonal(matches)
            if self._max_n_matches < self._true_max_n_matches:
                matches = symmetrize_matrix(matches)
            matches = matches.tocsr()

        # Create the basic "matches" dataframe with "Master, Duplicate and Similarity" cols only
        r, c = matches.nonzero()
        self._matches_list = pd.DataFrame(
            {'master_side': r.astype(np.int64), 'dupe_side': c.astype(np.int64), 'similarity': matches.data})
        self.is_build = True
        return self

    def get_vectorized_matrices(self):
        """
        Vectorize matrices using one of the vectorizers or load vectorized text.
        Params    : Master, Duplicates, Vectorizer_name("tfidf", "bert", "word2vec", "doc2vec")
        Return    : vectorizer_master, vectorized_duplicates
        """
        def check_csr(matrix):
            """
            Ideally every vectorized matrix should be in a CSR format for faster computations.
            """
            if isinstance(matrix, np.ndarray):      # numpy format -> CSR
                return csr_matrix(Functional.normalize(torch.from_numpy(matrix), p=2, dim=1).numpy().astype(np.float64))
            elif torch.is_tensor(matrix):            # pyTorch.tensor format -> CSR
                return csr_matrix(Functional.normalize(matrix, p=2, dim=1).numpy().astype(np.float64))
            elif isinstance(matrix, csr_matrix):     # CSR format
                return matrix
            else:
                raise Exception("Vectorized matrix format not identitifed! Check CSR properties.")

        def fit_vectorizer():
            # if both master & duplicates series are set - concat them to fit the vectorizer on all strings at once!
            if self._duplicates is not None:
                strings = pd.concat([self._master, self._duplicates])
            else:
                strings = self._master
            self._vectorizer.fit(strings)
            return self._vectorizer

        if self.do_vectorize:
            """
            Vectorization during run-time!
            """
            if self.vectorizer_name == "tfidf":
                print("Vectorizing: tfidf")
                self._vectorizer = fit_vectorizer()
                master_matrix = self._vectorizer.transform(self._master)
                if self._duplicates is not None:
                    duplicate_matrix = self._vectorizer.transform(self._duplicates)
                else:
                    # IF there is no duplicate matrix, match on the master matrix itself!
                    duplicate_matrix = master_matrix

            if self.vectorizer_name == "bert":
                print("Vectorizing: bert")
                master_matrix = check_csr(self._vectorizer.transform(self._master, formatt='csr'))
                if self._duplicates is not None:
                    duplicate_matrix = check_csr(self._vectorizer.transform(self._duplicates, formatt='csr'))
                else:
                    # IF there is no duplicate matrix, match on the master matrix itself!
                    duplicate_matrix = master_matrix
                    
            if self.vectorizer_name == "word2vec":
                raise Exception("Word2vec is not yet defined! Define first in 'Word2vec_Vecotrizer' !")
            
            if self.vectorizer_name == "doc2vec":
                raise Exception("Doc2vec is not yet defined! Define first in 'Doc2vec_Vecotrizer' !")     
        else:
            """
            Using prevectorized text.
            """
            master_matrix = check_csr(self.master_matrix_loaded_embed)
            duplicate_matrix = check_csr(self.duplicate_matrix_loaded_embed)

        return master_matrix, duplicate_matrix

    def build_matches(self, master_matrix, duplicate_matrix):
        """
        Builds the cosine similarity matrix of two CSR matrices.
        Params   : vectorizer_master, vectorized_duplicates
        Return   : cosine_sim(vectorized_master, vectorized_duplicates)
        """
        # Matrix A, B
        tf_idf_matrix_1 = master_matrix
        tf_idf_matrix_2 = duplicate_matrix.transpose()

        # Calculate cosine similarity
        optional_kwargs = {'use_threads': self.number_of_processes > 1, 'n_jobs': self.number_of_processes}
        cosine_sim_matrix = awesome_cossim_topn(tf_idf_matrix_1,
                                                tf_idf_matrix_2,
                                                self._max_n_matches,
                                                self.min_similarity,
                                                **optional_kwargs)
        return cosine_sim_matrix

    def get_matches(self):
        """
        Creates the complete dataframe with index matching(ids) if passed.
        Params  : dataframe
        Return  : dataframe{ Master_ids, Master_Text, cosine_similarity, Duplicate_ids, Duplicates_Text }
        """
        # UTILITY FUNCTIONS
        def get_both_sides(master, duplicates, generic_name=(self.DEFAULT_COLUMN_NAME, self.DEFAULT_COLUMN_NAME),
                           drop_index=False):
            lname, rname = generic_name
            left = master if master.name else master.rename(lname)
            left = left.iloc[matches_list.master_side].reset_index(drop=drop_index)
            if self._duplicates is None:
                right = master if master.name else master.rename(rname)
            else:
                right = duplicates if duplicates.name else duplicates.rename(rname)
            right = right.iloc[matches_list.dupe_side].reset_index(drop=drop_index)
            return left, (right if isinstance(right, pd.Series) else right[right.columns[::-1]])

        def prefix_column_names(data, prefix):
            if isinstance(data, pd.DataFrame):
                return data.rename(columns={c: f"{prefix}{c}" for c in data.columns})
            else:
                return data.rename(f"{prefix}{data.name}")

        if self.min_similarity > 0:
            matches_list = self._matches_list
        else:
            raise Exception("min_similarity cannot be set to less than or equal to 0!")

        # ID Retrival
        left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=False)
        similarity = matches_list.similarity.reset_index(drop=True)
        if self._master_id is None:
            # if ids are not passed
            return pd.concat([prefix_column_names(left_side, self.LEFT_PREFIX),
                              similarity,
                              prefix_column_names(right_side, self.RIGHT_PREFIX)], axis=1)

        else:
            # if ids are passed, retrive ids
            left_side_id, right_side_id = get_both_sides(self._master_id, self._duplicates_id,
                                                         (self.DEFAULT_ID_NAME, self.DEFAULT_ID_NAME), drop_index=True)
            return pd.concat([prefix_column_names(left_side, self.LEFT_PREFIX),
                              prefix_column_names(left_side_id, self.LEFT_PREFIX),
                              similarity,
                              prefix_column_names(right_side_id, self.RIGHT_PREFIX),
                              prefix_column_names(right_side, self.RIGHT_PREFIX)], axis=1)

    def run(self):
        """
        Compute pair-wise similarity.
        """
        st = time.time()
        self.fit()
        sim_df = self.get_matches()
        print("Total time taken (mins): ", (time.time() - st)/60)
        return sim_df



## :: SAMPLE RUN ::

# Run mode 1: 
# Semantic Similarity among all strings of file_A using runtime vectorization:
# matches = Generate_Similarity_Matrix(master = df[<_text_col_>], master_id = df[<_id_col_>], 
#                                      min_similarity=0.80, do_vectorize=True, vectorizer='bert|tfidf|word2vec')
# matches.run()

# Run mode 2: 
# Semantic Similarity among all strings of file_A using prevectorized text:
# matches = Generate_Similarity_Matrix(master = df[<_text_col_>], master_id = df[<_id_col_>], 
#                                      min_similarity=0.80, do_vectorize=False, vectorized_master_fp="PATH")
# matches.run()

# Run mode 3: 
# Semantic similarity between two files A and B using runtime vectorization:
# matches = Generate_Similarity_Matrix(master = df[__text_col_A__], master_id = df[__id_col_A__], 
#                                      duplicates = df[__text_col_B__], duplicates_id = df[__id_col_B__], 
#                                      min_similarity=0.85, do_vectorize=True, vectorizer='bert|tfidf|word2vec')
# matches.run()

# Run mode 4: 
# Textual Semantic similarity between two files A and B using prevectorized text:
# matches = Generate_Similarity_Matrix(master = df[__text_col_A__], master_id = df[__id_col_A__], 
#                                      duplicates = df[__text_col_B__], duplicates_id = df[__id_col_B__], 
#                                      min_similarity=0.85, do_vectorize=False, 
#                                      vectorized_master_fp='PATH', vectorized_duplicates_fp='PATH')
# matches.run()

# Run mode 5: 
# Textual Semantic similarity between two files A and B with no ids, using prevectorized text:
# matches = Generate_Similarity_Matrix(master = df[__text_col_A__], duplicates = df[__text_col_B__],
#                                      min_similarity=0.85, do_vectorize=False, 
#                                      vectorized_master_fp='PATH', vectorized_duplicates_fp='PATH')
# matches.run()

### Part 2: Finding duplicates and near-duplicates

In [392]:
################################################################################################
# Use Pair-Wise Similarity to find near-duplicates
################################################################################################
class find_duplicates:
    
    def __init__(self, data, vectorizer, config, sim_threshold=0.90):
        
        self.org_data = self.data = data
        self.sim_cutoff = sim_threshold
        self.vectorizer = vectorizer.lower().strip()
        if self.vectorizer not in ['tfidf', 'bert', 'word2vec', 'doc2vec']:
            raise Exception("Invalid vectorizer passed!")
        
        self.col_id = config.get('colname_id')
        self.col_clean_txt = config.get('colname_clean_txt')
        self.col_dup_id = config.get('colname_dup_id_col')
        self.col_sim_id = config.get('colname_similar_id_col')
        self.col_dup_sim_id = config.get('colname_dup_similar_id_col')
        self.col_cluster_id = config.get('colname_duplicate_cluster_id')
        self.vectorizer_model_fp = config['vectorizer_model_fp'][self.vectorizer]
    
    
    def collect_dups(self):
        """
        # 1. collect duplicate ids based on "text" col
        """
        dup_dict = self.data.reset_index()\
                    .groupby(self.data[self.col_clean_txt].tolist())[self.col_id]\
                    .agg(list)\
                    .reset_index().reset_index(drop=True)\
                    .rename(columns={"index": self.col_clean_txt, self.col_id: self.col_dup_id})
        dup_dict = dup_dict.set_index(self.col_clean_txt)[self.col_dup_id].to_dict()
        self.data[self.col_dup_id] = self.data[self.col_clean_txt].apply(lambda txt: dup_dict[txt])
        return
   
    def drop_dups(self):
        """
        # 2. drop dup ids, keep first
        """
        self.data = self.data.drop_duplicates(subset=[self.col_dup_id]).reset_index(drop=True)
        return

    def pairwise_similarity_matrix(self):
        """
        # 3. collect pair-wise similairty matches above 'sim_threshold'
        """
        print("Computing pair-wise similarity matrix above score: ", self.sim_cutoff)
        # Pair-wise textual similarity
        pwsim = Generate_Similarity_Matrix(master = self.data[self.col_clean_txt], 
                                           master_id = self.data[self.col_id], 
                                           min_similarity = self.sim_cutoff, 
                                           do_vectorize = True,
                                           vectorizer = self.vectorizer,
                                           vectorizer_pre_model_fp = self.vectorizer_model_fp)
        matches = pwsim.run()

        # group similar-pairs together (left-join)
        left_col_name, left_unique_id, right_unique_id = "left_%s" % self.col_clean_txt, "left_%s" % self.col_id, "right_%s" % self.col_id
        match_df = matches.groupby([left_col_name, left_unique_id])[right_unique_id]\
                          .agg(similar_idx = lambda x: sorted(set(x)))\
                          .reset_index()\
                          .sort_values(by=[left_unique_id], ascending=True)\
                          .reset_index(drop=True)

        # asthestic: drop dummy added left/right names
        matches = matches.drop(columns=['left_index', "right_index"])
        match_df = match_df.rename(columns={left_unique_id: self.col_id, left_col_name: self.col_clean_txt})
        return matches, match_df
    
    def natural_sort_key(self, s):
        """
        ## Utility: alphanumeric sort key
             - splits `s` into digit and non-digit chunks; digit chunks become ints and the
               rest is lower-cased, so that e.g. 'id_2' orders before 'id_10'
        """
        sort_key = []
        for chunk in re.split('([0-9]+)', s):
            sort_key.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return sort_key
        
    def combine_dup_similar(self, match_df):
        """
        # 4. create "dup_similar_idx" col - merge dup_id data with similar_id data
        """
        # merge df_duplicates with df_similar
        cols_to_use = [self.col_id, self.col_sim_id]
        self.data = self.data.merge(match_df[cols_to_use], on=self.col_id, how='outer')
        # create combined list == "duplicated_pairs_idx" + "similar_pairs_idx
        self.data[self.col_dup_sim_id] = [sorted(set(sum(tup, []))) for tup in zip(self.data[self.col_dup_id], self.data[self.col_sim_id])]
        # custom sorting (to handle alphanumeric ids)
        if isinstance(self.data[self.col_dup_sim_id][0], str):
            self.data[self.col_dup_sim_id] = self.data[self.col_dup_sim_id].apply(lambda x: sorted(x, key=natural_sort_key))
        return
    
    def collect_similar_ids(self):
        """
        # 5. merged all nested lists containing common sub-elements in "dup_similar_id" cols
        """
        # collect nested list which needs to be merged
        list_similar_ids = list(map(list, self.data[self.col_dup_sim_id]))

        # merge all nested lists with common elements
        g = nx.Graph()
        edges = [g.add_edges_from(zip(p, p[1:])) if len(p)>1 else g.add_edges_from(zip(p, p[:])) for p in list_similar_ids]
        merged_similar_idx = [sorted(c) for c in nx.connected_components(g)]

        # create two mappings, one for storing cluster_id: list of ids, and one inverted dict
        # --> "id_clus_dict" is the cluster id mapping for each 'unique_id'
        temp_id = 1
        clus_id_dict = {}      # cluster_1: merged([id1, id2,..., idn])
        id_clus_dict = {}      # merged(id1): cluster_1; merged(id1): cluster_1; .., merged(idn): cluster_1
        for lst in merged_similar_idx:
            key = "dup_%s" % temp_id
            for value in lst:
                id_clus_dict[value] = key
            clus_id_dict[key] = lst
            temp_id += 1 

        # assign dup_similar_idx based on two mappings above
        self.data[self.col_dup_sim_id] = self.data[self.col_id].apply(lambda uid: clus_id_dict[id_clus_dict[uid]])

        # create duplicate id mapping and similar id mapping files
        dup_id_dict = {_id: ids for ids in self.data[self.col_dup_id].tolist() for _id in ids}
        sim_id_dict = {_id: ids for ids in self.data[self.col_sim_id].tolist() for _id in ids}
        dup_sim_id_dict = {_id: ids for ids in self.data[self.col_dup_sim_id].tolist() for _id in ids}

        # custom sorting (to handle alphanumeric ids)
        if isinstance(self.data[self.col_dup_sim_id][0], str):
            self.data[self.col_dup_sim_id] = self.data[self.col_dup_sim_id].apply(lambda x: sorted(x, key=natural_sort_key))
        return clus_id_dict, id_clus_dict, dup_id_dict, sim_id_dict, dup_sim_id_dict
    
    def create_final_single_matrix(self):
        """
        # 6. Drop duplicates based on dup_similar_id_col, i.e. duplicated_id + similar_ids
        """
        self.data[self.col_dup_sim_id] = tuple(map(tuple, self.data[self.col_dup_sim_id]))
        self.data = self.data.drop_duplicates(subset=[self.col_dup_sim_id]).reset_index(drop=True)
        self.data[self.col_dup_sim_id] = list(map(list, self.data[self.col_dup_sim_id]))
        return
    
    def create_clusters(self, idx_cluster_map):
        """
        # 7. Expand each id to assign clusters
        """
        self.data[self.col_cluster_id] = self.data[self.col_id].apply(lambda uid: idx_cluster_map.get(uid, -1))
        return
    
    def make_changes_in_orig_df(self, dup_id_dict, sim_id_dict, dup_sim_id_dict, idx_cluster_map):
        """
        # 8. Over-ride found details on the original dataset
        """
        self.org_data[self.col_dup_id] = self.org_data[self.col_id].apply(lambda uid: dup_id_dict.get(uid, -1))
        self.org_data[self.col_sim_id] = self.org_data[self.col_id].apply(lambda uid: sim_id_dict.get(uid, -1))
        self.org_data[self.col_dup_sim_id] = self.org_data[self.col_id].apply(lambda uid: dup_sim_id_dict.get(uid, -1))
        self.org_data[self.col_cluster_id] = self.org_data[self.col_id].apply(lambda uid: idx_cluster_map.get(uid, -1))
        self.org_data[self.col_dup_sim_id] = tuple(map(tuple, self.org_data[self.col_dup_sim_id]))
        print("> Duplicate and very similar records identified.")
        return

    def create_cluster_data(self):
        """
        # 9. Create final Clustering Dataset
             - drop duplicates and very similar rows, they can be mapped back using 'dup_similar_idx'
        """
        df_cluster = self.org_data.drop_duplicates(subset=[self.col_dup_sim_id]).reset_index(drop=True)
        print('> Dups & very similar roecrds dropped. Final Unique df_cluster Shape = ', df_cluster.shape)
        return df_cluster

    def run_stats(self):
        """
        # 10. Display Analytics
        """
        print("Stats:\n"
              "\nOrignal number of records = {}"
              "\nTotal dup count = {}"
              "\nTotal similar pairs found = {}"
              "\nFinal number of records post dup-similar rows removal = {}"\
              .format(len(self.org_data), 
                      len(sum(self.data[self.col_dup_id].tolist(), [])), 
                      len(sum(self.data[self.col_sim_id].tolist(), [])),
                      len(self.data)))
        return
    
    
    def EXE(self):
        """
        Runs the complete duplicate / near-duplicate pipeline, step by step, in a fixed order.

        :return: tuple (self.org_data, df_cluster) - the original data as left by the pipeline steps,
                 and the frame returned by `create_cluster_data`
        """
        n_original = self.org_data.shape[0]
        print("Original shape #records: ", n_original)

        # step 1 + 2: collect the exact duplicates, then drop them
        self.collect_dups()
        self.drop_dups()
        n_after_exact = self.data.shape[0]
        print("Exact duplicates dropped. New shape #records: ", n_after_exact)

        # step 3: pair-wise similarity on what is left (only the match dataframe is used below)
        _matches, match_df = self.pairwise_similarity_matrix()

        # step 4: merge dup_ids with very_similar_ids
        self.combine_dup_similar(match_df)

        # step 5: merged nxn graphs for "dup_similar_id" cols (the first returned map is not used here)
        (_cluster_id_map,
         idx_cluster_map,
         dup_id_dict,
         sim_id_dict,
         dup_sim_id_dict) = self.collect_similar_ids()

        # step 6: drop duplicates and near-duplicates based on dup_similar_id_col
        self.create_final_single_matrix()

        # step 7: expand each id to assign clusters
        self.create_clusters(idx_cluster_map)

        # step 8: include changes in original data
        self.make_changes_in_orig_df(dup_id_dict, sim_id_dict, dup_sim_id_dict, idx_cluster_map)

        # step 9: drop duplicate_similar_idx rows
        df_cluster = self.create_cluster_data()

        # step 10: stats
        self.run_stats()

        return self.org_data, df_cluster

##### Execute: Two-part Near-Dup computation

- Identify duplicates and **very similar** records (use a strict similarity threshold of around 0.90-0.95, otherwise the matches will contain noise), and collect their ids.
- So that clustering can run on remaining **unique** records. We can later assign the same cluster_id to collected 'dup_similar_idx' ones.

In [393]:
# Flag exact duplicates and very similar records, using a similarity threshold of 0.90.
# NOTE(review): the positional 'bert' argument is presumably the vectorizer choice - confirm against find_duplicates.__init__.
dup_finder = find_duplicates(df, 'bert', config, sim_threshold=0.90)
# EXE() returns (full annotated frame, de-duplicated frame). `df` is rebound here, so re-running
# only this cell passes the already-annotated frame back into find_duplicates.
df, df_cluster = dup_finder.EXE()
print(df.shape, "--->", df_cluster.shape)

# Save a copy! (index=False: the row index is not written to the files)
df.to_csv(os.path.join(data_dir, "tweets_preprocessed.csv"), index=False)
df_cluster.to_csv(os.path.join(data_dir, "tweets_cluster_final.csv"), index=False)

Original shape #records:  500
Exact duplicates dropped. New shape #records:  469
Computing pair-wise similarity matrix above score:  0.9
Vectorizing: bert
Total time taken (mins):  0.7107892870903015
> Duplicate and very similar records identified.
> Dups & very similar roecrds dropped. Final Unique df_cluster Shape =  (456, 9)
Stats:

Orignal number of records = 500
Total dup count = 481
Total similar pairs found = 469
Final number of records post dup-similar rows removal = 456
(500, 9) ---> (456, 9)


In [394]:
# First 10 rows of the full frame returned by the duplicate finder (all original records are still present).
df.head(10)

Unnamed: 0,UID,tweet,label,Processed_tweet,lang_mask,dup_idx,similar_idx,dup_similar_idx,dup_cluster_id
0,uidx_0,"All campus dining locations are closed today, ...",cu_others,campus dining location close today jan 1 happy...,en,[uidx_0],[uidx_0],"(uidx_0,)",dup_1
1,uidx_1,"What are your campus dining options today, Jan...",cu_others,what your campus dining option today jan 2 alf...,en,[uidx_1],"[uidx_1, uidx_3]","(uidx_1, uidx_3)",dup_2
2,uidx_2,#FPGA Design for #Embedded #Systems\n\n#SoC #V...,cu_online,fpga design embed system soc verilog vlsi asic...,en,[uidx_2],[uidx_2],"(uidx_2,)",dup_3
3,uidx_3,"What are your campus dining options today, Jan...",cu_others,what your campus dining option today jan 3 alf...,en,[uidx_3],"[uidx_1, uidx_3]","(uidx_1, uidx_3)",dup_2
4,uidx_4,As an anthro PhD student I’m frequently asked ...,cu_research,anthro phd student m frequently ask why study ...,en,[uidx_4],[uidx_4],"(uidx_4,)",dup_4
5,uidx_5,True or False: Concussions for college student...,cu_research,true false concussion college student signific...,en,[uidx_5],[uidx_5],"(uidx_5,)",dup_5
6,uidx_6,@michaelgrandner @CUBoulder Very interesting w...,cu_research,interesting work additional tool investigation...,en,[uidx_6],[uidx_6],"(uidx_6,)",dup_6
7,uidx_7,@ShellyMBoulder Thanks so much @ShellyMBoulder...,cu_research,thank much fun talk research,en,[uidx_7],[uidx_7],"(uidx_7,)",dup_7
8,uidx_8,"@LiberalsAreMean @CUBoulder Ha! In my day job,...",cu_research,ha day job turbulence model computational flui...,en,[uidx_8],[uidx_8],"(uidx_8,)",dup_8
9,uidx_9,Dr. Hutchison speaks to @5280Magazine about th...,cu_online,dr hutchison speak cannabis health course cour...,en,[uidx_9],[uidx_9],"(uidx_9,)",dup_9


In [395]:
# First 10 rows of the de-duplicated frame: one row per dup_similar_idx group.
df_cluster.head(10)

Unnamed: 0,UID,tweet,label,Processed_tweet,lang_mask,dup_idx,similar_idx,dup_similar_idx,dup_cluster_id
0,uidx_0,"All campus dining locations are closed today, ...",cu_others,campus dining location close today jan 1 happy...,en,[uidx_0],[uidx_0],"(uidx_0,)",dup_1
1,uidx_1,"What are your campus dining options today, Jan...",cu_others,what your campus dining option today jan 2 alf...,en,[uidx_1],"[uidx_1, uidx_3]","(uidx_1, uidx_3)",dup_2
2,uidx_2,#FPGA Design for #Embedded #Systems\n\n#SoC #V...,cu_online,fpga design embed system soc verilog vlsi asic...,en,[uidx_2],[uidx_2],"(uidx_2,)",dup_3
3,uidx_4,As an anthro PhD student I’m frequently asked ...,cu_research,anthro phd student m frequently ask why study ...,en,[uidx_4],[uidx_4],"(uidx_4,)",dup_4
4,uidx_5,True or False: Concussions for college student...,cu_research,true false concussion college student signific...,en,[uidx_5],[uidx_5],"(uidx_5,)",dup_5
5,uidx_6,@michaelgrandner @CUBoulder Very interesting w...,cu_research,interesting work additional tool investigation...,en,[uidx_6],[uidx_6],"(uidx_6,)",dup_6
6,uidx_7,@ShellyMBoulder Thanks so much @ShellyMBoulder...,cu_research,thank much fun talk research,en,[uidx_7],[uidx_7],"(uidx_7,)",dup_7
7,uidx_8,"@LiberalsAreMean @CUBoulder Ha! In my day job,...",cu_research,ha day job turbulence model computational flui...,en,[uidx_8],[uidx_8],"(uidx_8,)",dup_8
8,uidx_9,Dr. Hutchison speaks to @5280Magazine about th...,cu_online,dr hutchison speak cannabis health course cour...,en,[uidx_9],[uidx_9],"(uidx_9,)",dup_9
9,uidx_10,Could fluid dynamics research pave the way for...,cu_research,could fluid dynamic research pave way intraven...,en,[uidx_10],[uidx_10],"(uidx_10,)",dup_10


---

## 8. Clustering Dataset

- 1. `tweets.csv` - original data file


- 2. `tweets_preprocessed.csv` - pre-processed file including dup and near-dup ids.


- 3. `tweets_cluster_final.csv` - final pre-processed file for clustering (duplicates and near-duplicates dropped).

In [396]:
# Preprocessed Dataset (all records, annotated with dup / near-dup ids)
# NOTE(review): after the CSV round trip, list/tuple-valued columns (the dup / similar id columns)
# are presumably read back as plain strings - parse them before relying on their length or members.
df = pd.read_csv(os.path.join(data_dir, "tweets_preprocessed.csv"))
print("-> Our preprocessed dataset has #records: ", df.shape[0])

# Final Dataset to be clustered (one row per duplicate / near-duplicate group)
df_cluster = pd.read_csv(os.path.join(data_dir, "tweets_cluster_final.csv"))
print("-> Our dataset to be clustered has #records: ", df_cluster.shape[0])

-> Our preprocessed dataset has #records:  500
-> Our dataset to be clustered has #records:  456


## 9. Clustering

Clustering Algorithm:

1. Load configuration file - identify the column names (e.g. source_) that must be present in the input data schema.
2. Update clustering configuration.
3. Load the input file (csv, excel).
4. Run Similarity computation using Fast Bert to identify duplicate and very similar records in the input data.
5. Perform clustering only on unique records (dups dropped)
    
    5.1. Execute `run_clustering` - which accepts a 'preprocess' param to accept processed or un-processed text. 
    
    5.2. Prepare input data: if _preprocess=True_, execute a preprocessing module `cluster_preprocessing_run_pipeline`.
    
    5.3. Run `doc_clustering` to perform each iteration of clustering algorithm
    
    5.4. Use other helper modules `cluster_cohesion_cosine` and `cluster_center_sent` to facilitate the clustering process.
    
    5.5. Return cluster information on unique records.
    
6. Merge the original dataframe (with dups and very similar records) with clustered data on unique rows.
7. Consolidate cluster information on all records with above merged data.
8. Create visual reports.

### Clustering Algorithm and Utilities

In [397]:
####################################################################################
#  Clustering Utilities...
####################################################################################

# preprocessing text - Only runs when: run_clustering(df_cluster, config, preprocess=True, vectorizer='bert')
def cluster_preprocessing_run_pipeline(input_json, spacy_model):
    """
    Runs the Spacy pipeline on one text and re-packs the result as a CoreNLP-style JSON dict.

    param: input_json = {'text': sentence, 'props': props_str}
           props_str is a list of {'key': ..., 'value': ...} dicts. Only the 'annotators' entry is read:
           a comma-separated list that switches the optional outputs on ('lemma', 'pos', 'ner', 'parse').
           When it is missing, no optional output is produced (previously this raised UnboundLocalError).
    param: spacy_model = callable that returns a Spacy Doc for a text
    return: output_json = {'sentences': [{'index', 'line', 'tokens', ...}, ...]}
            Token and dependency indices are 1-based (pipeline index + 1); governor 0 marks the root.
    """
    # Get all parameters from input JSON
    text = input_json["text"]
    operations = []  # default: no optional annotation requested
    for prop in input_json["props"]:
        if prop["key"] == "annotators":
            # tolerate "pos, lemma" style lists with spaces after the commas
            operations = [op.strip() for op in prop["value"].split(",")]

    # Run Spacy Pipeline
    doc = spacy_model(text)
    doc_json = doc.to_json()  # Includes all info from spacy pipeline

    output_json = {"sentences": []}

    # a. create necessary dictionaries
    # a.1 Extract Entity List
    entity_list = doc_json["ents"]
    # a.2 create token lib (token id -> token dict, used to look up the head token's text span)
    token_lib = {token["id"]: token for token in doc_json["tokens"]}

    # b. add sentence indices
    for i, sentence in enumerate(doc_json["sents"]):
        out_sentence = {"index": i, "line": 1, "tokens": []}
        parse = ""  # no constituency parse is produced; the key is only filled for format compatibility
        basicDependencies = []
        output_json["sentences"].append(out_sentence)

        # c. split sentences by indices, add labels (pos, ner, dep, etc.)
        for token in doc_json["tokens"]:
            inside_sentence = sentence["start"] <= token["start"] and token["end"] <= sentence["end"]
            if not inside_sentence:
                continue

            # Extract Entity label ("O" = outside any entity; the last enclosing entity wins)
            ner = "O"
            for entity in entity_list:
                if entity["start"] <= token["start"] and token["end"] <= entity["end"]:
                    ner = entity["label"]

            # Extract dependency info; a token that is its own head is the root
            dep = token["dep"]
            if token["head"] == token["id"]:
                governor = 0
                governorGloss = "ROOT"
            else:
                head_token = token_lib[token["head"]]
                governor = token["head"] + 1  # CoreNLP index = pipeline index +1
                governorGloss = text[head_token["start"]: head_token["end"]]
            dependent = token["id"] + 1
            dependentGloss = text[token["start"]:token["end"]]
            lemma = doc[token["id"]].lemma_

            # d. add dependencies
            basicDependencies.append({"dep": dep,
                                      "governor": governor,
                                      "governorGloss": governorGloss,
                                      "dependent": dependent,
                                      "dependentGloss": dependentGloss})

            # e. add tokens
            out_token = {"index": token["id"] + 1,
                         "word": dependentGloss,
                         "originalText": dependentGloss,
                         "characterOffsetBegin": token["start"],
                         "characterOffsetEnd": token["end"]}
            if "lemma" in operations:
                out_token["lemma"] = lemma
            if "pos" in operations:
                out_token["pos"] = token["tag"]  # the "tag" field of the pipeline output is exposed as 'pos'
            if "ner" in operations:
                out_token["ner"] = ner
            out_sentence["tokens"].append(out_token)

        if "parse" in operations:
            out_sentence["parse"] = parse
            out_sentence["basicDependencies"] = basicDependencies
            out_sentence["enhancedDependencies"] = out_sentence["basicDependencies"]
            out_sentence["enhancedPlusPlusDependencies"] = out_sentence["basicDependencies"]

    return output_json

# cohesion calculation
def cluster_cohesion_cosine(clustering, cluster_set_by_sid, data):
    '''
    :: COHESION SCORE ::
    Calculates the cohesion score of one cluster: its members are re-clustered with k=1,
    e.g. KMeans(cluster_id_1_documents, k=1), and the score is the average cosine similarity
    between every member vector and that single center.

    :param clustering: object exposing run_doc_clustering(documents, k) -> (model, labels, centers, X)
    :param cluster_set_by_sid: ids (keys of `data`) of the cluster members
    :param data: dict id -> record; only record['sent_lemma'] is read
    :return: average similarity to the center; 0 for an empty cluster
    '''
    member_ids = list(cluster_set_by_sid)
    if not member_ids:
        # nothing to vectorize - answer directly instead of failing inside the vectorizer
        return 0

    document_center = [data[sid]['sent_lemma'] for sid in member_ids]

    # clustering one cluster members with k=1 to find relative distances
    model_center, cluster_labels_, cluster_centers_center, X_center = clustering.run_doc_clustering(document_center, 1)
    center = cluster_centers_center[0, :]

    values = []
    for i in range(len(member_ids)):
        d_vec = X_center[i].toarray()
        distance = spatial.distance.cosine(d_vec[0], center)
        # an undefined distance (NaN, e.g. an all-zero vector) counts as "no similarity"
        values.append(0 if math.isnan(distance) else 1 - distance)

    # avg similarity of all members
    return sum(values) / len(values)

# centroid calculation
def cluster_center_sent(clustering, cluster_set_by_sid, data):
    '''
    :: CLUSTER CENTRE ::
    Finds the two members of a cluster that are closest to its center. The members are re-clustered
    with k=1 and every member with count > 0 is scored by cosine similarity to that center.

    :param clustering: object exposing run_doc_clustering(documents, k) -> (model, labels, centers, X)
    :param cluster_set_by_sid: ids (keys of `data`) of the cluster members
    :param data: dict id -> record; record['sent_lemma'] and record['count'] are read
    :return: (seedid1, seedid2, values, values_sid_set)
             seedid1 / seedid2 - ids of the closest and second-closest member
             values            - similarity to the center, aligned with values_sid_set
             values_sid_set    - ids the similarities belong to
             The same 4-tuple is returned on every path (the single-document shortcut used to
             return only two items, which broke callers that unpack four).
    '''
    member_ids = list(cluster_set_by_sid)
    document_center = [data[sid]['sent_lemma'] for sid in member_ids]
    non_empty_ids = [sid for sid in member_ids if len(data[sid]['sent_lemma']) > 0]

    if len(non_empty_ids) == 1:
        # Only one member carries text, so it is the seed. Every member is reported with
        # similarity 1, which is also what the scoring loop below assigns to undefined (NaN) cases.
        seedid1 = non_empty_ids[0]
        return seedid1, seedid1, [1] * len(member_ids), member_ids

    # clustering one cluster members with k=1 to find relative distances
    model_center, cluster_labels_, cluster_centers_center, X_center = clustering.run_doc_clustering(document_center, 1)
    center = cluster_centers_center[0, :]

    DISTMIN = -9999  # sentinel lower than any similarity; masks the best score while picking the runner-up

    values = []
    values_sid_set = []
    # remove duplicated ones
    for i, sid_data in enumerate(member_ids):
        if data[sid_data]["count"] > 0:
            d_vec = X_center[i].toarray()
            distance = spatial.distance.cosine(d_vec[0], center)
            values.append(1 if math.isnan(distance) else 1 - distance)
            values_sid_set.append(sid_data)

    # best match
    seedid1_tmp = values.index(max(values))
    min_dist1 = values[seedid1_tmp]
    seedid1 = values_sid_set[seedid1_tmp]

    # runner-up: mask the best score, search again, then restore it
    values[seedid1_tmp] = DISTMIN
    seedid2_tmp = values.index(max(values))
    seedid2 = values_sid_set[seedid2_tmp]
    values[seedid1_tmp] = min_dist1

    return seedid1, seedid2, values, values_sid_set






####################################################################################
#  Clustering Algorithm
####################################################################################

class doc_clustering():
    """
    K-Means based document clustering helper.

    Bundles the clustering configuration, the text-normalisation resources (stopwords and optional
    custom synonyms) and the vectorize-then-cluster routine used for every clustering pass.
    """

    # Lemmas that are kept even when the stopword file lists them.
    _KEEP_LEMMAS = frozenset(['want', 'against', 'further', 'online', 'same', 'under',
                              'what', 'when', 'own', ''])
    # Bracket / pronoun placeholders and stray symbols that are always dropped.
    _NOISE_LEMMAS = frozenset([":", "-lrb-", "-rrb-", "-lsb-", "-rsb-", "\\", '-pron-', '_', 'card num', '"'])
    # K-Means iteration cap used when the config provides no 'kmeans_maxIter'.
    _DEFAULT_MAX_ITER = 2000

    def __init__(self, vectorizer, config):
        """
        :param vectorizer: one of 'tfidf', 'bert', 'word2vec', 'doc2vec' (case / surrounding spaces ignored)
        :param config: dict with the 'kmeans_*' settings, 'vectorizer_model_fp' (model path per vectorizer)
                       and 'nlp_resources_fp' (folder holding stopwords.txt and, optionally,
                       synonyms_noun_verb.txt)
        :raises Exception: for an unknown vectorizer name
        """
        # :: clustering config ::
        self.clustering_type = "noun"
        self.cluster_k = config.get('kmeans_k')
        self.kmeans_rate = config.get('kmeans_rate')
        self.kmeans_seed_init = config.get('kmeans_seedInit')
        self.kmeans_maxiter = config.get('kmeans_maxIter')
        self.cluster_length = config.get('kmeans_clusterLen')
        self.cohesion_threshold = config.get('kmeans_cohesion_threshold')
        self.vectorizer = vectorizer.lower().strip()
        if self.vectorizer not in ['tfidf', 'bert', 'word2vec', 'doc2vec']:
            raise Exception("Invalid vectorizer passed!")
        self.vectorizer_model_fp = config['vectorizer_model_fp'][self.vectorizer]
        # (tokenizer, model) pair, loaded lazily on the first 'bert' vectorization and then reused
        self._bert_components = None

        # :: resource config ::
        self.stopwd_file = os.path.join(config.get('nlp_resources_fp'), "stopwords.txt")
        self.spacy_model = nlp  # notebook-level spacy pipeline
        self.stopwords = []
        self.test_mode = "no"

        # :: CUSTOM DOMAIN SYNONYMS ::
        self.synonym_file = os.path.join(config.get('nlp_resources_fp'), "synonyms_noun_verb.txt")
        self.use_custom_synonyms = os.path.exists(self.synonym_file)
        self.synonym_dict = {}
        print("Custom Synonyms Mapping (on/off): ", self.use_custom_synonyms)

        self.load_resources()

    def load_resources(self):
        """
        Loads the stopword list and, when the synonym file exists, the custom synonym mapping.

        Synonym file format: one tab-separated line per entry - <pos> <head word> <synonym> [<synonym> ...].
        Lines shorter than 4 characters or starting with '#' are skipped. The result is stored as
        self.synonym_dict[synonym][pos] = head word.
        """
        # load stopwords
        with io.open(self.stopwd_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords = [x.rstrip() for x in f.readlines()]

        ####################################################################
        # load CUSTOM DOMAIN SYNONYMS
        ####################################################################
        if not self.use_custom_synonyms:
            return

        with io.open(self.synonym_file, 'r', encoding='utf-8', errors='ignore') as f:
            synonym_list = [x.rstrip() for x in f.readlines()]

        for line in synonym_list:
            if len(line) < 4 or re.search("^#", line):
                continue
            syn_words = line.lower().split("\t")
            if len(syn_words) < 2:
                # malformed line without a head word - skip it instead of raising IndexError
                continue
            pos, hwd = syn_words[0], syn_words[1]
            for s in syn_words[2:]:
                self.synonym_dict.setdefault(s, {})[pos] = hwd
        ####################################################################

    def get_pos_list(self, results):
        """
        Builds the cleaned lemma string of one text from the pipeline output.

        Tokens are dropped when their lemma is a stopword (unless whitelisted), a known noise symbol,
        made of non-word characters only, 30+ characters long, or when their tag is ':' or '.'.
        Remaining lemmas are optionally replaced by their custom-synonym head word.

        :param results: output of cluster_preprocessing_run_pipeline; every token needs 'pos', 'word', 'lemma'
        :return: (all words joined by space, lemma string or "NO_NP_KEYS" when nothing is left, " ", " ")
                 - the last two items are placeholders
        """
        word_list = []
        lemma_list = []
        stopwords = set(self.stopwords)  # set lookup instead of scanning the list for every token

        for line in results['sentences']:
            for token in line['tokens']:
                pos = token['pos'].lower()
                word_list.append(token['word'])
                lemma = token['lemma'].lower()

                # The original pattern '^([\W]*)$]' had a stray ']' and could never match.
                # '+' rather than '*' so that an empty lemma is still treated as before.
                if (lemma in stopwords and lemma not in self._KEEP_LEMMAS) \
                        or lemma in self._NOISE_LEMMAS \
                        or pos == ":" or pos == "." \
                        or re.search(r'^\W+$', lemma) \
                        or len(lemma) >= 30:
                    continue

                # coarse word class used as key into the synonym mapping
                if re.search('^nn', pos):
                    pos_syn = 'noun'
                elif re.search('^v', pos):
                    pos_syn = 'verb'
                elif re.search('^(adj|jj)', pos):
                    # 'jj*' added so adjective tags in Penn-Treebank style are recognised as well
                    pos_syn = 'adj'
                else:
                    pos_syn = ""

                ####################################################################
                # use CUSTOM DOMAIN SYNONYMS
                # replace lemma with primary synonym key!
                ####################################################################
                if self.use_custom_synonyms:
                    head_word = self.synonym_dict.get(lemma, {}).get(pos_syn)
                    if head_word is not None:
                        lemma = head_word
                ####################################################################

                lemma_list.append(lemma)

        if len(lemma_list) == 0:
            sent_lemma = "NO_NP_KEYS"
        else:
            sent_lemma = " ".join(lemma_list)
        return " ".join(word_list), sent_lemma, " ", " "

    def get_sent_lemma(self, sent):
        """
        Runs the spacy pipeline on one sentence and returns the result of get_pos_list for it.

        :param sent: raw text
        :return: same 4-tuple as get_pos_list
        """
        # allowed spacy attributes
        props_str = [{'value': 'false', 'key': 'enforceRequirements'},
                     {'value': 'json', 'key': 'outputFormat'},
                     {'value': 'normalizeSpace=false, strictTreebank3=true', 'key': 'tokenize.options'},
                     {'value': 'tokenize,ssplit,pos,lemma,ner', 'key': 'annotators'},
                     {'value': 'true', 'key': 'ssplit.eolonly'}]
        input_json = {'text': sent, 'props': props_str}
        results = cluster_preprocessing_run_pipeline(input_json, self.spacy_model)
        return self.get_pos_list(results)

    def run_doc_clustering(self, documents, k_val, run_analysis=False):
        """
        Vectorizes the documents and clusters them with K-Means.

        :param documents: list of texts
        :param k_val: explicit number of clusters; 0 = derive K from the heuristic of self.clustering_type
                      ('noun': int(sqrt(n/2)) * kmeans_rate, 'verb': int(sqrt(n/2)), 'fixed': 2000)
        :param run_analysis: when True (and k_val == 0 and heuristic K >= 25) search K in
                             [K-20, K+20) by silhouette score
        :return: (fitted KMeans model, cluster labels, cluster centers, vectorized documents X)
        :raises Exception: explicit k_val larger than half the number of documents ('noun' type),
                           a vectorizer that is not implemented, or a missing BERT model path
        """
        ninit = self.kmeans_seed_init
        # honour the configured iteration cap (it used to be read but never applied)
        max_iter = self.kmeans_maxiter or self._DEFAULT_MAX_ITER
        r_num = len(documents) - 1  # random_state is tied to the corpus size
        k_val = int(k_val)

        # Validate an explicit K before the (possibly expensive) vectorization.
        if k_val > 0 and self.clustering_type == "noun" and len(documents) / 2 < k_val:
            # parenthesised so that .format() fills every placeholder, not just the last line's
            debug_msg = ("Number of input lines: {}\n"
                         "K value provided: {}\n"
                         "Suggested value of K for the 1st level clustering is less than input-lines divided by 2\n"
                         "Please try again!  Heusristic value of K (if line number > 100): {}"
                         ).format(len(documents), k_val, int(np.sqrt(len(documents)/2))*self.kmeans_rate)
            raise Exception(debug_msg)

        ################################################################################
        # Vectorization Engine
        ################################################################################
        if self.vectorizer=="tfidf":
            print("Vectorizing using...: tfidf records: ", len(documents))
            vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1))
            X = vectorizer.fit_transform(documents)

        elif self.vectorizer=="word2vec":
            print("Vectorizing using...: Word2Vec records: ", len(documents))
            raise Exception("Word2vec is not yet defined! Define first in 'Word2vec_Vecotrizer' !")

        elif self.vectorizer=="doc2vec":
            print("Vectorizing using...: Doc2Vec records: ", len(documents))
            raise Exception("Doc2vec is not yet defined! Define first in 'Doc2vec_Vecotrizer' !")

        elif self.vectorizer=="bert":
            if not self.vectorizer_model_fp:
                raise Exception("For using sBERT, you must define model-path in the config!")
            print("Vectorizing using...: sBert records: ", len(documents))
            # This method runs once per cluster during cohesion / center computation, so the
            # tokenizer and model are loaded from disk only once and reused afterwards.
            components = getattr(self, '_bert_components', None)
            if components is None:
                components = (AutoTokenizer.from_pretrained(self.vectorizer_model_fp),
                              AutoModel.from_pretrained(self.vectorizer_model_fp))
                self._bert_components = components
            tokenizer, model_bert = components
            vectorizer = Bert_Vectorizer(tokenizer, model_bert, embedding_func=lambda x: x[0][:, 0, :].squeeze())
            X = vectorizer.transform(documents, formatt='csr')

        else:
            raise Exception("Please specify a vectorizer from available: ['tfidf', 'word2vec', 'doc2vec', 'bert']")
        print("Vectorization complete!")
        ################################################################################

        # Value of "K" clusters
        true_k = k_val

        ## --> Heusristics <--
        if self.clustering_type == "fixed":
            true_k = 2000
        elif self.clustering_type == "noun":
            # int(): a fractional kmeans_rate must not produce a fractional number of clusters
            true_k = int(int(np.sqrt(len(documents) / 2)) * self.kmeans_rate)
        elif self.clustering_type == "verb":
            true_k = int(np.sqrt(len(documents) / 2))
        else:
            print("Please set up clustering type! Options=['fixed', 'noun', 'verb']")
        print("Heusristic value of K: true_k=", true_k)

        ## --> Elbow method (silhouette) <--
        if run_analysis and k_val==0 and true_k>=25:
            search_range = range(true_k-20, true_k+20, 1)
            sil_score_max = -1 # minimum possible score
            for n_clusters in search_range:
                model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=max_iter, random_state=r_num, n_init=ninit)
                labels = model.fit_predict(X)
                sil_score = silhouette_score(X, labels)
                print("The average silhouette score for %i clusters is %0.2f" %(n_clusters, sil_score))
                if sil_score > sil_score_max:
                    true_k, sil_score_max = n_clusters, sil_score
            print("Silhouette Analysis completed! Best_n_clusters K=", true_k)

        # an explicit (already validated) K overrides the heuristic for 'noun' clustering
        if k_val > 0 and self.clustering_type == "noun":
            true_k = k_val

        # Final Clustering
        if true_k == 0:
            true_k = 1
        if 0 < len(documents) < true_k:
            # K-Means cannot build more clusters than there are documents
            true_k = len(documents)
            print("K reduced to the number of documents: true_k=", true_k)
        model = KMeans(n_clusters=true_k, init='k-means++', max_iter=max_iter, random_state=r_num, n_init=ninit)
        model.fit(X)

        cluster_labels = model.labels_
        cluster_centers = model.cluster_centers_
        print("Clustering is complete.")

        return model, cluster_labels, cluster_centers, X



## SAMPLE EXECUTION

# clustering_obj = doc_clustering(vectorizer='bert', config=config)   # vectorizer: 'tfidf' or 'bert'
# sent_tokens, sent_lemma, sent_verb, sent_dup = clustering_obj.get_sent_lemma(input_sentence)
# model, cluster_labels, cluster_centers, X_vectorized = clustering_obj.run_doc_clustering(sentences, cluster_k, run_analysis=False)

### Clustering Pipeline

In [398]:
def run_clustering(df_cluster, config, preprocess=False, vectorizer='bert'):
    """
    Clusters the de-duplicated records with up to two passes of K-Means.

    Pass 1 clusters the whole corpus with a heuristic K. Every pass-1 cluster that has more than
    `kmeans_clusterLen` members and a cohesion below `kmeans_cohesion_threshold` is clustered again
    (pass 2); its members get a "<pass1 id>_<pass2 id>" cluster id. The final clusters are then
    re-numbered and described by their two most central members.

    :param df_cluster: dataframe with one row per unique record; must contain the columns named by
                       config['colname_id'], config['colname_clean_txt'] and
                       config['colname_dup_similar_id_col']
    :param config: settings dict, also passed on to `doc_clustering`
    :param preprocess: True  - run the spacy pipeline and cluster on the resulting lemma string
                       False - cluster on the clean-text column as it is
    :param vectorizer: vectorizer name passed to `doc_clustering` (e.g. 'tfidf' or 'bert')
    :return: dataframe with the columns [<colname_cluster_id>, <colname_id>, 'Text', 'Seed_Q1',
             'Seed_Q2', 'Dist_to_Center', 'Cohesion']; the two scores are rounded to 3 decimals
    :raises Exception: when fewer than 50 documents are available
    """
    start = time.time()

    # Config parameters to be used
    col_id = config.get('colname_id')
    col_clean_txt = config.get('colname_clean_txt')
    col_dup_sim_id = config.get('colname_dup_similar_id_col')
    col_cluster_id = config.get('colname_cluster_id')

    ################################################################################################
    # 1. INIT DOC CLUSTERING CLASS OBJECT
    ################################################################################################
    clustering = doc_clustering(vectorizer, config)

    ################################################################################################
    # 2. PREPARE DATA (Optional: preprocessing)
    ################################################################################################
    print("Uploading input file and lemmatizing...")
    # 'data' stores every info regarding each sentence, keyed by the dataframe index
    data = OrderedDict()
    for index, row in df_cluster.iterrows():
        uid, input_text, dup_similar_idx = row[col_id], row[col_clean_txt], row[col_dup_sim_id]

        # An empty text cell comes back from a CSV file as NaN (a float), not as ''.
        if not isinstance(input_text, str):
            input_text = '' if pd.isna(input_text) else str(input_text)
        if len(input_text.strip()) == 0:
            input_text = 'NONE'
        original_text = input_text.strip()

        if preprocess:
            # run spacy complete pipeline
            sent_tokens, sent_lemma, sent_verb, sent_dup = clustering.get_sent_lemma(original_text)
            input_text = str(sent_lemma)

        # After a CSV round trip the id group is the *string* form of a list/tuple; parse it back so
        # that "count" is the number of ids rather than the number of characters.
        if isinstance(dup_similar_idx, str):
            try:
                parsed_ids = ast.literal_eval(dup_similar_idx)
            except (ValueError, SyntaxError):
                parsed_ids = None
            if isinstance(parsed_ids, (list, tuple, set)):
                dup_similar_idx = parsed_ids

        # features
        upper_cnt = sum(1 for c in input_text if c.isupper())
        case_ratio = float(upper_cnt) / float(len(input_text)) if input_text else 0.0

        data[index] = {
            "sent_lemma": str(input_text),
            "count": len(dup_similar_idx),
            "case_ratio": case_ratio,
            "clusterid": "no",
            "keep": "yes",
            "select": "no",
            "index": index,
            "unique_id": uid,
            "sent_raw": original_text,
        }

    ################################################################################################
    # 3. PREPARE CORPUS
    ################################################################################################
    print("Collecting text corpus")
    documents = []
    main_docid_id_map = {}  # position in `documents` -> key in `data`
    for sid, record in data.items():
        if record["count"] >= 0:
            main_docid_id_map[len(documents)] = sid
            documents.append(record['sent_lemma'])
    if len(documents) < 50:
        raise Exception("The file contains sentences less than 50! Please check  and run again!")
    print('>> Total input sentences = ', len(documents))

    ################################################################################################
    # 4. 1st LEVEL CLUSTERING
    ################################################################################################
    print("\nPerforming 1st level clustering")

    # init, using heursitics to find value of k
    cluster_k = 0

    clustering.clustering_type = "noun"
    model, cluster_labels, cluster_centers, X = clustering.run_doc_clustering(documents, cluster_k, run_analysis=False)

    ################################################################################################
    # 5. COLLECT CLUSTERED IDs
    ################################################################################################
    print("\nCollecting clustered ids")

    # add first level cluster-id to data dict --> cluster_id : [Q_id1, Q_id2, ..., Q_idn]
    cid_sid_counter = {}
    for doc_pos, cid in enumerate(cluster_labels):
        main_sid = main_docid_id_map[doc_pos]
        data[main_sid]["clusterid"] = str(format(cid, "05d"))
        cid_sid_counter.setdefault(cid, []).append(main_sid)

    ################################################################################################
    # 6. CHECK REQUIREMENT FOR 2nd LEVEL CLUSTERING
    ################################################################################################
    print("\nFine-tuning clusters: Checking if 2nd level clustering is required...")

    cohesionThreshold = clustering.cohesion_threshold
    clusteringMaxLen = clustering.cluster_length
    print('cohesionThreshold is set to: ', cohesionThreshold)

    for cid in sorted(cid_sid_counter.keys()):
        member_sids = cid_sid_counter[cid]

        # --> documents under cluster_id = 'cid'
        documents_sub = [data[sid]['sent_lemma'] for sid in member_sids]
        if not (len(documents_sub) > clusteringMaxLen):
            continue  # small clusters are never split

        cohesion_score = cluster_cohesion_cosine(clustering, member_sids, data)
        print('cid -- cohesion_score = ', cid, cohesion_score)

        # RE-CLUSTERING   (to use cohesion value for basis of reclustering)
        if not (cohesion_score < cohesionThreshold):
            continue  # cohesive enough (an undefined score is left alone too)

        print('**re-clustering: cluster_id={}; member count={}; cohesion={}'.format(cid, len(documents_sub), cohesion_score))
        model_sub, cluster_labels_sub, cluster_centre_sub, X_sub = clustering.run_doc_clustering(documents_sub, cluster_k)

        # add 2nd level cluster-id to data dict
        for sub_pos, cid_sub in enumerate(cluster_labels_sub):
            sid_index = member_sids[sub_pos]
            data[sid_index]['clusterid'] = data[sid_index]['clusterid'] + "_" + str(format(cid_sub, "03d"))

    # final membership: (possibly 2-level) cluster id -> [data keys]
    cluster_set_by_sid = {}
    for sid, record in data.items():
        if record['clusterid'] != "no":
            cluster_set_by_sid.setdefault(record['clusterid'], []).append(sid)

    ################################################################################################
    # 7. FILTER, FIND INSIGHTS & SAVE
    ################################################################################################
    print("\nFiltering, getting insights and saving output")

    T = 0.0  # recommended: 0.4 ; clusters with cohesion below T are dissolved into singletons
    cid_index = 0
    cluster_set_by_sid_filtered = {}
    cluster_set_by_sid_filtered_coh_score = {}
    for cid in list(cluster_set_by_sid.keys()):
        member_sids = cluster_set_by_sid[cid]
        if len(member_sids) > 1:
            cohesion_score = cluster_cohesion_cosine(clustering, member_sids, data)
            if cohesion_score < T:
                for s in member_sids:
                    cluster_set_by_sid_filtered[cid_index] = [s]
                    # singletons get cohesion 0, like every other one-member cluster
                    # (this entry was missing before and caused a KeyError further down)
                    cluster_set_by_sid_filtered_coh_score[cid_index] = 0
                    cid_index += 1
            else:
                cluster_set_by_sid_filtered[cid_index] = member_sids
                cluster_set_by_sid_filtered_coh_score[cid_index] = cohesion_score
                cid_index += 1
        else:
            cluster_set_by_sid_filtered[cid_index] = member_sids
            cluster_set_by_sid_filtered_coh_score[cid_index] = 0
            cid_index += 1

    # Output rows are collected as records and turned into a dataframe in one go. Formatting them
    # into a tab-separated string and parsing it back breaks on quotes / tabs in the text and
    # lets the CSV parser re-interpret ids and texts.
    columns = [col_cluster_id, col_id, "Text", "Seed_Q1", "Seed_Q2", "Dist_to_Center", "Cohesion"]
    records = []
    for cid, member_sids in cluster_set_by_sid_filtered.items():
        if len(member_sids) > 1:
            seedid1, seedid2, values, values_sid = cluster_center_sent(clustering, member_sids, data)
        else:
            seedid1 = seedid2 = member_sids[0]
            values, values_sid = [1], [seedid1]

        seed_q1 = data[seedid1]["sent_raw"]
        seed_q2 = data[seedid2]["sent_raw"]
        ch_score = cluster_set_by_sid_filtered_coh_score[cid]

        # similarity to the cluster center per member (first occurrence wins)
        similarity_by_sid = {}
        for member, similarity in zip(values_sid, values):
            similarity_by_sid.setdefault(member, similarity)

        for sid in member_sids:
            if data[sid]['count'] == 0:
                continue
            dist_to_center = 1.0 - similarity_by_sid[sid]
            records.append((cid,
                            data[sid]["unique_id"],
                            data[sid]['sent_raw'],
                            seed_q1,
                            seed_q2,
                            round(float(dist_to_center), 3),
                            round(float(ch_score), 3)))

    clustered_output = pd.DataFrame(records, columns=columns)
    print("\nClustering process finished! Time taken(s) =", (time.time()-start))

    return clustered_output



## SAMPLE EXECUTION

# clustered_output = run_clustering(df_cluster, spacy_model, preprocess=False, vectorizer=['tfidf', 'bert'])

##### Run

In [399]:
# EXECUTE CLUSTERING
# Runs `run_clustering` on `df_cluster` with the notebook-level `config`, using
# TF-IDF vectors (vectorizer='tfidf') and preprocess=False (presumably because the
# text was already preprocessed in an earlier section -- confirm).
# The returned DataFrame has one row per clustered text, with the cluster-id and
# record-id columns plus Text, Seed_Q1, Seed_Q2, Dist_to_Center and Cohesion.

clustered_output = run_clustering(df_cluster, config, preprocess=False, vectorizer='tfidf')

Custom Synonyms Mapping (on/off):  False
Uploading input file and lemmatizing...
Collecting text corpus
>> Total input sentences =  456

Performing 1st level clustering
Vectorizing using...: tfidf records:  456
Vectorization complete!
Heusristic value of K: true_k= 75
Clustering is complete.

Collecting clustered ids

Fine-tuning clusters: Checking if 2nd level clustering is required...
cohesionThreshold is set to:  0.7

Filtering, getting insights and saving output
Vectorizing using...: tfidf records:  7
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  6
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  5
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  5
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  12
Vect

Clustering is complete.
Vectorizing using...: tfidf records:  2
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  5
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  4
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  7
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  8
Vectorization complete!
Heusristic value of K: true_k= 10
Clustering is complete.
Vectorizing using...: tfidf records:  12
Vectorization complete!
Heusristic value of K: true_k= 10
Clustering is complete.
Vectorizing using...: tfidf records:  2
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  5
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vecto

Clustering is complete.
Vectorizing using...: tfidf records:  4
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  9
Vectorization complete!
Heusristic value of K: true_k= 10
Clustering is complete.
Vectorizing using...: tfidf records:  9
Vectorization complete!
Heusristic value of K: true_k= 10
Clustering is complete.
Vectorizing using...: tfidf records:  5
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  4
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  2
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  2
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vectorizing using...: tfidf records:  5
Vectorization complete!
Heusristic value of K: true_k= 5
Clustering is complete.
Vector

- Involves 2 layers of clustering: an initial pass with a heuristic value of k, and a 2nd pass that fine-tunes any cluster having more than 'CLUSTER_LEN' members.

In [400]:
# Inspect the clustering result; as the cell's last expression it is rendered
# with the DataFrame's rich display.
clustered_output

Unnamed: 0,cluster_id,UID,Text,Seed_Q1,Seed_Q2,Dist_to_Center,Cohesion
0,0,uidx_0,campus dining location close today jan 1 happy...,dining update seec cafe starbucks close begin ...,dining update begin today march 16 follow dini...,0.531,0.502
1,0,uidx_235,move teach remote work begin monday begin,dining update seec cafe starbucks close begin ...,dining update begin today march 16 follow dini...,0.651,0.502
2,0,uidx_293,dining update order maintain health safety our...,dining update seec cafe starbucks close begin ...,dining update begin today march 16 follow dini...,0.491,0.502
3,0,uidx_311,dining update begin today march 16 follow dini...,dining update seec cafe starbucks close begin ...,dining update begin today march 16 follow dini...,0.416,0.502
4,0,uidx_323,dining update seec cafe starbucks close begin ...,dining update seec cafe starbucks close begin ...,dining update begin today march 16 follow dini...,0.371,0.502
...,...,...,...,...,...,...,...
451,73,uidx_309,stay date late dining option campus,stay date late dining option campus,sure stay date campus dining option via,0.226,0.616
452,73,uidx_312,ve enter next phase campus effectively close r...,stay date late dining option campus,sure stay date campus dining option via,0.490,0.616
453,73,uidx_365,colorado find 2nd well prepared state nation c...,stay date late dining option campus,sure stay date campus dining option via,0.538,0.616
454,74,uidx_466,stand colleague americas lead research univers...,stand colleague americas lead research univers...,see woman golf coach anne kelly stand colleagu...,0.065,0.935


## 10. Create Final Matrix: Merge {original data + df_cluster + clustered_output}

In [408]:
def create_final_output(df, df_cluster, clustered_output, cfg=None):
    """Attach cluster assignments and per-cluster insights to the original records.

    The cluster information in `clustered_output` is first joined to
    `df_cluster` on the record-id column, which yields a mapping from the
    duplicate/similar-group key to its cluster info. That mapping is then
    left-joined onto the full, un-deduplicated `df`, so every original record
    inherits the cluster of its group.

    Parameters
    ----------
    df : pandas.DataFrame
        Original data. Must contain the duplicate/similar-group key column and
        the duplicate-cluster-id column (the latter is dropped from the result).
    df_cluster : pandas.DataFrame
        The frame that was clustered. Must contain the record-id column and the
        duplicate/similar-group key column.
    clustered_output : pandas.DataFrame
        Clustering result. Must contain the record-id and cluster-id columns
        plus 'Seed_Q1', 'Seed_Q2', 'Dist_to_Center' and 'Cohesion'.
    cfg : dict, optional
        Column-name configuration with the keys 'colname_id',
        'colname_dup_similar_id_col', 'colname_freq', 'colname_coverage',
        'colname_duplicate_cluster_id' and 'colname_cluster_id'.
        Defaults to the notebook-level `config`.

    Returns
    -------
    pandas.DataFrame
        `df` with the cluster-id column (int, -1 where no cluster was found),
        the seed/distance/cohesion columns, a frequency column (number of
        records sharing the cluster id) and a coverage column (frequency as a
        percentage of all records).

    Notes
    -----
    This function does not write anything to disk; persisting the result is
    left to the caller.
    """
    if cfg is None:
        cfg = config  # fall back to the notebook-level configuration

    col_id = cfg['colname_id']
    col_dup_sim_id = cfg['colname_dup_similar_id_col']
    col_cluster_freq = cfg['colname_freq']
    col_cluster_coverage = cfg['colname_coverage']
    col_dup_cluster_ids = cfg['colname_duplicate_cluster_id']
    col_cluster_id = cfg['colname_cluster_id']
    cluster_info_cols = ['Seed_Q1', 'Seed_Q2', 'Dist_to_Center', 'Cohesion']

    # 1. MERGE 'clustered_output' WITH 'df_cluster' on UNIQUE_ID TO CREATE a 'temp'
    #    --> maps 'dup_similar_idx' with 'cluster_id' i.e. {dup_similar_idx: Cluster_Info}
    #
    temp = df_cluster[[col_id, col_dup_sim_id]].merge(
        clustered_output[[col_id, col_cluster_id] + cluster_info_cols],
        on=col_id, how='left')

    # 2. MERGE 'temp' WITH ORIGINAL DATA (without any drops) 'df'
    #    --> maps cluster_info to original records using on='dup_similar_idx'
    #
    # NOTE(review): assumes each group key occurs once in `df_cluster`; repeated
    # keys would duplicate rows of `df` in this left merge -- confirm upstream.
    OUTPUT_DF = df.merge(
        temp[[col_dup_sim_id, col_cluster_id] + cluster_info_cols],
        on=col_dup_sim_id, how='left').drop(columns=[col_dup_cluster_ids])

    # 3. GENERATE INSIGHTS (Frequency, Coverage)
    #
    #  Frequency = how frequent is the Text_i across all Texts
    #  Coverage  = % of frequency, i.e. Count of Text_i(including duplicates) / total count of all Texts
    #
    # Records without a cluster get the sentinel id -1 so the column can be int.
    OUTPUT_DF[col_cluster_id] = OUTPUT_DF[col_cluster_id].fillna(-1).astype(int)
    cid_count = Counter(OUTPUT_DF[col_cluster_id])
    total_records = sum(cid_count.values())  # computed once, not once per row
    OUTPUT_DF[col_cluster_freq] = OUTPUT_DF[col_cluster_id].map(cid_count)
    OUTPUT_DF[col_cluster_coverage] = OUTPUT_DF[col_cluster_freq] * 100.0 / total_records

    # The -1 sentinel marks unassigned records and is not a real cluster.
    n_clusters = sum(1 for cid in cid_count if cid != -1)
    print("Total utterances:", len(df))
    print("Clusters generated:", n_clusters)
    return OUTPUT_DF

In [414]:
# Merge the cluster assignments back onto the original records, save the result
# as "tweets_final_output.csv" under `data_dir` (without the index column), and
# preview the first three rows.
final_df = create_final_output(df, df_cluster, clustered_output)
final_df.to_csv(os.path.join(data_dir, "tweets_final_output.csv"), index=False)
final_df.head(3)

Total utterances: 500
Clusters generated: 75


Unnamed: 0,UID,tweet,label,Processed_tweet,lang_mask,dup_idx,similar_idx,dup_similar_idx,cluster_id,Seed_Q1,Seed_Q2,Dist_to_Center,Cohesion,memberCount,coverage
0,uidx_0,"All campus dining locations are closed today, ...",cu_others,campus dining location close today jan 1 happy...,en,['uidx_0'],['uidx_0'],"('uidx_0',)",0,dining update seec cafe starbucks close begin ...,dining update begin today march 16 follow dini...,0.531,0.502,7,1.4
1,uidx_1,"What are your campus dining options today, Jan...",cu_others,what your campus dining option today jan 2 alf...,en,['uidx_1'],"['uidx_1', 'uidx_3']","('uidx_1', 'uidx_3')",1,dining update umc market starbucks alferd pack...,dining update alfred packer grill starbucks um...,0.461,0.55,7,1.4
2,uidx_2,#FPGA Design for #Embedded #Systems\n\n#SoC #V...,cu_online,fpga design embed system soc verilog vlsi asic...,en,['uidx_2'],['uidx_2'],"('uidx_2',)",2,conjunction team visit samuel jackman prescod ...,buff take barbados team feature alongside thei...,0.54,0.492,5,1.0


## 11. Metrics and insights

In [415]:
# # Summarized cluster Output showing N members in a list
# DISPLAY_CLUSTER_MEMBER_COUNT = 30
# visual = OUTPUT_DF.sort_values(by=[source_coverage_col, 'Dist_to_Center'], ascending=[False, True]).reset_index(drop=True)
# visual_at_glance = visual.groupby([source_clustering_col]).agg({source_text_col: lambda x: list(x)[:DISPLAY_CLUSTER_MEMBER_COUNT], 'Seed_Q1': 'max', source_freq_col: 'max', source_coverage_col: 'max', 'Cohesion': 'max'}).reset_index()
# visual_at_glance.rename(columns={source_text_col: f'List of {DISPLAY_CLUSTER_MEMBER_COUNT} {source_text_col}', 'Seed_Q1': "Cluster Representative"}, inplace=True)

# # Top N Clusters
# DISPLAY_N_CLUSTERS = 20
# top_N_cluster_ids = list(visual[source_clustering_col].unique()[:DISPLAY_N_CLUSTERS])
# top_N_clusters = visual_at_glance[visual_at_glance[source_clustering_col].isin(top_N_cluster_ids)].sort_values(by=[source_coverage_col], ascending=False).reset_index(drop=True)
# top_N_clusters.rename(columns={source_text_col: f'List of {DISPLAY_CLUSTER_MEMBER_COUNT} {source_text_col}', 'Seed_Q1': "Cluster Representative"}, inplace=True)

In [416]:
# # Prepare Top N Clusters to be visualized!

# top_N_clusters['DESC'] = top_N_clusters.apply(lambda row: "Cluster Description:<br>Cluster ID = {}<br><br>Few Members = {}<br>Total Member count = {}<br>Coverage = {}<br>Cohesion/Tightness = {}".format(row[source_clustering_col],
#                                                                                                         row[f'List of {DISPLAY_CLUSTER_MEMBER_COUNT} {source_text_col}'][:5],
#                                                                                                         row[source_freq_col],
#                                                                                                         f'{row[source_coverage_col]:.3f}',
#                                                                                                         f'{row["Cohesion"]:.3f}'), axis=1)
# import random
# top_N_clusters['COLOR'] = top_N_clusters[source_clustering_col].apply(lambda x: "#%02X%02X%02X" % (random.randint(0,255), random.randint(0,255), random.randint(0,255)))

In [417]:
# fig = go.Figure(data=go.Scattergl(

#     x = top_N_clusters[source_coverage_col],
#     y = top_N_clusters['Cohesion'],

#     mode="markers",
#     marker=dict(
#         size=top_N_clusters[source_coverage_col]/0.1,
#         color=top_N_clusters['COLOR'],
#         colorscale='Viridis',
#         colorbar=dict(title='Cluster_score'),
#         line_width=1,
#         showscale=False),

#     # Hover
#     text = top_N_clusters[source_coverage_col],  # Mention Hover values here & formatting in hovertemplate
#     hoverlabel=dict(bgcolor="black", font_size=12, font_family="Roboto"),
#     hovertemplate=top_N_clusters['DESC']
    
# #     hovertemplate = "<b>Cluster ID = %{text}</b><br>" +
# #                     "<i>Few Members:</i>: %{members}<br>" +
# #                     "<i>Cohesion</i>: %{y:.2f}<br>" +
# #                     "<i>Cohesion</i>: %{y:.2f}<br>" +
# #                     "<i>Coverage</i>: %{x:.2f}<br>"
# ))

# fig.update_layout(autosize=True,
#                    title_text='Cluster map showing weighted-mean of Cohesion-Coverage values',
#                    xaxis=dict(title="Coverage %"),
#                    yaxis=dict(title="Cohesion/Tightness"),
#                    showlegend=False) #width=900, height=700

# fig.show()
# # save
# offline.plot(fig, filename=os.path.join(output_fp, "Plot_CohesionVsCoverage%_for_topN_cluster.html"))

In [418]:
# fig = px.pie(top_N_clusters, values='COVERAGE', names='DESC')
# fig.update_traces(textposition='outside')
# fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
# fig.update_layout(autosize=True,
#                   title_text=f'Relative Coverage % for top {DISPLAY_N_CLUSTERS} clusters',
#                   xaxis=dict(title="Coverage %"),
#                   yaxis=dict(title="Cohesion/Tightness"),
#                   showlegend=False)
# fig.show()
# # save
# offline.plot(fig, filename=os.path.join(output_fp, "Relative_Coverage_%_for_topN_cluster.html"))

In [419]:
# fig = px.bar(top_N_clusters, x=top_N_clusters['Cluster_ID'].apply(lambda x: f"Cluster_{x}"), y='COVERAGE', color='COLOR', 
#              hover_data=dict(hovertemplate=top_N_clusters['DESC']))

# fig.update_layout(title_text='Cluster map showing different clusters with their relative coverage %',
#                   xaxis=dict(title="Cluster IDs"),
#                   yaxis=dict(title="Coverage %"),
#                   showlegend=False)

# fig.show()
# # save
# offline.plot(fig, filename=os.path.join(output_fp, "Coverage%_for_topN_cluster.html"))

----