# NLP Cleansing, Vectorization and Similarity Computation

- Timestamp: July, 2021
- Author: Pranjal Pathak

## Contents

1. Preprocessing
2. Vectorization using sBert
3. Text Similarity
4. Example

**Techniques tested:**

Standard Methods

------------------------------------------------------------------------------------------
1. Bilingual Evaluation Understudy (BLEU)
2. Levenshtein Distance
3. TFIDF + Euclidean Distance
4. TFIDF + Cosine Distance
5. TFIDF + Jaccard Distance
6. Embeddings + Cosine Distance
7. Embeddings + Smooth Inverse Frequency (SIF) + Cosine Distance
8. Spacy Vectorizer + Smooth Inverse Frequency (SIF) + Cosine Distance
9. Embeddings + Word Movers Distance (WMD)

Advanced Methods

------------------------------------------------------------------------------------------
10. LDA + Jensen-Shannon distance
11. Trained Attention LSTM Inception AutoEncoder Network Embeddings + Cosine Distance (**3rd BEST**)
12. Finetuned Universal Sentence Encoder(USE) + Cosine Distance (**2nd BEST**)
13. Fine-tuned Ultra-Fast Sentence Bert (sBERT) + Cosine Distance (**BEST**)
14. Siamese Deep Neural Network (Pending)


Future Scope

------------------------------------------------------------------------------------------

15. Siamese Manhattan LSTM + Manhattan Similarity
16. ELMo Embeddings + Cosine Similarity
17. BERT Embeddings + Cosine Similarity
18. FastText embeddings (trained on 3DR) + Attention LSTM AutoEncoder (trained on 3DR) + Cosine Similarity
19. FastText embeddings (trained on 3DR) + Attention LSTM AutoEncoder (trained on Utterances) + Cosine Similarity
20. FastText embeddings (trained on Utterances) + Attention LSTM AutoEncoder (trained on 3DR) + Cosine Similarity
21. FastText embeddings (trained on Utterances) + Attention LSTM AutoEncoder (trained on Utterances) + Cosine Similarity

## `Instructions`

1. Get Python >= 3.6.0
2. Create a virtual env and install requirements.txt
3. Get Started!

## Imports

In [14]:
'''Update code from Python 3.6.10 to a stable Kernel Python Version 3.8.0 '''

# Standard libs
import os
import sys
import json
import warnings
import re
import io
from io import StringIO
import inspect
import shutil
import ast
import string
import time
import pickle
import glob
import traceback
import multiprocessing
import requests
import logging
import math
import pytz
from itertools import chain
from string import Template
from datetime import datetime, timedelta
from dateutil import parser
import base64
from collections import defaultdict, Counter, OrderedDict
from contextlib import contextmanager
import unicodedata
from functools import reduce
import itertools
import tempfile
from typing import Any, Dict, List, Callable, Optional, Tuple, NamedTuple, Union
from functools import wraps

# graph
import networkx as nx

# Required pkgs
import numpy as np
from numpy import array, argmax
import pandas as pd
import ntpath
import tqdm

# General text correction - fit text for you (ftfy) and others
import ftfy
from fuzzywuzzy import fuzz
#from wordcloud import WordCloud
from spellchecker import SpellChecker

# imbalanced-learn
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN

# scikit-learn
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, jaccard_score, silhouette_score, homogeneity_score, calinski_harabasz_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.base import BaseEstimator, TransformerMixin

# scipy
from scipy import spatial, sparse
from scipy.sparse import coo_matrix, vstack, hstack
from scipy.spatial.distance import euclidean, jensenshannon, cosine, cdist
from scipy.io import mmwrite, mmread
from scipy.stats import entropy
from scipy.cluster.hierarchy import dendrogram, ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components

# sparse_dot_topn: matrix multiplier
from sparse_dot_topn import awesome_cossim_topn
import sparse_dot_topn.sparse_dot_topn as ct

# Gensim
import gensim
from gensim.models import Phrases, Word2Vec, KeyedVectors, FastText, LdaModel
from gensim import utils
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim import models, corpora, similarities

# NLTK
import nltk
#nltk_model_data_path = "/someppath/"
#nltk.data.path.append(nltk_model_data_path)
from nltk import FreqDist, tokenize, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.translate.bleu_score import sentence_bleu
print("NLTK loaded.")

# Spacy
import spacy
# spacy_model_data_path = "/Users/pranjalpathak/opt/anaconda3/envs/Python_3.6/lib/python3.6/site-packages/en_core_web_lg/en_core_web_lg-2.2.5"
nlp = spacy.load('en_core_web_lg')  # disabling: nlp = spacy.load(spacy_data_path, disable=['ner'])
from spacy import displacy
from spacy.matcher import Matcher
from spacy.lang.en import English
print("Spacy loaded.")

# TF & Keras
import tensorflow as tf
from keras import backend as K
from keras.layers import *
from tensorflow.keras.layers import Layer, InputSpec, BatchNormalization, Embedding, LSTM, Dense, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import pad_sequences, CustomObjectScope
from keras.utils.np_utils import to_categorical
from keras import initializers as initializers, regularizers, constraints, optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.models import Sequential, Model, load_model
import tensorflow_hub as hub
print("TensorFlow loaded.")

# Pytorch
import torch
from torch import optim, nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from transformers import pipeline
from transformers import AutoModel
print("PyTorch loaded.")

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly import offline
%matplotlib inline

# Theme settings
pd.set_option("display.max_columns", 80)
sns.set_context('talk')
sns.set(rc={'figure.figsize':(15,10)})
sns.set_style("darkgrid")
warnings.filterwarnings('ignore')

  from scipy.sparse.csr import csr_matrix
  from scipy.sparse.lil import lil_matrix


NLTK loaded.
Spacy loaded.
TensorFlow loaded.
PyTorch loaded.


## Directory Setup

In [35]:
# common NLP resources
resources_dir_path = "./data/resources/"

# embedding models
pretrained_glove_fp = "./models/pretrained/glove/glove.6b.100d/glove.6B.100d.vec"
pretrained_google_fp = "./models/pretrained/google/GoogleNews-vectors-negative300.bin"
pretrained_fasttext_fp = "./models/pretrained/fasttext/cc.en.300.bin"
trained_word2vec_fp = "./models/trained/word2vec"
trained_doc2vec_fp = "./models/trained/doc2vec"
trained_wmd_fp = "./models/trained/wordmoversdistance"
trained_word2vec_glove_fp = "./models/trained/word2vec_glove"
trained_word2vec_google_fp = "./models/trained/word2vec_google"
trained_fasttext_fp = "./models/trained/fasttext"
tuned_use_fp = "./models/trained/USE-Model-Large"
tuned_sbert_fp = "./models/trained/sentence-transformers-models/all-distilroberta-v1"

## Preprocessing Unit (Unit 1/3)

In [20]:
class preprocessText:
    
    def __init__(self, resources_dir_path, custom_vocab=[], do_lemma=False):
        self.stopwords_file = os.path.join(resources_dir_path, "stopwords.txt")
        self.special_stopwords_file = os.path.join(resources_dir_path, "special_stopwords.txt")
        self.special_characters_file = os.path.join(resources_dir_path, "special_characters.txt")
        self.contractions_file = os.path.join(resources_dir_path, "contractions.json")
        self.chatwords_file = os.path.join(resources_dir_path, "chatwords.txt")
        self.emoticons_file = os.path.join(resources_dir_path, "emoticons.json")
        self.greeting_file = os.path.join(resources_dir_path, "greeting_words.txt")
        self.signature_file = os.path.join(resources_dir_path, "signature_words.txt")
        self.preserve_key = "<$>" # preserve special vocab
        self.vocab_list = custom_vocab
        self.preseve = True if len(custom_vocab) > 0 else False
        self.load_resources()
        self.do_lemma = do_lemma
        return
    
    def load_resources(self):
        
        ### Build Vocab Model --> Words to keep
        self.vocab_list = set(map(str.lower, self.vocab_list))
        self.vocab_dict = {w: self.preserve_key.join(w.split()) for w in self.vocab_list}
        self.re_retain_words = re.compile('|'.join(sorted(map(re.escape, self.vocab_dict), key=len, reverse=True)))
        
        ### Build Stopwords Model --> Words to drop/delete
        with open(self.stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords = [x.rstrip() for x in f.readlines()]
        with open(self.special_stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        with open(self.special_characters_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        self.stopwords = list(sorted(set(self.stopwords).difference(self.vocab_list)))

        ### Build Contractions
        with open(self.contractions_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.contractions = dict(json.load(f))
        
        ### Build Chat-words
        with open(self.chatwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.chat_words_map_dict, self.chat_words_list = {}, []
            chat_words = [x.rstrip() for x in f.readlines()]
            for line in chat_words:
                cw = line.split("=")[0]
                cw_expanded = line.split("=")[1]
                self.chat_words_list.append(cw)
                self.chat_words_map_dict[cw] = cw_expanded
            self.chat_words_list = set(self.chat_words_list)
        
        ### Bukd social markups
        # emoticons
        with open(self.emoticons_file, "r") as f:
            self.emoticons = re.compile(u'(' + u'|'.join(k for k in json.load(f)) + u')')
        # emojis
        self.emojis = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        # greeting
        with open(self.greeting_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.greeting_words = [x.rstrip() for x in f.readlines()]
        # signature
        with open(self.signature_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.signature_words = [x.rstrip() for x in f.readlines()]
        # spell-corrector
        self.spell_checker = SpellChecker()   
        return
    
    
    def reserve_keywords_from_cleaning(self, text, reset=False):
        """ 
        Finds common words from a user-provided list of special keywords to preserve them from 
        cleaning steps. Identifies every special keyword and joins them using `self.preserve_key` during the 
        cleaning steps, and later resets it back to original word in the end.
        """
        if reset is False:
            # compile using a dict of words and their expansions, and sub them if found!
            match_and_sub = self.re_retain_words.sub(lambda x: self.vocab_dict[x.string[x.start():x.end()]], text)
            return re.sub(r"([\s\n\t\r]+)", " ", match_and_sub).strip()
        else:
            # reverse the change! - use this at the end of preprocessing
            text = text.replace(self.preserve_key, " ")
            return re.sub(r"([\s\n\t\r]+)", " ", text).strip()


    def basic_clean(self, input_sentences):
        cleaned_sentences = []
        for sent in input_sentences:
            sent = str(sent).strip()
            # FIX text
            sent = ftfy.fix_text(sent)
            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)
            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)
            # Removing web urls
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0–9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)
            # Removing date formats
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)
            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).strip()
            cleaned_sentences.append(sent)
        return cleaned_sentences


    def deep_clean(self, input_sentences):
        cleaned_sentences = []
        for sent in input_sentences:
            # normalize text to "utf-8" encoding
            sent = unicodedata.normalize('NFKD', str(sent)).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # lowercasing
            sent = str(sent).strip().lower()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            #
            # *** Mark important keywords such as: Domain specific, Question words(wh-words), etc, using 
            # "self.vocab_list". Words from this list if found in any input sentence shall be joined using 
            # a key (self.preserve_key) during pre-processing step, and later un-joined to retain them.
            #
            if self.preseve: 
                sent = self.reserve_keywords_from_cleaning(sent, reset=False)
            #
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            # remove Emojis
            sent = self.emojis.sub(r'', sent)
            # remove emoticons
            sent = self.emoticons.sub(r'', sent)
            # remove common chat-words
            sent = " ".join([self.chat_words_map_dict[w.upper()] if w.upper() in self.chat_words_list else w for w in sent.split()])
            # FIX text
            sent = ftfy.fix_text(sent)
            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)
            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)
            # Removing web urls
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0–9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)
            # Removing date formats
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)

            # <----------------------------- OPTIONAL CLEANING ----------------------------- >
            #
            # removing punctuations 🔥🔥
            # *** disable them, when sentence structure needs to be retained ***
            sent = re.sub(r"[\$|\#\@\*\%]+\d+[\$|\#\@\*\%]+", " ", sent)
            sent = re.sub(r"\'s", " \'s", sent)
            sent = re.sub(r"\'ve", " \'ve", sent)
            sent = re.sub(r"n\'t", " n\'t", sent)
            sent = re.sub(r"\'re", " \'re", sent)
            sent = re.sub(r"\'d", " \'d", sent)
            sent = re.sub(r"\'ll", " \'ll", sent)
            sent = re.sub(r"[\/,\@,\#,\\,\{,\},\(,\),\[,\],\$,\%,\^,\&,\*,\<,\>]", " ", sent)
            sent = re.sub(r"[\,,\;,\:,\-]", " ", sent)      # main puncts
            
            # remove sentence de-limitters 🔥🔥
            # *** disable them, when sentence boundary/ending is important ***
            # sent = re.sub(r"[\!,\?,\.]", " ", sent)

            # keep only text & numbers 🔥🔥
            # *** enable them, when only text and numbers matter! *** 
            # sent = re.sub(r"\s+", " ", re.sub(r"[\\|\/|\||\{|\}|\[|\]\(|\)]+", " ", re.sub(r"[^A-z0-9]", " ", str(sent))))
            
            # correct spelling mistakes 🔥🔥
            # *** enable them when english spelling mistakes matter *** 
            # sent = " ".join([self.spell_checker.correction(w) if w in self.spell_checker.unknown(sent.split()) else w for w in sent.split()])
            #
            # <----------------------------- OPTIONAL CLEANING ----------------------------- >

            # Remove stopwords
            sent = " ".join(token.text for token in nlp(sent) if token.text not in self.stopwords and 
                                                                 token.lemma_ not in self.stopwords)
            # Lemmatize
            if self.do_lemma:
                sent = " ".join(token.lemma_ for token in nlp(sent))
            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).lower().strip()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            #
            # *** Reverse the custom joining now to un-join the special words found!
            if self.preseve: 
                sent = self.reserve_keywords_from_cleaning(sent, reset=True)
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            cleaned_sentences.append(sent.strip().lower())
        return cleaned_sentences


    def spacy_get_pos_list(self, results):
        word_list, pos_list, lemma_list, ner_list, start_end_list = [], [], [], [], []
        indices = results['sentences']
        for line in indices:
            tokens = line['tokens']
            for token in tokens:
                # (1). save tokens
                word_list.append(token['word'])
                # (2). save pos
                pos_list.append(token['pos'])
                # (3). save lemmas
                lemma = token['lemma'].lower()
                if lemma in self.stopwords: continue
                lemma_list.append(lemma)
                # (4). save NER
                ner_list.append(token['ner'])
                # (5). save start
                start_end_list.append(str(token['characterOffsetBegin']) + "_" + str(token['characterOffsetEnd']))
        output = {"word_list": word_list, 
                  "lemma_list": lemma_list, 
                  "token_start_end_list": start_end_list,
                  "pos_list": pos_list, "ner_list": ner_list}
        return output

    def spacy_generate_features(self, doc, operations='tokenize,ssplit,pos,lemma,ner'):
        """
        Spacy nlp pipeline to generate features such as pos, tokens, ner, dependency. Accepts doc=nlp(text)
        """
        # spacy doc
        doc_json = doc.to_json()  # Includes all operations given by spacy pipeline

        # Get text
        text = doc_json['text']

        # ---------------------------------------- OPERATIONS  ---------------------------------------- #
        # 1. Extract Entity List
        entity_list = doc_json["ents"]

        # 2. Create token lib
        token_lib = {token["id"]: token for token in doc_json["tokens"]}

        # init output json
        output_json = {}
        output_json["sentences"] = []

        # Perform spacy operations on each sent in text
        for i, sentence in enumerate(doc_json["sents"]):
            # init parsers
            parse = ""
            basicDependencies = []
            enhancedDependencies = []
            enhancedPlusPlusDependencies = []

            # init output json
            out_sentence = {"index": i, "line": 1, "tokens": []}
            output_json["sentences"].append(out_sentence)

            # 3. Split sentences by indices(i), add labels (pos, ner, dep, etc.)
            for token in doc_json["tokens"]:

                if sentence["start"] <= token["start"] and token["end"] <= sentence["end"]:
                    
                    # >>> Extract Entity label
                    ner = "O"
                    for entity in entity_list:
                        if entity["start"] <= token["start"] and token["end"] <= entity["end"]:
                            ner = entity["label"]

                    # >>> Extract dependency info
                    dep = token["dep"]
                    governor = 0 if token["head"] == token["id"] else (token["head"] + 1)  # CoreNLP index = pipeline index +1
                    governorGloss = "ROOT" if token["head"] == token["id"] else text[token_lib[token["head"]]["start"]:
                                                                                     token_lib[token["head"]]["end"]]
                    dependent = token["id"] + 1
                    dependentGloss = text[token["start"]:token["end"]]

                    # >>> Extract lemma
                    lemma = doc[token["id"]].lemma_

                    # 4. Add dependencies
                    basicDependencies.append({"dep": dep,
                                              "governor": governor,
                                              "governorGloss": governorGloss,
                                              "dependent": dependent,
                                              "dependentGloss": dependentGloss})
                    # 5. Add tokens
                    out_token = {"index": token["id"] + 1,
                                 "word": dependentGloss,
                                 "originalText": dependentGloss,
                                 "characterOffsetBegin": token["start"],
                                 "characterOffsetEnd": token["end"]}

                    # 6. Add lemmas
                    if "lemma" in operations:
                        out_token["lemma"] = lemma

                    # 7. Add POS tagging
                    if "pos" in operations:
                        out_token["pos"] = token["tag"]

                    # 8. Add NER
                    if "ner" in operations:
                        out_token["ner"] = ner

                    # Update output json
                    out_sentence["tokens"].append(out_token)

            # 9. Add dependencies operation
            if "parse" in operations:
                out_sentence["parse"] = parse
                out_sentence["basicDependencies"] = basicDependencies
                out_sentence["enhancedDependencies"] = out_sentence["basicDependencies"]
                out_sentence["enhancedPlusPlusDependencies"] = out_sentence["basicDependencies"]
        # ---------------------------------------- OPERATIONS  ---------------------------------------- #
        return output_json
    
    def spacy_clean(self, input_sentences):
        batch_size = min(int(np.ceil(len(input_sentences)/100)), 500)
        
        # Part 1: generate spacy textual features (pos, ner, lemma, dependencies)
        sentences = [self.spacy_generate_features(doc) for doc in nlp.pipe(input_sentences, batch_size=batch_size, n_process=-1)]
        
        # Part 2: collect all the features for each sentence
        spacy_sentences = [self.spacy_get_pos_list(sent) for sent in sentences]

        return spacy_sentences


    ## MAIN ##
    def run_pipeline(self, sentences, operation):
        """
        Main module to execute pipeline. Accepts list of strings, and desired operation.
        """
        if operation=="":
            raise Exception("Please pass a cleaning type - `basic`, `deep` or `spacy` !!")

        # run basic cleaning
        if "basic" == operation.lower(): 
            return self.basic_clean(sentences)

        # run deep cleaning
        if "deep" == operation.lower(): 
            return self.deep_clean(sentences)

        # run spacy pipeline
        if "spacy" == operation.lower(): 
            return self.spacy_clean(sentences)

#### Execute

In [26]:
## settings ##

"""
CUSTOM VOCABULARY ::

- List of words you wish to mark and retain them across the preprocessing steps - very important!
- Example, task-specific, domain-specific keywords.

"""

custom_vocab = ["who", "what", "where", "when", "would", "which", "how", "why", "can", "may", 
                "will", "won't", "does", "does not","doesn't", "do", "do i", "do you", "is it", "would you", 
                "is there", "are there", "is it so", "is this true", "to know", "is that true", "are we", 
                "am i", "question is", "can i", "can we", "tell me", "can you explain", "how ain't", 
                "question", "answer", "questions", "answers", "ask", "can you tell"]


"""
Utilities:
- Truncate words to their root-known-word form, stripping off their adjectives, verbs, etc. (Example: "running" becomes "run", "is" becomes "be")
- different from stemmer (PorterStemmer)
- Can use regex based stemming..
- Check Spacy's dependency parsing

"""

do_lemmatizing = True
#do_chinking = False
#do_chunking = False
#do_dependencyParser = False

In [27]:
## Preprocessing ##

preprocessText_obj = preprocessText(resources_dir_path, custom_vocab, do_lemmatizing)

def cleaning(data, text_col):
    data["Basic_%s" % text_col] = preprocessText_obj.run_pipeline(data[text_col], "basic")
    data["Deep_%s" % text_col] = preprocessText_obj.run_pipeline(data[text_col], "deep")
    data["Spacy_%s" % text_col] = preprocessText_obj.run_pipeline(data[text_col], "spacy")
    return data


## SAMPLE
# df = cleaning(df, <_TEXT_COLUMN_>)

In [28]:
## Execute ##

df = pd.DataFrame({"TEXT": ['hello i am good', "hello bye!"]})
cleaning(df, "TEXT")

Unnamed: 0,TEXT,Basic_TEXT,Deep_TEXT,Spacy_TEXT
0,hello i am good,hello i am good,hello good,"{'word_list': ['hello', 'i', 'am', 'good'], 'l..."
1,hello bye!,hello bye!,hello bye,"{'word_list': ['hello', 'bye', '!'], 'lemma_li..."


---
---

##  Vectorization Unit (Unit 2/3)

- Loads trained/pretrained embeddings models.
- Vectorizes text using user defiend model.
- Pre-requsite: need to have pre-trained/trained/tuned embedding models in ./models/ directory before running this.

In [32]:
## define a list of vectorizers to be used for task at hand

VECTORIZER_LIST = ['count', 'tfidf', 'word2vec', 'fasttext', 
                   'home_trained_word2vec', 'home_trained_fasttext', 
                   'pretrained_glove', 'pretrained_google', 'pretrained_fasttext', 
                   'word2vec_glove', 'word2vec_google', 
                   'USE', 'BERT']

In [33]:
class Embedding:
    """
    Used to load models and then vectorize text using a passed choice of vectorizer - 'source'.
    """

    def __init__(self):
        self.model = {}
        self.modelpath = {}
        self.vectorizer = {}
        self.vectorizer_list = VECTORIZER_LIST

    def convert(self, source, input_fp, output_fp):
        """
        If need to convert GLOVE text file into vector format.
        e.g. convert(source='glove', input_fp=downloaded_glove_file_path, output_fp=save_file_path)
        """
        if source == 'glove':
            input_file, output_file = datapath(input_fp), get_tmpfile(output_fp)
            glove2word2vec(input_file, output_file)
        else:
            raise ValueError('ERROR :: You can convert only glove text file!')

    def load(self, source, fp):
        """
        Loads a trained vectorizer model from a given file-path.
        """
        
        if source not in self.vectorizer_list or not fp:
            raise ValueError('ERROR :: Pass a eligible source from VECTORIZER_LIST options provided!')
        
        if source in ['count', 'tfidf', 'word2vec', 'fasttext']:
            # no trained model for these
            return
       
        if source == 'home_trained_word2vec':
            self.model[source] = Word2Vec.load(fp)
            self.model[source].init_sims(replace=True)
        
        elif source == 'home_trained_fasttext':
            self.model[source] = FastText.load(fp)
            self.model[source].init_sims(replace=True)
        
        elif source == 'pretrained_glove':
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(fp, unicode_errors='ignore')
        
        elif source == 'pretrained_google':
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(fp, binary=True, unicode_errors='ignore')
        
        elif source == 'pretrained_fasttext':
            self.model[source] = gensim.models.wrappers.FastText.load_fasttext_format(fp)
        
        elif source == 'word2vec_glove':
            self.model[source] = Word2Vec.load(fp)
            self.model[source].init_sims(replace=True)
        
        elif source == 'word2vec_google':
            self.model[source] = Word2Vec.load(fp)
            self.model[source].init_sims(replace=True)
        
        elif source == 'USE':
            self.model[source] = hub.KerasLayer(fp, trainable=True)
            
        elif source == 'BERT':
            self.model[source] = {}
            self.model[source]['tokenizer'] = AutoTokenizer.from_pretrained(fp)
            self.model[source]['model'] = AutoModel.from_pretrained(fp)
            self.model[source]['max_length'] = 128
            self.model[source]['embedding_func'] = lambda x: x[0][:, 0, :].squeeze()

        else:
            raise Exception('ERROR :: Incorrect source passed!')

        self.modelpath[source] = fp
        print(source, " - loaded.")
        return

    def get_model(self, source):
        if source not in self.vectorizer_list:
            raise ValueError('Possible value of source are:{}'.format(",".join(self.vectorizer_list)))
        return self.model[source]

    def get_model_path(self, source):
        if source not in self.vectorizer_list:
            raise ValueError('Possible value of source are:{}'.format(",".join(self.vectorizer_list)))
        return self.modelpath[source]

    def get_words(self, source, size=None):
        if source in self.trained_embed_models:
            if size is None:
                return [w for w in self.get_model(source=source).vocab]
            elif size is None:
                return [w for w in self.get_model(source=source).vocab]
            else:
                results = []
                for i, word in enumerate(self.get_model(source=source).vocab):
                    if i >= size:
                        break
                    results.append(word)
                return results
        elif source in ['fasttext', 'pretrained_fasttext']:
            if size is None:
                return [w for w in self.get_model(source=source).wv.vocab]
            else:
                results = []
                for i, word in enumerate(self.get_model(source=source).wv.vocab):
                    if i >= size:
                        break
                    results.append(word)
                return results
        else:
            raise ValueError('Only embedding models are allowed!')

    def get_dimension(self, source):
        if source in self.trained_embed_models:
            return self.get_model(source=source).vectors[0].shape[0]
        elif source in ['fasttext', 'pretrained_fasttext']:
            word = self.get_words(source=source, size=1)[0]
            return self.get_model(source=source).wv[word].shape[0]
        else:
            raise ValueError('Only embedding models are allowed!')

    def get_vectors(self, source, words=None):
        # vectorize tokens
        if source in self.trained_embed_models:
            if words is None:
                words = self.get_words(source=source)
            embedding = np.empty((len(words), self.get_dimension(source=source)), dtype=np.float32)
            for i, word in enumerate(words):
                embedding[i] = self.get_vector(source=source, word=word)
            return embedding
        else:
            raise ValueError('Only embedding models are allowed!')

    def get_vector(self, source, word, oov=None):
        if source not in self.trained_embed_models:
            raise ValueError('Only embedding models are allowed!')
        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)
        try:
            return self.model[source][word]
        except KeyError as e:
            raise

    def get_synonym(self, source, word, oov=None):
        if source not in self.trained_embed_models:
            raise ValueError('Only embedding models are allowed!')
        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)
        try:
            return self.model[source].most_similar(positive=word, topn=5)
        except KeyError as e:
            raise

    def which_distance_between_two_words(self, source, word1, word2, oov=None):
        if source not in self.trained_embed_models:
            raise ValueError('Only embedding models are allowed!')
        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)
        try:
            return self.model[source].similarity(word1, word2)
        except KeyError as e:
            raise

    def vectorize(self, file_1_sents, file_2_sents, vectorizers):
        """
        Vectorizes (and/or trains) 2 files: 'Source' & 'Target' containing processed sentences using one/many vectorizers.
        
        file_1_sents  : List of **processed** sentences from File 1 (list of pd.Series)
        file_2_sents  : List of **processed** sentences from File 2 (list of pd.Series)
        vectorizers   : List of choice of vectorization models (e.g. ['count', 'USE', 'BERT'])
        
        """
        
        if isinstance(file_1_sents, pd.Series):
            file_1_sents = file_1_sents.tolist()
        if isinstance(file_2_sents, pd.Series):
            file_2_sents = file_2_sents.tolist()
            
        # cal size
        file_1_size = len(file_1_sents)
        file_2_size = len(file_2_sents)

        # tokenize
        corpus = file_1_sents + file_2_sents
        tokenized_sentences = [sent.split() for sent in corpus]
        # vocab = list(set([word for sent in tokenized_sentences for word in sent]))

        self.vectorizer = {}
        for source in vectorizers:
            
            print("\nVectorizing using ::", source)
            
            # trains and vectorizes
            if source == 'count':
                # Train
                print('Training...')
                start = time.time()
                model = ""
                model = CountVectorizer(ngram_range=(1, 1))
                vectors = model.fit_transform(corpus)
                # Vectorize
                print('Vectorizing...')
                file_1_vectors = vectors[0:file_1_size, :]
                file_2_vectors = vectors[file_1_size:, :]
                # Save
                self.vectorizer[source] = {"model":model, "file_1_vectors":file_1_vectors, "file_2_vectors":file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))

            # trains and vectorizes
            elif source == 'tfidf':
                # Train
                print('Training...')
                start = time.time()
                model = ""
                model = TfidfVectorizer(use_idf=True, ngram_range=(1, 1))
                vectors = model.fit_transform(corpus)
                # Vectorize
                print('Vectorizing...')
                file_1_vectors = vectors[0:file_1_size, :]
                file_2_vectors = vectors[file_1_size:, :]
                # Save
                self.vectorizer[source] = {"model":model, "file_1_vectors":file_1_vectors, "file_2_vectors":file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))
            
            # trains and vectorizes
            elif source == 'word2vec':
                # Train
                print('Training...')
                start = time.time()
                model = ""
                model = Word2Vec(size=300, min_count=1, workers=40)
                model.build_vocab(tokenized_sentences, progress_per=10000)
                model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=model.iter)
                # Vectorize
                print('Vectorizing...')
                file_1_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_1_sents]
                file_2_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_2_sents]
                # Save
                # model.save(filepath)
                self.vectorizer[source] = {"model":model, "file_1_vectors":file_1_vectors, "file_2_vectors":file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))
            
            # trains and vectorizes
            elif source == 'fasttext':
                # Train
                print('Training...')
                start = time.time()
                model = ""
                model = FastText(size=200, window=5, min_count=1, workers=40)
                model.build_vocab(tokenized_sentences, progress_per=10000)
                model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=model.epochs)
                # Vectorize
                print('Vectorizing...')
                file_1_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_1_sents]
                file_2_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_2_sents]
                # Save
                # model.save(filepath)
                self.vectorizer[source] = {"model":model, "file_1_vectors":file_1_vectors, "file_2_vectors":file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))
            
            # vectorizes
            elif source in ['homemade_word2vec', 'homemade_fasttext', 'pretrained_glove', 'pretrained_google', 
                            'pretrained_fasttext']:
                
                if source not in self.model:
                    raise ValueError('Did not load %s model yet' % source)
                # -- No training required - using loaded models! --
                model = ""
                model = self.model[source]
                # Vectorize
                print('Vectorizing...')
                start = time.time()
                file_1_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_1_sents]
                file_2_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_2_sents]
                # Save
                self.vectorizer[source] = {"model":model, "file_1_vectors":file_1_vectors, "file_2_vectors":file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))
            
            # trains and vectorizes
            elif source in ['word2vec_glove', 'word2vec_google']:
                # pre-trained model name (glove or google)
                pretained_source = "pretrained_{}".format(source.split("_")[1])
                if pretained_source not in self.model:
                    raise ValueError('Did not load %s model yet' % pretained_source)
                # load pre-trained model first
                pretrained_model = self.model[pretained_source]
                pretrained_model_fp = self.modelpath[pretained_source]
                embedding_size = pretrained_model.vector_size
                binary_value = False
                if pretained_source == 'pretrained_google':
                    binary_value = True
                # Train
                print('Training...')
                start = time.time()
                model = ""
                model = Word2Vec(size=embedding_size, min_count=1, workers=40)
                model.build_vocab(tokenized_sentences, progress_per=10000)
                model.build_vocab([list(pretrained_model.vocab.keys())], update=True)                  # transfer learning
                model.intersect_word2vec_format(pretrained_model_fp, binary=binary_value, lockf=1.0)   # transfer learning
                model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=model.iter)
                # Vectorize
                print('Vectorizing...')
                file_1_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_1_sents]
                file_2_vectors = [np.mean([model.wv[w] if w in model.wv.vocab else np.ones(model.vector_size) for w in sent.split()], axis=0) for sent in file_2_sents]
                # Save
                self.vectorizer[source] = {"model":model, "file_1_vectors":file_1_vectors, "file_2_vectors":file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))
            
            # vectorizes
            elif source == 'USE':
                
                if source not in self.model:
                    raise ValueError('Did not load %s model yet' % source)
                
                use_model = self.model[source]
                
                def embed(lst):
                    chunk_size = 5000
                    batches = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
                    arr=[]
                    for x in tqdm.tqdm(batches):
                        arr.append(use_model(x))
                    return np.concatenate(arr)
                
                print('Vectorizing...')
                start = time.time()
                file_1_vectors = embed(file_1_sents)
                file_2_vectors = embed(file_2_sents)
                # Save
                self.vectorizer[source] = {"model": model, 
                                           "file_1_vectors": file_1_vectors, 
                                           "file_2_vectors": file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))
            
            # vectorizes
            elif source == 'BERT':
                
                if source not in self.model:
                    raise ValueError('Did not load %s model yet' % source)

                tokenizer = self.model[source]['tokenizer']
                model = self.model[source]['model']
                model.eval()
                max_length = self.model[source]['max_length']
                embedding_func = self.model[source]['embedding_func']

                # Mean Pooling - Take attention mask into account for correct averaging
                def mean_pooling(model_output, attention_mask):
                    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
                    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
                    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                    return sum_embeddings / sum_mask

                # Tokenize the text with the provided tokenizer
                encoded_input_1 = tokenizer(file_1_sents, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
                encoded_input_2 = tokenizer(file_2_sents, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

                # Compute token embeddings
                with torch.no_grad():
                    model_output_1 = model(**encoded_input_1)
                    model_output_2 = model(**encoded_input_2)

                # Perform mean pooling
                print('Vectorizing...')
                start = time.time()
                file_1_vectors = mean_pooling(model_output_1, encoded_input_1['attention_mask'])
                file_2_vectors = mean_pooling(model_output_2, encoded_input_2['attention_mask'])
                # Save
                self.vectorizer[source] = {"model": model, 
                                           "file_1_vectors": file_1_vectors, 
                                           "file_2_vectors": file_2_vectors}
                print('Time ms:{}'.format(time.time() - start))

                            
            else:
                raise ValueError('Possible value of vectorizers are:{}'.format(",".join(self.vectorizer_list)))

        return self.vectorizer

#### Execute

** Remember to load trained embedding models into memory before using them here! Use VEC.load()

- Input File 1: List of raw Source Sentences.
- Input File 2: List of raw Target Sentences.


- Preprocessing + Vectorization
- Outputs a dict file with ID, Original, Processed and Vectors for both files.

In [33]:
## Load existing embed models ::

VEC = Embedding()

# load some pretrained embedding models like Glove, Google, FastText, etc.
VEC.load(source='pretrained_glove', fp=pretrained_glove_fp)
VEC.load(source='pretrained_google', fp=pretrained_google_fp)
VEC.load(source='pretrained_fasttext', fp=pretrained_fasttext_fp)

# load some in-house trained models like word2vec, doc2vec, word2vec intersected with glovem, etc.
VEC.load(source='trained_word2vec', fp=trained_word2vec_fp)
VEC.load(source='trained_word2vec_glove', fp=trained_word2vec_glove_fp)
VEC.load(source='trained_word2vec_google', fp=trained_word2vec_google_fp)
VEC.load(source='trained_fasttext', fp=trained_fasttext_fp)

# load some fine-tuned models on domain data
VEC.load(source='tuned_USE', fp=tuned_use_fp)
VEC.load(source='tuned_sBERT', fp=tuned_sbert_fp)

print("**Trained embedding models loaded into memory!**")

BERT  - loaded.
**Trained embedding models loaded into memory!**


In [None]:
## Execute ::


f1_sents = ["how do i close account", "how do i close all the accounts"]
f2_sents = ["how do i close account"]

#1. Get Cosine Similarity b/w two files
res = vect.vectorize(f1_sents, f2_sents, vectorizers=['count', 'tfidf'])
sim_matrix = cosine_similarity(res['tfidf']['file_1_vectors'], res['tfidf']['file_2_vectors'])
for i, f1_sent in enumerate(f1_sents):
    for j, f2_sent in enumerate(f2_sents):
        print(f1_sent, "---", f2_sent, sim_matrix[i][j])

# 2. Get vectors
for source in ['pretrained_glove', 'pretrained_google', 'pretrained_fasttext', 'home_trained_fasttext']:
    print('Source: %s' % (source))
    print(word_embedding.get_vector(source=source, word='fail'))
    print(len(word_embedding.get_vector(source=source, word='fail')))

# 3. Get Most Similar Words
for source in ['pretrained_glove', 'pretrained_google', 'pretrained_fasttext', 'home_trained_fasttext']:
    print('Source: %s' % (source))
    print(word_embedding.get_synonym(source=source, word='fail'))

# 4. Get Distance
# check WMD documentation.
w1 = 'king'
w2 = 'queen'
for source in ['pretrained_glove', 'pretrained_google', 'pretrained_fasttext', 'home_trained_fasttext']:
    print('Source: %s' % (source))
    print(word_embedding.which_distance_between_two_words(source=source,word1=w1, word2=w2))

---
---

## Text Similarity Unit (Unit 3/3)

Input data for every method would be 2 files in dict format. File dict will be file['sentences'] & file['vectors']

    - file['sentences'] == list of spacy preprocessed sentences
    - file['vectors']   == dict of vectorized sentences, keys are: 'count', 'tfidf', 'word2vec', ... etc

**Techniques tested:**
    
1. Bilingual Evaluation Understudy (BLEU)
2. Levenshtein
3. TFIDF + Euclidean Distance
4. TFIDF + Cosine Distance
5. Jaccard Distance
6. Embeddings + Cosine Distance
7. Embeddings + Smooth Inverse Frequency + Cosine Distance
8. Spacy Vectorizer + Smooth Inverse Frequency + Cosine Distance
9. Embeddings + Word Movers Distance
10. LDA + Jannon-Shenon Distance
11. Trained Attention LSTM Inception AutoEncoder Network Embeddings + Cosine Distance (**3rd BEST**)
12. Finetuned Universal Sentence Encoder(USE) + Cosine Distance (**2nd BEST**)
13. Fine-tuned Ultra-Fast Sentence Bert + Cosine Distance (**BEST**)
14. Siamese Deep Neural Network (Pending)

### 1. BLEU Similairty
- 1.0 = most similar; 0.0 = least similar

In [176]:
def bleu(output_dict):

    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    list_score = []
    for i, source in tqdm.tqdm(enumerate(source_sentences), total=len(source_sentences)):
        for j, target in enumerate(target_sentences):
            # BLEU i.e. sentence_bleu(ref=['query'], res='response')
            score = sentence_bleu([source], target)
            list_score.append(score)

    return formatScore(list_score)

### 2. Levenshtein Similairty
- 1.0 = most similar; 0.0 = least similar

In [None]:
def levenshtein(output_dict):

    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    list_score = []
    for i, source in tqdm.tqdm(enumerate(source_sentences), total=len(source_sentences)):
        for j, target in enumerate(target_sentences):
            # fuzzy i.e. fuzz.ratio(s1, s2)
            score = fuzz.ratio(source, target)/100.0
            list_score.append(score)

    return formatScore(list_score)

### 3. TFIDF + Euclidean based Similarity

- 1.0 = most similar; 0.0 = least similar

- Euclidean distance: if the distance is **small** then words in the two sentences are **close** to each other.
- Euclidean distance based similarity = 1/(1 + euclidean_distance(v1, v2))

In [None]:
def tfidf_euclidean(output_dict):

    source_vectors = output_dict['Vectorization']['tfidf']['file_1_vectors']
    target_vectors = output_dict['Vectorization']['tfidf']['file_2_vectors']

    # euclidean similarity = 1/[1 + euclidean_distance(v1, v2)]
    sim_matrix = euclidean_distances(source_vectors, target_vectors)
    list_score = sim_matrix.reshape(-1)
    list_score = list(map(lambda x: 1/(1.0 + x), list_score))

    return formatScore(list_score)

### 4. TFIDF + Cosine Distance

- Cosine: 1.0 = most similar; 0.0 = least similar

In [None]:
def tfidf_cosine(output_dict):

    source_vectors = output_dict['Vectorization']['tfidf']['file_1_vectors']
    target_vectors = output_dict['Vectorization']['tfidf']['file_2_vectors']

    # cosine is cosine_similarity(v1, v2)
    sim_matrix = cosine_similarity(source_vectors, target_vectors)
    list_score = sim_matrix.reshape(-1)

    return formatScore(list_score)

### 5. Jaccard Similarity

- Jaccard: 1.0 = most similar; 0.0 = least similar

In [None]:
def jaccard(output_dict):

    def jaccard_similarity(query, document):
        # accepts tokenized sentences only
        intersection = set(query).intersection(set(document))
        union = set(query).union(set(document))
        return len(intersection)/len(union)

    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    list_score = []
    for i, source in tqdm.tqdm(enumerate(source_sentences), total=len(source_sentences)):
        for j, target in enumerate(target_sentences):
            # jaccard sim = jaccard_similarity(tokenized_doc_1, tokenized_doc_2)
            score = jaccard_similarity(source.split(), target.split())
            list_score.append(score)

    return formatScore(list_score)

### 6. Embeddings + Cosine distance
- 1.0 = most similar; 0.0 = least similar
- embeddings = ['word2vec', 'fasttext', 'pretrained_homemade', 'pretrained_glove', 'pretrained_google', 'pretrained_fasttext']

In [None]:
def embedding_cosine(output_dict, vectorizer=None):

    # check errors
    if vectorizer is None:
        raise ValueError('Pass a valid vectorizer!')
    elif vectorizer not in output_dict['Vectorization'].keys():
        raise ValueError("Load vectorizer first, use 'load_data(f1,f2, vectorizers=[{}])', when loading!".format(vectorizer))
    else:
        pass

    source_vectors = output_dict['Vectorization'][vectorizer]['file_1_vectors']
    target_vectors = output_dict['Vectorization'][vectorizer]['file_2_vectors']

    # cosine = cosine_similarity(v1, v2)
    sim_matrix = cosine_similarity(source_vectors, target_vectors)
    list_score = sim_matrix.reshape(-1)
    return formatScore(list_score)

### 7.   Embeddings +  Smooth Inverse Frequency + Cosine Similarity
- 1.0 = most similar; 0.0 = least similar
- embeddings = ['word2vec', 'fasttext', 'pretrained_homemade', 'pretrained_glove', 'pretrained_google', 'pretrained_fasttext']

In [None]:
def embedding_sif_cosine(output_dict, vectorizer=None):

    def sentence2vec(sentence_list, model):
        # tokenize sentences (with words in model's vocab)
        tokenised_sentence_list = [[word for word in sent.split() if word in model.wv.vocab] for sent in sentence_list]

        # rare case when not a single word was found in vocab
        for index, element in enumerate(tokenised_sentence_list):
            if element == []:
                tokenised_sentence_list[index] = ["unknown"]

        # use SIF to get sentence vectors
        word_counts = Counter(itertools.chain(*tokenised_sentence_list))
        embedding_size = model.vector_size
        a = 0.001

        sentence_set = []
        for sentence in tokenised_sentence_list:
            vs = np.zeros(embedding_size)
            sentence_length = len(sentence)
            for word in sentence:
                a_value = a / (a + word_counts[word])                         # smooth inverse frequency, SIF
                vs = np.add(vs, np.multiply(a_value, model.wv[word]))         # vs += sif * word_vector
            vs = np.divide(vs, sentence_length)                               # weighted average
            sentence_set.append(vs)
        return sentence_set

    # check errors
    if vectorizer is None:
        raise ValueError('Pass a valid vectorizer!')
    elif vectorizer not in output_dict['Vectorization'].keys():
        raise ValueError("Load vectorizer first, use 'load_data(f1,f2, vectorizers=[{}])', when loading!".format(vectorizer))
    else:
        pass

    # get model
    model = output_dict['Vectorization'][vectorizer]['model']

    # get processed sentences
    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    # gather all sentences for SIF
    sentence_list = []
    sentence_list.extend(source_sentences)
    sentence_list.extend(target_sentences)

    # get SIF vectors instead
    all_vectors = sentence2vec(sentence_list, model)
    source_vectors_SIF = all_vectors[ :len(source_sentences)]
    target_vectors_SIF = all_vectors[len(source_sentences): ]

    # cosine = cosine_similarity(sif_v1, sif_v2)
    sim_matrix = cosine_similarity(source_vectors_SIF, target_vectors_SIF)
    list_score = sim_matrix.reshape(-1)
    return formatScore(list_score)

### 8.   SPACY + Smooth Inverse Frequency + Cosine Similarity
- 1.0 = most similar; 0.0 = least similar
- Uses spacy large model to get embeddings

In [None]:
def spacy_sif_cosine(output_dict):

    def sentence2vec_PCA(sentence_list, nlp_model):
        # tokenize sentences (with words in spacy nlp's vocab)
        tokenised_sentence_list = [[word for word in sent.split() if word in nlp_model.vocab] for sent in sentence_list]

        # rare case when not a single word was found in vocab
        for index, element in enumerate(tokenised_sentence_list):
            if element == []:
                tokenised_sentence_list[index] = ["unknown"]

        # use SIF to get sentence vectors
        word_counts = Counter(itertools.chain(*tokenised_sentence_list))
        embedding_size = nlp_model.vocab["a"].vector.shape[0]
        a = 0.001

        sentence_set = []
        for sentence in tokenised_sentence_list:
            vs = np.zeros(embedding_size)
            sentence_length = len(sentence)
            for word in sentence:
                a_value = a / (a + word_counts[word])                                  # smooth inverse frequency, SIF
                vs = np.add(vs, np.multiply(a_value, nlp_model.vocab[word].vector))    # vs += sif * word_vector
            vs = np.divide(vs, sentence_length)                                        # weighted average
            sentence_set.append(vs)

        # calculate PCA of this sentence set
        pca = PCA()
        pca.fit(np.array(sentence_set))
        u = pca.components_[0]                                                # PCA vector
        u = np.multiply(u, np.transpose(u))

        # Padding vectors (occurs if we have less sentences than embeddings_size)
        if len(u) < embedding_size:
            for i in range(embedding_size - len(u)):
                u = np.append(u, 0)                                          # add needed extension for multiplication below

        # resulting final sentence vectors
        sentence_vecs = []
        for vs in sentence_set:
            sub = np.multiply(u,vs)
            sentence_vecs.append(np.subtract(vs, sub))

        return sentence_vecs

    # Spacy's large model
    model = nlp

    # get processed sentences
    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    # gather all sentences for SIF
    sentence_list = []
    sentence_list.extend(source_sentences)
    sentence_list.extend(target_sentences)

    # get SIF + PCA vectors with spacy
    all_vectors = sentence2vec_PCA(sentence_list, model)
    source_vectors_SIF = all_vectors[ :len(source_sentences)]
    target_vectors_SIF = all_vectors[len(source_sentences): ]

    # cosine = cosine_similarity(sif_v1, sif_v2)
    sim_matrix = cosine_similarity(source_vectors_SIF, target_vectors_SIF)
    list_score = sim_matrix.reshape(-1)
    return formatScore(list_score)

### 9.  Embeddings + Word Mover Distance

- 1.0 = most similar; 0.0 = least similar

---
Word Mover's Distance (WMD) uses the word embeddings of the words in two texts to **measure the minimum distance** that the words in one text need to travel in **semantic space** to reach the words in the other text.

The **WMD** is measured by measuring the minimum **Earth mover's distance** between each word in the two documents in **word2vec/fasttetx** space. if the distance is small then words in the two documents are close to each other.

- Word Mover Distance = WMD(v1, v2)
- Word Mover Distance based similarity = 1/(1 + WMD(v1, v2))

In [None]:
def embeddings_WMD(output_dict, vectorizer):

    def WMD(sent1, sent2):
        wmd_distance = model.wmdistance(sent1.split(), sent2.split())
        wmd_score = 1/(1.0 + wmd_distance)
        return wmd_score

    # get model
    model = output_dict['Vectorization'][vectorizer]['model']

    # processed sentences
    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    list_score = []
    for i, source in tqdm.tqdm(enumerate(source_sentences), total=len(source_sentences)):
        for j, target in enumerate(target_sentences):
            # WMD = model.wmdistance(tokenized_doc_1, tokenized_doc_2)
            score = WMD(source, target)
            list_score.append(score)

    return formatScore(list_score)

### 10. LDA + Jensen-Shannon distance

In [None]:
## get processed sentences
# source_sentences = output_dict['file1']['Source_processed']
# target_sentences = output_dict['file2']['Target_processed']

# sentence_list = []
# sentence_list.extend(source_sentences)
# sentence_list.extend(target_sentences)

In [None]:
## Topic modelling using LDA

# def train_LDA(sentence_list):
#     # LDA
#     # We setup parameters like number of topics, the chunksize to use in Hoffman method
#     # We also do 2 passes of the data since this is a small dataset, so we want the distributions to stabilize

#     # list of tokenized sentences
#     tokenized_sentences = []
#     tokenized_sentences = list(map(str.split, sentence_list))
#     print("Total sentences:", len(tokenized_sentences))

#     start = time.time()
#     num_topics = 100
#     chunksize = 300

#     dictionary = corpora.Dictionary(tokenized_sentences)
#     corpus = [dictionary.doc2bow(doc) for doc in tokenized_sentences]

#     # low alpha means each doc is only represented by a small number of topics, and vice versa
#     # low eta means each topic is only represented by a small number of words, and vice versa
#     lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, alpha=1e-2, eta=0.5e-2,
#                    chunksize=chunksize, minimum_probability=0.0, passes=2)

#     print("Time to train LDA model on ", len(tokenized_sentences), "sentences: ", round((time.time() - start)/60, 4), "min")
#     return lda

In [None]:
# train
# lda = train_LDA(sentence_list)

In [None]:
## Some Topics

In [None]:
# # Show topics with top num_words contributing to that topic

# lda.show_topics(num_topics=2, num_words=5)

In [None]:
# # Show contributing words in a topic (defined by 'topicid')

# lda.show_topic(topicid=87, topn=10)

In [None]:
## Represent a 'sentence' or 'article' in terms of topics

In [None]:
# select_any_index = 1
# tokenized_sent = tokenized_sentences[select_any_index]
# print("Sentence: ", tokenized_sent)

# # get the topic contributions for the document chosen at random above
# bow = dictionary.doc2bow(tokenized_sent)
# doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=bow)])

# # bar plot of topic distribution for this document
# fig, ax = plt.subplots(figsize=(12,6));
# patches = ax.bar(np.arange(len(doc_distribution)), doc_distribution)
# ax.set_xlabel('Topic ID', fontsize=15)
# ax.set_ylabel('Topic Contribution', fontsize=15)
# ax.set_title("Topic Distribution for:- " + " ".join(tokenized_sent), fontsize=20)
# ax.set_xticks(np.linspace(10,100,10))
# fig.tight_layout()
# plt.show()

# # Top 5 contributing topics and their words
# print("> Top Topics")
# for i in doc_distribution.argsort()[-5:][::-1]:
#     print(i, lda.show_topic(topicid=i, topn=3))

#### Calculate JS distance b/w two queries

LDA trained on corpus = source + target
- Tokenize both sentence lists
- Get doc_distribution of each tokenized_sentence using LDA corpus

In [None]:
# def LDA_jensen_shannon(output_dict):
#     def get_topic_distribution(tokenized_sentence):
#         # e.g. tokenized_sentence = ['how', 'open', 'account']
#         bow = dictionary.doc2bow(tokenized_sentence)
#         doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=bow)])
#         return doc_distribution

#     # get processed sentences
#     source_sentences = output_dict['file1']['Source_processed']
#     target_sentences = output_dict['file2']['Target_processed']

#     list_score = []
#     for i, source in tqdm.tqdm(enumerate(source_sentences), total=len(source_sentences)):
#         for j, target in enumerate(target_sentences):

#             # tokenize
#             source_tokenized = source.split()
#             target_tokenized = target.split()

#             # get LDA topic distribution (shape = [1,num_topics])
#             source_distribution = get_topic_distribution(source_tokenized)
#             target_distribution = get_topic_distribution(target_tokenized)

#             # JS distance
#             js_dist = jensenshannon(source_distribution, target_distribution, base=2)
#             if str(js_dist) == "nan":
#                 js_dist = 0
#             score = 1/(1.0 + js_dist)
#             list_score.append(score)

#     return formatScore(list_score)

- Takes approx. 3 mins

### 11. Attention LSTM Inception AutoEncoder Network Embeddings + Cosine Similarity

Note:

Training for this notebook was done in `Training__TextSimilarity__AttentionLST.ipynb`. The network was trained on all utterances + faqs for fasttext embeddings as well as for the network layers.

#### Modules

In [None]:
class Attention(Layer):

    def __init__(self, regularizer=None, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.regularizer = regularizer
        self.supports_masking = True

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.context = self.add_weight(name='context',
                                       shape=(input_shape[-1], 1),
                                       initializer=initializers.RandomNormal(
                                            mean=0.0, stddev=0.05, seed=None),
                                       regularizer=self.regularizer,
                                       trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        attention_in = K.exp(K.squeeze(K.dot(x, self.context), axis=-1))
        attention = attention_in/K.expand_dims(K.sum(attention_in, axis=-1), -1)

        if mask is not None:
            # use only the inputs specified by the mask
            # import pdb; pdb.set_trace()
            attention = attention*K.cast(mask, 'float32')

        weighted_sum = K.batch_dot(K.permute_dimensions(x, [0, 2, 1]), attention)
        return weighted_sum

    def compute_output_shape(self, input_shape):
        print(input_shape)
        return (input_shape[0], input_shape[-1])

In [None]:
def FastTextVec(filename):
    embeddings = {}
    model = FastText.load(filename)
    model.init_sims(replace=True)
    words = list(model.wv.vocab)
    i = 0
    for word in words:
        try:
            coefs = np.asarray(model.wv[word], dtype='float32')
            embeddings[word] = coefs
        except ValueError:
            i += 1
    return embeddings

In [None]:
def createVocabAndData(sentences, max_len):
    sent_nums = []
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    # print('sequences',sequences)
    vocab = tokenizer.word_index
    max_length = max([len(seq) for seq in sequences])
    #print('len(seq)',max_length)
    for seq in sequences:
        sent_nums.append(len(seq))
    data = pad_sequences(sequences, maxlen=max_len)
    return vocab, data, sent_nums, tokenizer

In [None]:
def createEmbeddingMatrix(word_index, embeddings_index):
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

#### Network Settings

In [None]:
EMBEDDING_DIM = 200
MAX_NB_WORDS = 2000000
TEST_SPLIT = 0.1
VALIDATION_SPLIT = 0.1
max_len = 10

#### Load Models : (1) Embedding & (2) Attention LSTM Autoencoder

In [None]:
""" 1. Load embedding model """

# Select model to load
embed_model_3DR = "../Innovation Tracks/2. Keyword Generation/models/embeddings/3DR_fasttext_200dim_Feb2021.model"
embed_model_UTTERANCES = "models/UTTERANCES_fasttext_200dim_Mar2021.model"

# Load embedding model
load_model_fp = embed_model_UTTERANCES
embeddings = FastTextVec(load_model_fp)


""" 2. Attention LSTM Autoencoder Model """ 

# Select model to load
lstm_model_3DR = "../Innovation Tracks/2. Keyword Generation/models/inception_autoencoder/3DR_model-fasttext_inception_atten_lstmautoencoder.h5"
lstm_model_UTTERANCES = "models/UTTERANCES_model-fasttext_inception_atten_lstmautoencoder.h5"

# Load attn lstm model
load_model_fp = lstm_model_UTTERANCES
with CustomObjectScope({'Attention': Attention}):
    lstm_autoencoder = load_model(load_model_fp)

# get intermediate layer values
layer_name = 'word_attention'
intermediate_layer_model_diff_arch = Model(inputs = lstm_autoencoder.get_layer('input_layer').output,
                                           outputs = lstm_autoencoder.get_layer(layer_name).output)

#### Model Prediciton

- Output 1 (predict_embeddings_intermediate_output): Given a list of sents, predict its embeddings.
- Output 2 (predict_similarity_attention_lstm_autoencoder): Using predicted embeddings from output_1, find cosine similarity

In [196]:
def predict_embeddings_intermediate_output(sentence_list):
    """
    Returns sentence level embeddings.
    """
    if isinstance(sentence_list, list):
        vocab, data, _, _ = createVocabAndData(sentence_list, max_len)    # tokenized text into dict(words: numbers)
        embedding_mat = createEmbeddingMatrix(vocab, embeddings)          # create dict(numbers: embedding) mapping

        # convert each sent into embedding
        df_processed = pd.DataFrame(data)
        text_features = df_processed.columns
        embed = df_processed[text_features].values
        embed = embed.astype('int')
        embed = embedding_mat[embed]

        # predict final network embeddings for each sent
        intermediate_output = intermediate_layer_model_diff_arch.predict(embed)
        normalized_output = preprocessing.normalize(intermediate_output)
        return normalized_output
    else:
        print("Please pass a list!")
              
def predict_similarity_attention_lstm_autoencoder(output_dict):
    """
    Performs similarity metric.
    """
    
    # get processed sentences
    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    # predict embeddings for source_sentences
    source_vectors = predict_embeddings_intermediate_output(source_sentences)

    # predict embeddings for target_sentences
    target_vectors = predict_embeddings_intermediate_output(target_sentences)

    # calculate similarity score, cosine_similarity(v1, v2)
    list_score = []
    attn_sim_matrix = cosine_similarity(source_vectors, target_vectors)
    list_score = attn_sim_matrix.reshape(-1)

    return formatScore(list_score)


## SAMPLE

# get network's predicted embeddings at sentence level (with word atteniton)
# embeds = predict_embeddings_intermediate_output(["how do i open account", "how to close account"])

# pass the output_dict containing list of processed sentences to get simialrity metrics
# sim_metrics = attention_lstm_autoencoder(output_dict)

----

### 12. Universal Sentence Encoder(USE) + Cosine Similarity

- Using original google large USE model, finetuning with word addtion is not possible with current arch, hence ignoring.

In [219]:
def USE(output_dict):

    def embed(lst):
        chunk_size = 5000
        batches = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
        arr=[]
        for x in tqdm.tqdm(batches):
            arr.append(use_model(x))
        return np.concatenate(arr)

    # get processed sentences
    source_sentences = output_dict['file1']['Source_processed']
    target_sentences = output_dict['file2']['Target_processed']

    # get embeddings
    source_embeddings = embed(source_sentences)
    target_embeddings = embed(target_sentences)

    # similarity
    corr = np.inner(source_embeddings, target_embeddings)
    # OR use: corr = cosine_similarity(source_embeddings, target_embeddings)

    list_score = corr.reshape(-1)
    return formatScore(list_score)

- Source embeddings created. 0.290 mins;  shape=(137698, 512)
- Target embeddings created. 0.001 mins;  shape=(966, 512)

- USE -- Time taken (mins): 11.619

### <ins>Execute</ins>

Execute using input lists and check the methods if they are working. Use the next section of final execution.

#### Input Data

*Remember to load trained embedding models into memory before using them here! Use VEC.load()

- `Input File 1`: List of raw Source Sentences.
- `Input File 2`: List of raw Target Sentences.
- `Output`: pd.DataFrame, dict (file with ID, Original, Processed and Vectors for both files)

In [223]:
def load_data(file1, file2, list_of_vectorizers=['tfidf']):
    """
    Module to load two files and vectorize them. Files can be 'pd.Series', pd.DataFrame or '.txt' file with sentences.
    """
    
    print("Processing dataset...")
    # read
    if isinstance(file1, pd.DataFrame):
        file1.rename(columns={list(file1)[0]:'Source'}, inplace = True)
        file2.rename(columns={list(file2)[0]:'Target'}, inplace = True)
    elif isinstance(file1, pd.Series):
        file1 = pd.DataFrame({'Source': file1})
        file2 = pd.DataFrame({'Target': file2})
    elif isinstance(file1, list):
        file1 = pd.DataFrame({'Source': pd.Series(file1)})
        file2 = pd.DataFrame({'Target': pd.Series(file2)})
    elif ntpath.basename(file1).endswith(".txt"):
        with open(file1) as f: fileobject = io.StringIO(f.read().replace('""', '"')); f.close()
        file1 = pd.read_csv(fileobject, sep='\t',  lineterminator='\n', names=['Source']).reset_index(drop=True)
        with open(file2) as f: fileobject = io.StringIO(f.read().replace('""', '"')); f.close()
        file2 = pd.read_csv(fileobject, sep='\t',  lineterminator='\n', names=['Target']).reset_index(drop=True)
    else:
        raise ValueError('Error: Please use dataframe or txt format only!')
    file1['Source_ID'] = [f"file1_{i}" for i in range(len(file1))]
    file2['Target_ID'] = [f"file1_{i}" for i in range(len(file2))]
    
    
    # Pre-processing
    preprocessText = preprocessText_spacy(resources_dir_path)
    file1['Source_processed'] = [" ".join(x["lemma_list"]).lower() \
                                 if len(str(x).strip()) > 2 else None \
                                 for x in preprocessText.run_pipeline(file1.Source.tolist(), operations=['basic', 'deep', 'spacy'])]
    
    file2['Target_processed'] = [" ".join(x["lemma_list"]).lower() \
                                 if len(str(x).strip()) > 2 else None \
                                 for x in preprocessText.run_pipeline(file2.Target.tolist(), operations=['basic', 'deep', 'spacy'])]
    print("> Pre-processing done.")

    
    # Vectorization
    output = {}
    result = VEC.vectorize(file1['Source_processed'].tolist(),
                           file2['Target_processed'].tolist(),
                           list_of_vectorizers)
    output = {
        "file1": file1.to_dict('list'),
        "file2": file2.to_dict('list'),
        "Vectorization": result }
    print("> Vectorization done. Vectors stored in 'dict' format: output['Vectorization'] ")
    
    file1 = pd.DataFrame.from_dict(output['file1'])
    file2 = pd.DataFrame.from_dict(output['file2'])
    file1['key'], file2['key'] = 0, 0
    dataset = file1.merge(file2, how='outer').drop(columns=['key'])

    return dataset, output

def save_result(df, fp):
    # Save large df into multiple split files
    chunk_size = 500000
    num_chunks = len(df) // chunk_size + 1
    for i in tqdm.tqdm(range(num_chunks)):
        fp = fp + "_{0:0=3d}.txt".format(i+1)
        subset = df[i*chunk_size: (i+1)*chunk_size]
        subset.to_csv(fp, header=True, index=None, sep='\t', mode='a')
    print("Process executed and saved. Location:", fp)


## SAMPLE
# df, output_dict = load_data(file_1, file_2, list_of_vectorizers=['tfidf', 'count', 'BERT'])

#### Run

In [224]:
file_1 = ["How do i close UTMA account?", "How do i open IRA account ??"]

file_2 = ["How do i close a trust account?",
          "How to close UTMA account?",
          "How do i open IRA account?" ,
          "Docs required for closing acc"]

df, output_dict = load_data(file_1, file_2, list_of_vectorizers=['tfidf', 'BERT'])

Processing dataset...
stopwords loaded.
synonyms_noun_verb loaded.
contractions loaded.
> Pre-processing done.

 tfidf
Training...
Vectorizing...
Time ms:0.002544403076171875

 BERT
Vectorizing...
Time ms:0.0006747245788574219
> Vectorization done. Vectors stored in 'dict' format: output['Vectorization'] 


In [None]:
start = time.time()

df['BLEU'] = bleu(output_dict)
df['lev'] = levenshtein(output_dict)
df['tfidf_euclidean'] = tfidf_euclidean(output_dict)
df['tfidf_cosine'] = tfidf_cosine(output_dict)
df['jaccard'] = jaccard(output_dict)

df['cos_word2vec'] = embedding_cosine(output_dict, vectorizer="word2vec")
df['cos_fasttext'] = embedding_cosine(output_dict, vectorizer="fasttext")
df['cos_homemade_fasttext'] = embedding_cosine(output_dict, vectorizer="homemade_fasttext")
df['cos_preglove'] = embedding_cosine(output_dict, vectorizer="pretrained_glove")
df['cos_pregoogle'] = embedding_cosine(output_dict, vectorizer="pretrained_google")
df['cos_prefasttext'] = embedding_cosine(output_dict, vectorizer="pretrained_fasttext")
df['cos_w2v_glove'] = embedding_cosine(output_dict, vectorizer="word2vec_glove")
df['cos_w2v_google'] = embedding_cosine(output_dict, vectorizer="word2vec_google")
df['cos_SIF_word2vec'] = embedding_sif_cosine(output_dict, vectorizer="word2vec")
df['cos_SIF_fasttext'] = embedding_sif_cosine(output_dict, vectorizer="fasttext")
df['cos_SIF_homemade_fasttext'] = embedding_sif_cosine(output_dict, vectorizer="homemade_fasttext")
df['cos_SIF_preglove'] = embedding_sif_cosine(output_dict, vectorizer="pretrained_glove")
df['cos_SIF_pregoogle'] = embedding_sif_cosine(output_dict, vectorizer="pretrained_google")
df['cos_SIF_prefasttext'] = embedding_sif_cosine(output_dict, vectorizer="pretrained_fasttext")
df['cos_SIF_w2v_glove'] = embedding_sif_cosine(output_dict, vectorizer="word2vec_glove")
df['cos_SIF_w2v_google'] = embedding_sif_cosine(output_dict, vectorizer="word2vec_google")

df['spacy_SIF_cosine'] = spacy_sif_cosine(output_dict)

df['wmd_word2vec'] = embeddings_WMD(output_dict, vectorizer="word2vec")
df['wmd_fasttext'] = embeddings_WMD(output_dict, vectorizer="fasttext")
df['wmd_homemade_fasttext'] = embeddings_WMD(output_dict, vectorizer="homemade_fasttext")
df['wmd_preglove'] = embeddings_WMD(output_dict, vectorizer="pretrained_glove")
df['wmd_pregoogle'] = embeddings_WMD(output_dict, vectorizer="pretrained_google")
df['wmd_prefasttext'] = embeddings_WMD(output_dict, vectorizer="pretrained_fasttext")
df['wmd_w2v_glove'] = embeddings_WMD(output_dict, vectorizer="word2vec_glove")
df['wmd_w2v_google'] = embeddings_WMD(output_dict, vectorizer="word2vec_google")

df['LDA'] = LDA_jensen_shannon(output_dict)
df['Attn_lstm'] = attention_lstm_autoencoder(output_dict)
df['USE'] = USE(output_dict)
df['Fast_BERT'] = fast_bert(output_dict)

save_result(df)

print("=> Time taken(mins):", (time.time() - start)/60)

- source embeddings created. 0.290 mins;  shape= (137698, 512)
- target embeddings created. 0.002 mins;  shape= (966, 512)
- Time taken (mins): 20.619174893697103

## Use-case of Text Similarity - Example
---
1. Get a dataset
2. Explore listed methods
3. Evaluate based on domain knowledge and judgment
4. Output best method results
---

### 1. Dataset Selection

In [None]:
## Load data

# Ideally a truth dataset could work - dataset with question pairs with a similairty value
dataset_path = "...."


df_validation = pd.read_excel(dataset_path)
df_validation = df_validation.rename(columns={"Search Term": "Utterance", "FAQ Question": "FAQ"})

# tokenize and filter sentences here:
min_words = 4
sample_size = 30
dup_count = 5        # count of each utterance

df_val = df_validation[df_validation['# Words'] >= min_words][['Utterance', 'FAQ']].dropna().reset_index(drop=True).copy()
uttFreq = FreqDist(df_val['Utterance'].tolist())
len(uttFreq)
candidate_utterances = [utt for utt, freq in uttFreq.items() if freq >= dup_count][:sample_size]

In [None]:
# create positive sample space


positive_samples = pd.DataFrame()

for utt in tqdm.tqdm(candidate_utterances):
    subset = df_val[df_val['Utterance'] == utt]
    # size for each utterance = dup count
    subset = subset[:dup_count]
    subset['Similarity'] = 1
    positive_samples = positive_samples.append(subset)

positive_samples = positive_samples.reset_index(drop=True)
positive_samples = positive_samples.rename(columns={"Utterance": "Source", "FAQ": "Target"})

In [None]:
# create negative sample space


negative_samples = pd.DataFrame()

for utt in tqdm.tqdm(candidate_utterances):
    # choose subset
    subset = df_val[df_val['Utterance'] != utt]

    # modify
    subset = shuffle(subset, random_state=7)
    subset = subset[:100].reset_index(drop=True)
    subset = subset[['FAQ']]
    subset['Utterance'] = utt

    # cal similarity using baseline method
    embedding_method = "tfidf"

    f1 = list(subset['Utterance'].unique())
    f2 = list(subset['FAQ'].unique())

    subset_dict = load_data(f1, f2, vectorizers=[embedding_method])
    file1 = pd.DataFrame.from_dict(subset_dict['file1'])
    file2 = pd.DataFrame.from_dict(subset_dict['file2'])
    file1['key'] = 0
    file2['key'] = 0
    subset_df = file1.merge(file2, how='outer').drop(columns=['key'])

    subset_df['cosine'] = tfidf_cosine(subset_dict)

    # select dissimillar pairs (threshold kept very low)
    subset = shuffle(subset_df[subset_df['cosine'] < 0.05], random_state=5)
    subset = subset[['Source', 'Target']].reset_index(drop=True)

    # size for each utterance = dup count
    subset = subset[:dup_count]
    subset['Similarity'] = 0
    negative_samples = negative_samples.append(subset)

negative_samples = negative_samples.reset_index(drop=True)

In [None]:
## final dataset ready for computation


validation = positive_samples.append(negative_samples).sort_values(by=['Source', 'Similarity'], ascending=[True, False]).reset_index(drop=True)

### 2. Exploring methods

In [None]:
def run_validation(validation_set):

    # files
    f1 = list(validation_set['Source'].unique())
    f2 = list(validation_set['Target'].unique())

    # validation dict
    val_dict = load_data(f1, f2, vectorizers=['count', 'tfidf', 'word2vec', 'fasttext', 'homemade_fasttext',
                                              'pretrained_glove', 'pretrained_google', 'pretrained_fasttext',
                                              'word2vec_glove', 'word2vec_google'])

    # validation df
    file1 = pd.DataFrame.from_dict(val_dict['file1'])
    file2 = pd.DataFrame.from_dict(val_dict['file2'])
    file1['key'] = 0
    file2['key'] = 0
    df_val = file1.merge(file2, how='outer').drop(columns=['key'])

    # insert: truth values
    scores = []
    for s, t in zip(df_val['Source'], df_val['Target']):
        truth_set = validation_set[(validation_set['Source'] == s) & (validation_set['Target'] == t)]
        if len(truth_set) > 0:
            sim_score = truth_set['Similarity'].values[0]  # 0 or 1
        else:
            sim_score = "none"
        scores.append(sim_score)
    df_val['Similarity'] = scores


    # SIMILARITY TECHNIQUES

    df_val['BLEU'] = bleu(val_dict)
    df_val['lev'] = levenshtein(val_dict)
    df_val['tfidf_euclidean'] = tfidf_euclidean(val_dict)
    df_val['tfidf_cosine'] = tfidf_cosine(val_dict)
    df_val['jaccard'] = jaccard(val_dict)

    df_val['cos_word2vec'] = embedding_cosine(val_dict, vectorizer="word2vec")
    df_val['cos_fasttext'] = embedding_cosine(val_dict, vectorizer="fasttext")
    df_val['cos_homemade_fasttext'] = embedding_cosine(val_dict, vectorizer="homemade_fasttext")
    df_val['cos_preglove'] = embedding_cosine(val_dict, vectorizer="pretrained_glove")
    df_val['cos_pregoogle'] = embedding_cosine(val_dict, vectorizer="pretrained_google")
    df_val['cos_prefasttext'] = embedding_cosine(val_dict, vectorizer="pretrained_fasttext")
    df_val['cos_w2v_glove'] = embedding_cosine(val_dict, vectorizer="word2vec_glove")
    df_val['cos_w2v_google'] = embedding_cosine(val_dict, vectorizer="word2vec_google")

    df_val['cos_SIF_word2vec'] = embedding_sif_cosine(val_dict, vectorizer="word2vec")
    df_val['cos_SIF_fasttext'] = embedding_sif_cosine(val_dict, vectorizer="fasttext")
    df_val['cos_SIF_homemade_fasttext'] = embedding_sif_cosine(val_dict, vectorizer="homemade_fasttext")
    df_val['cos_SIF_preglove'] = embedding_sif_cosine(val_dict, vectorizer="pretrained_glove")
    df_val['cos_SIF_pregoogle'] = embedding_sif_cosine(val_dict, vectorizer="pretrained_google")
    df_val['cos_SIF_prefasttext'] = embedding_sif_cosine(val_dict, vectorizer="pretrained_fasttext")
    df_val['cos_SIF_w2v_glove'] = embedding_sif_cosine(val_dict, vectorizer="word2vec_glove")
    df_val['cos_SIF_w2v_google'] = embedding_sif_cosine(val_dict, vectorizer="word2vec_google")

    df_val['spacy_SIF_cosine'] = spacy_sif_cosine(val_dict)

    df_val['wmd_word2vec'] = embeddings_WMD(val_dict, vectorizer="word2vec")
    df_val['wmd_fasttext'] = embeddings_WMD(val_dict, vectorizer="fasttext")
    df_val['wmd_homemade_fasttext'] = embeddings_WMD(val_dict, vectorizer="homemade_fasttext")
    df_val['wmd_preglove'] = embeddings_WMD(val_dict, vectorizer="pretrained_glove")
    df_val['wmd_pregoogle'] = embeddings_WMD(val_dict, vectorizer="pretrained_google")
    df_val['wmd_prefasttext'] = embeddings_WMD(val_dict, vectorizer="pretrained_fasttext")
    df_val['wmd_w2v_glove'] = embeddings_WMD(val_dict, vectorizer="word2vec_glove")
    df_val['wmd_w2v_google'] = embeddings_WMD(val_dict, vectorizer="word2vec_google")

    # USE
    df_val['USE'] = USE(val_dict)

    ## LDA
    ## not-used: df_val['LDA'] = LDA_jensen_shannon(val_dict)

    # ATTN LSTM
    df_val['Attn_lstm'] = attention_lstm_autoencoder(val_dict)

    return val_dict, df_val

In [None]:
# Validation Dataset (along with equal sim values)
validation

In [None]:
val_dict, df_val = run_validation(validation)

### 3. Evaluation

In [None]:
final_df_val = df_val[df_val['Similarity'] != 'none'].reset_index(drop=True).copy()

In [None]:
threshold = [0.20, 0.30, 0.40, 0.45, 0.50, 0.55, 0.58, 0.60, 0.62, 0.64, 0.66, 0.68, 0.70, 0.75, 0.80, 0.90, 0.95]

## find threshold for accuracy
acc_scores = {}
for col in df_val.columns[7:]:
    list_score = []
    for thr in threshold:
        final_df_val["acc_{}".format(col)] = final_df_val[col].apply(lambda x: 1 if x > thr else 0)
        score = accuracy_score(final_df_val['Similarity'].tolist(), final_df_val["acc_{}".format(col)].tolist())
        list_score.append(score)
    acc_scores["acc_{}".format(col)] = list_score

colors = list("rgbcmyk")
for x,y in acc_scores.items():
    plt.figure(figsize=(12,5))
    plt.title(x)
    plt.scatter(threshold, y)
plt.show()

In [None]:
final_threshold = 0.50

# Display accuracies using baseline threshold
for col in df_val.columns[7:]:
    final_df_val["acc_{}".format(col)] = final_df_val[col].apply(lambda x: 1 if x > final_threshold else 0)
    score = accuracy_score(final_df_val['Similarity'].tolist(), final_df_val["acc_{}".format(col)].tolist())
    print(col, "-->", score)

In [None]:
# SAVING REPORT...

final_df_val.to_excel("...")

### 4. Display results for the best method

In [None]:
def embed(input):
    return use_model(input)

def plot_similarity(labels, features, rotation):
    corr = np.inner(features, features)
    sns.set(font_scale=2.3)
    fig, ax = plt.subplots(figsize=(15, 15))
    g = sns.heatmap(corr, xticklabels=labels, yticklabels=labels, vmin=0, vmax=1, cmap="YlOrRd", ax=ax)
    g.set_xticklabels(labels, rotation=rotation)
    g.set_title("Semantic Textual Similarity")

def run_and_plot(output_dict):
    source_sents = output_dict['file1']['Source']
    source_embeds = output_dict['file1']['Source_processed']
    target_sents = output_dict['file2']['Target']
    target_embeds = output_dict['file2']['Target_processed']

    message_embeddings_ = embed(source_embeds + target_embeds)
    plot_similarity(source_sents + target_sents, message_embeddings_, 90)

In [None]:
source_examples = [
    'close acc',
    'how do I open account',
    'what IRA stands for']

In [None]:
target_examples = [
    'How do I close an account?',
    'Documents required for opening account',
    'What is IRA?']

In [None]:
run_and_plot(load_data(source_examples, target_examples, vectorizers=['count']))

----