# Answer Clustering

To test and compare our dual encoders in a realistic setup, we cannot use every answer in our training set as a candidate answer during prediction. For that reason, in this notebook we will cluster the answers and select a smaller number of exemplars from those clusters.

In [129]:
# Define which dataset you want to cluster.
# This name is used as the sub-directory of ../data/ for every file read or written below.
dataset = "pinterest"

In [130]:
# Load data
import numpy as np
import pickle

# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
# The `with` blocks make sure each file handle is closed once its object has been read.
with open('../data/'+dataset+'/tmp/word2ix.pkl', 'rb') as word2ix_file:
    word2ix = pickle.load(word2ix_file)
with open('../data/'+dataset+'/tmp/de_train.pkl', 'rb') as train_file:
    trn_q_vecs, trn_a_vecs, trn_y = pickle.load(train_file)

We will consider only unique answers.

In [53]:
# Keep only the answers that belong to positive (label == 1) training pairs.
train_answers = trn_a_vecs[trn_y == 1]
# De-duplicate whole answers by hashing each one as a tuple.
# np.unique without `axis` flattens a rectangular input into unique token ids, and recent
# NumPy versions refuse to build an array from ragged rows, so a set is used instead.
# Sorting keeps the lexicographic ordering np.unique produced on an array of tuples.
unique_ans = sorted({tuple(row) for row in train_answers})
possible_ans = [np.array(ans) for ans in unique_ans]

In [54]:
possible_ans[0]

array([   1,    7,   57,  306,    7, 1618,    6,   10,  121,  363,    4,
         96,   69,   66,   16,    8,   67,   87,   50,  306])

### Transform vectorized documents back into words:

In [55]:
def devectorize(vec_docs, word2ix):
    """ Maps every index sequence in `vec_docs` back to its list of words.
        `word2ix` is the word -> index vocabulary; it is inverted here to do the lookup.
    """
    index_to_word = dict((index, word) for word, index in word2ix.items())
    return [[index_to_word[index] for index in vec_doc] for vec_doc in vec_docs]

In [56]:
documents = devectorize(possible_ans, word2ix)

In [57]:
print (documents[0])

['_UNK_', 'i', 'am', 'maggie', 'i', 'responded', 'to', 'your', 'other', 'ticket', '.', 'let', 'us', 'know', 'if', 'you', 'need', 'more', 'help', 'maggie']


## K-Means++
Now that we have our documents back we will use K-Means++ to group similar answers.

In [58]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans, MiniBatchKMeans
from tqdm import tqdm_notebook as tqdm
import numpy as np
import math

class AnswerClustering(object):
    """ Runs K-Means++ on a set of answers and picks, for every cluster, the answer that is
        closest to the centroid.

        answers:      sequence of answers; entry i corresponds to row i of `matrix`.
        matrix:       document-feature matrix that is clustered (one row per answer).
        vocabulary:   sequence mapping a feature (column) index to its term.
        max_iter:     forwarded to KMeans / MiniBatchKMeans.
        n_init:       forwarded to KMeans / MiniBatchKMeans.
        random_state: forwarded to KMeans / MiniBatchKMeans; set it to get reproducible clusters.
    """
    def __init__(self, answers, matrix, vocabulary, max_iter=300, n_init=10, random_state=None):
        self.answers = answers
        self.matrix = matrix
        self.vocabulary = vocabulary
        self.max_iter = max_iter
        self.n_init = n_init
        self.random_state = random_state

        # number of clusters and fitted estimator of the last clustering() call
        self.k = None
        self.model = None
        # centroid-features matrix containing the centroids values per line
        self.centroids = None
        # document-centroid matrix (kept for compatibility; clustering() does not populate it)
        self.distances_to_centroids = None
        # array where entry i contains the index of the answer closest to the centroid i
        # (-1 when cluster i has no members)
        self.closest_document = None
        # array where entry i contains the centroid that was assigned to i
        self.predictions = None
        # silhouette statistics, filled in by evaluate()
        self.silhouette_avg = None
        self.sample_silhouette_values = None
        self.clusters_avg_silhouette = None
        # list of per-cluster dicts, filled in by dump_data_to_dict()
        self.cluster_data = None

    def clustering(self, n_clusters, minibatch=False):
        """ Runs K-Means++ from the document-feature matrix loaded at initialization time.

            Fits KMeans (or MiniBatchKMeans when `minibatch` is True) with `n_clusters` clusters,
            stores the assignments in `self.predictions` and, for each cluster, stores in
            `self.closest_document` the index of the member answer nearest to its centroid.
        """
        self.k = n_clusters
        if minibatch:
            self.model = MiniBatchKMeans(n_clusters=n_clusters, max_iter=self.max_iter, n_init=self.n_init,
                                         init_size=500, random_state=self.random_state)
        else:
            self.model = KMeans(n_clusters=n_clusters, max_iter=self.max_iter, n_init=self.n_init,
                                random_state=self.random_state)
        self.predictions = self.model.fit_predict(self.matrix)
        self.centroids = self.model.cluster_centers_

        # Silhouette values from a previous run no longer describe these clusters.
        self.silhouette_avg = None
        self.sample_silhouette_values = None
        self.clusters_avg_silhouette = None

        # Integer array so the entries can be used directly as indices; -1 marks an empty cluster.
        self.closest_document = np.full(self.k, -1, dtype=np.int64)
        for i in tqdm(range(self.k)):
            members = np.flatnonzero(self.predictions == i)
            if members.size == 0:
                continue
            # One vectorized call per cluster, restricted to its members, instead of one
            # call per (cluster, answer) pair. argmin keeps the first of any tied members.
            distances = euclidean_distances(self.centroids[i:i + 1], self.matrix[members])[0]
            self.closest_document[i] = members[distances.argmin()]

    def evaluate(self):
        """ Computes the silhouette values of the current clustering.

            Returns a tuple (silhouette_avg, clusters_avg_silhouette, sample_silhouette_values):
            the mean over all samples, the mean per cluster (NaN for a cluster without members)
            and the per-sample values. This gives a perspective into the density and separation
            of the formed clusters.
        """
        self.sample_silhouette_values = silhouette_samples(self.matrix, self.predictions)
        # silhouette_score is the mean of silhouette_samples, so the pairwise distances
        # only need to be computed once.
        self.silhouette_avg = self.sample_silhouette_values.mean()
        self.clusters_avg_silhouette = np.full(self.k, np.nan)
        for i in range(self.k):
            in_cluster = self.predictions == i
            if in_cluster.any():
                self.clusters_avg_silhouette[i] = self.sample_silhouette_values[in_cluster].mean()
        return (self.silhouette_avg, self.clusters_avg_silhouette, self.sample_silhouette_values)

    def find_best_k(self, k_min, k_max, minibatch=False):
        """ Runs K-Means for every K in range(k_min, k_max) to find the one that maximizes the
            silhouette average score, plots the score per K, re-runs the clustering for the best K
            and returns that K.

            Raises ValueError when the range of candidate K values is empty.
        """
        # Imported here (and first) because the notebook never imports pyplot; failing now is
        # better than failing after all the clusterings have been computed.
        import matplotlib.pyplot as plt

        candidate_ks = list(range(k_min, k_max))
        if not candidate_ks:
            raise ValueError("find_best_k needs k_min < k_max, got k_min={} and k_max={}".format(k_min, k_max))

        silhouette_values = np.zeros(len(candidate_ks))
        for position, k in enumerate(tqdm(candidate_ks)):
            self.clustering(k, minibatch)
            silhouette_values[position] = silhouette_score(self.matrix, self.predictions)
        best_k = candidate_ks[int(silhouette_values.argmax())]

        self.clustering(best_k, minibatch)
        plt.plot(candidate_ks, silhouette_values)
        plt.ylim((-1, 1))
        plt.ylabel('Silhouette Clustering Score')
        plt.xlabel('Values of K')
        plt.show()
        return best_k

    def dump_data_to_dict(self):
        """ Builds a list with one dict per cluster, stores it in `self.cluster_data` and returns it.
            (e.g: clusters[i] describes cluster i: its size, id, average silhouette, top-5 keywords,
            member answers and representative answer)
        """
        # The per-cluster silhouette is part of the output; compute it if evaluate() was not run
        # for the current clustering.
        if getattr(self, "clusters_avg_silhouette", None) is None:
            self.evaluate()

        clusters = []
        predictions = np.asarray(self.predictions)
        # feature indices of every centroid sorted by decreasing weight
        order_centroids = self.model.cluster_centers_.argsort()[:, ::-1]
        for label in range(self.k):
            cluster_answers = [self.answers[i] for i in np.flatnonzero(predictions == label)]
            keywords = [self.vocabulary[idx] for idx in order_centroids[label, :5]]

            # A negative index marks a cluster without members: it has no representative.
            closest = int(self.closest_document[label])
            representative = [self.answers[closest]] if closest >= 0 else []

            # Every cluster is exported; no filtering on silhouette score or size happens here.
            cluster_info = {"size": len(cluster_answers), "id": label}
            cluster_info["silhouette_avg"] = self.clusters_avg_silhouette[label].item()
            cluster_info["keywords"] = keywords
            cluster_info["answers"] = cluster_answers
            cluster_info["representative"] = representative
            clusters.append(cluster_info)
        self.cluster_data = clusters
        return self.cluster_data

## Preprocessing
Our documents are prepared to be used by a neural network, but since we plan to cluster the answers with a simple TF-IDF feature extractor, we first need to clean up a few things.

In [59]:
text_docs = [' '.join(sample) for sample in documents]

In [60]:
text_docs[0]

'_UNK_ i am maggie i responded to your other ticket . let us know if you need more help maggie'

In [61]:
import nltk
import re
# NLTK resources used by POSfiltering:
#   punkt                      -> tokenizer models needed by nltk.tokenize.word_tokenize
#   averaged_perceptron_tagger -> model behind nltk.pos_tag
#   universal_tagset           -> mapping used by pos_tag(..., tagset='universal')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

def normalize_texts(docs):
    """ Strips the placeholder tokens _UNK_, _URL_, _EOT_ and _USERID_ from every document.
        Returns a new list; the whitespace around a removed token is left untouched.
    """
    placeholders = (r'_UNK_', r'_URL_', r'_EOT_', r'_USERID_')
    cleaned_docs = []
    for text in docs:
        # applied one after the other, in this order
        for placeholder in placeholders:
            text = re.sub(placeholder, '', text)
        cleaned_docs.append(text)
    return cleaned_docs

def POSfiltering(docs):
    """ Tags every document with the universal POS tagset and keeps only its nouns, verbs,
        adverbs and adjectives. Each filtered document is returned as a space-joined string.
    """
    kept_tags = ("NOUN", "VERB", "ADV", "ADJ")
    print ("Applying POS filtering:")
    filtered_docs = []
    for text in tqdm(docs):
        tagged_tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(text), tagset='universal')
        content_words = [word for word, tag in tagged_tokens if tag in kept_tags]
        filtered_docs.append(" ".join(content_words))
    return filtered_docs

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ricardorei/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ricardorei/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [62]:
norm_text_docs = normalize_texts(text_docs)
pos_text_docs = POSfiltering(norm_text_docs)

Applying POS filtering:


HBox(children=(IntProgress(value=0, max=15612), HTML(value='')))




In [63]:
print (text_docs[0])
print (norm_text_docs[0])
print (pos_text_docs[0])

_UNK_ i am maggie i responded to your other ticket . let us know if you need more help maggie
 i am maggie i responded to your other ticket . let us know if you need more help maggie
i am maggie i responded other ticket let know need more help maggie


### Feature Extraction:
Before running K-Means++ we need to represent our documents in a feature space. We will simply use a TF-IDF feature space.

In [64]:
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, strip_accents='ascii')

In [65]:
doc2vec = vectorizer.fit_transform(pos_text_docs)

In [66]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever this installation provides. .tolist() converts the returned ndarray to plain
# Python strings so that `vocabulary` stays a list.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

In [67]:
vocabulary[20:30]

['absolutely',
 'abstract',
 'abuse',
 'abut',
 'ac',
 'acc',
 'acccount',
 'acceder',
 'accedere',
 'accelerated']

We still have some words that could be stemmed but I believe this is enough for creating well defined clusters.

In [68]:
clustering = AnswerClustering([' '.join(sample) for sample in documents], doc2vec, vocabulary)

In [69]:
# 1000 clusters: one representative per cluster is kept later, so this is also the size of the answer pool.
# minibatch=False selects KMeans rather than MiniBatchKMeans.
clustering.clustering(1000, minibatch=False)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [70]:
print ("Silhouette score for the {}-Means++: {}".format(clustering.k, clustering.evaluate()[0]))

Silhouette score for the 1000-Means++: 0.13219783416168693


In [71]:
json_data = clustering.dump_data_to_dict()

In [72]:
import json
# clustering.cluster_data is the list built by dump_data_to_dict() in the previous cell.
# .item() is called on the average silhouette, presumably to turn a NumPy scalar into a
# plain Python float that json can serialize.
with open("../data/{}/clustering.json".format(dataset), 'w') as output_file:
    json.dump({"clustering_quality": clustering.silhouette_avg.item(), "data": clustering.cluster_data}, output_file)

## Create Answer pool:
Now that we have our data clustered we will create an answer pool with the representative of each cluster. This means that we need to prepare 1000 different answers to be used during inference by our dual encoders.


In [131]:
# Load json file
import json
# The context manager closes the file handle as soon as the content has been parsed.
with open('../data/{}/clustering.json'.format(dataset), 'r') as input_file:
    json_data = json.load(input_file)

In [132]:
ans_pool = [cluster["representative"][0] for cluster in json_data["data"]]

In [133]:
len(ans_pool)

1000

In [134]:
ans_pool[0]

'hi _NAME_ it s perfectly understandable that you re very upset about what s happened . please give us some more time to troubleshoot this issue and i will keep you posted on this as soon as possible . appreciate your patience ! thanks samm'

### Answer pool vectorization:

In [135]:
def vectorize(docs, vocab):
    """ Converts tokenized documents into sequences of vocabulary indices.

        docs:  iterable of token lists.
        vocab: dict mapping a token to its index; must contain "_UNK_" if any token is unknown.

        Tokens equal to a single space are skipped and tokens missing from `vocab` are mapped
        to the index of "_UNK_". Returns a NumPy array: a regular 2-D array when every document
        has the same length, otherwise a 1-D object array holding one list of indices per document.
    """
    vec_docs = []
    for doc in docs:
        vec_doc = []
        for token in doc:
            if token == ' ':
                continue
            try:
                vec_doc.append(vocab[token])
            except KeyError:
                vec_doc.append(vocab["_UNK_"])
        vec_docs.append(vec_doc)

    # Equal-length documents (or no documents at all) form a regular array.
    if len({len(vec_doc) for vec_doc in vec_docs}) <= 1:
        return np.array(vec_docs)

    # np.array() raises on ragged input in recent NumPy versions, so the object array is
    # built explicitly and filled element by element.
    ragged = np.empty(len(vec_docs), dtype=object)
    for position, vec_doc in enumerate(vec_docs):
        ragged[position] = vec_doc
    return ragged

In [136]:
ans_pool_toks = [answer.split(' ') for answer in ans_pool]

In [137]:
print (ans_pool_toks[0])

['hi', '_NAME_', 'it', 's', 'perfectly', 'understandable', 'that', 'you', 're', 'very', 'upset', 'about', 'what', 's', 'happened', '.', 'please', 'give', 'us', 'some', 'more', 'time', 'to', 'troubleshoot', 'this', 'issue', 'and', 'i', 'will', 'keep', 'you', 'posted', 'on', 'this', 'as', 'soon', 'as', 'possible', '.', 'appreciate', 'your', 'patience', '!', 'thanks', 'samm']


In [138]:
ans_pool_vecs = vectorize(ans_pool_toks, word2ix)

In [139]:
print (ans_pool_vecs[0])

[31, 38, 20, 59, 1781, 5603, 29, 8, 44, 153, 1440, 95, 61, 59, 474, 4, 41, 480, 69, 84, 87, 159, 6, 1657, 13, 120, 9, 7, 92, 282, 8, 758, 21, 13, 49, 278, 49, 186, 4, 465, 10, 263, 39, 113, 1357]


In [140]:
# Close the file explicitly so the pickle is completely written to disk when this cell finishes.
with open('../data/{}/tmp/ans_pool.pkl'.format(dataset), 'wb') as ans_pool_file:
    pickle.dump(ans_pool_vecs, ans_pool_file)