# This is a simple exploration of the train labels

* Train a simple Word2Vec model for 2 dimensions and get the average embedding of each of the labels
* Cluster them by DBSCAN
* Visualize each cluster using scatter plot

In [None]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.cluster import KMeans, DBSCAN

import plotly.offline as pyo
import plotly.express as px

In [None]:
# Load train.csv into a DataFrame; later cells read its `cleaned_label` column.
train_df = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
# Bare expression: shown as the notebook cell's output.
train_df

# Form corpus

In [None]:
# Build the corpus from the unique training labels: every distinct
# `cleaned_label` value (cast to str) is split on whitespace into a token list.

train_unique_label = np.unique(train_df.cleaned_label.values.astype('str'))
corpus = [label.split() for label in train_unique_label]

# Word2Vec Train function

In [None]:
def train_w2vec(corpus, dim, epochs):
    """Fit a gensim Word2Vec model on ``corpus`` and return it.

    Args:
        corpus: the tokenised sentences handed to ``build_vocab`` and
            ``train`` (in this notebook, one token list per label).
        dim: passed to Word2Vec as ``vector_size``.
        epochs: number of training epochs passed to ``train``.

    Returns:
        The trained ``word2vec.Word2Vec`` instance.
    """
    # min_count=1 keeps every token, even those that occur only once.
    w2v = word2vec.Word2Vec(min_count=1, vector_size=dim)
    w2v.build_vocab(corpus)
    w2v.train(corpus, total_examples=w2v.corpus_count, epochs=epochs)
    return w2v

In [None]:
from mxnet import nd
from mxnet.contrib import text

def download_model():
    """Print the available pretrained GloVe file names and load one of them.

    Returns:
        The embedding object created by ``text.embedding.create`` for the
        ``glove.6B.50d.txt`` file.
    """
    available_files = text.embedding.get_pretrained_file_names('glove')
    print(available_files)
    return text.embedding.create(
        'glove', pretrained_file_name='glove.6B.50d.txt')

# Pretrained GloVe embedding; passed to get_avg_label_embedding further down.
model = download_model()

# Function to Get average embeddings

In [None]:
def get_avg_label_embedding(corpus, dim, model):
    """Return one averaged word embedding per label.

    Each label is a sequence of tokens; its embedding is the arithmetic mean
    of the token vectors returned by ``model``.

    Args:
        corpus: iterable of labels, each label being a list of token strings.
        dim: length of the embedding vectors produced by ``model``.
        model: object exposing ``get_vecs_by_tokens(list_of_tokens)`` whose
            result has an ``asnumpy()`` method (e.g. an mxnet token
            embedding).

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(len(corpus), dim)``. A label
        with no tokens gets an all-zero row instead of the NaN row that a
        0/0 division would produce, and an empty corpus yields shape
        ``(0, dim)``.
    """
    corpus = list(corpus)
    label_embedding = np.zeros(shape=(len(corpus), dim))

    for row, label in enumerate(corpus):
        if not label:
            # Nothing to average: keep the zero vector rather than dividing
            # by zero and propagating NaN into later clustering steps.
            continue

        # One lookup per label instead of one per token.
        vectors = model.get_vecs_by_tokens(list(label)).asnumpy()
        vectors = vectors.reshape(len(label), -1)
        # Accumulate in float64, as the per-token summation did.
        label_embedding[row] = vectors.sum(axis=0, dtype=np.float64) / len(label)

    return label_embedding

# Train and get embedding

In [None]:
# Average the pretrained embedding vectors (`model`, 50 dimensions) of the
# tokens of every label; no Word2Vec model is trained in this cell.
dim = 50
label_embedding = get_avg_label_embedding(corpus, dim, model)

# Bare expression: shown as the notebook cell's output.
label_embedding.shape

In [None]:
def euclidean_distance(A, B, axis=None):
    """Euclidean distance between ``A`` and ``B``.

    ``A`` and ``B`` are arrays or matrices that can be subtracted from each
    other. The squared differences are summed along ``axis`` (over all
    elements when ``axis`` is None) before the square root is taken.
    """
    difference = A - B
    # np.power stays element-wise for np.matrix operands, unlike ``**``.
    squared = np.power(difference, 2)
    total = squared.sum(axis=axis)
    return np.sqrt(total)

def rand_k_centroids(data, k=3):
    """Draw ``k`` random centroids inside the bounding box of ``data``.

    Args:
        data: 2-D array or matrix with one point per row.
        k: number of centroids to generate.

    Returns:
        A ``(k, n)`` array (``n`` = number of columns of ``data``); each
        coordinate is drawn with ``np.random.rand`` between the minimum and
        maximum of the corresponding column.

    Raises:
        ValueError: if ``data`` is not 2-D, or has no rows (raised by
            ``np.min`` on an empty axis).
    """
    _, n = np.shape(data)

    min_j = np.min(data, axis=0)
    rang_j = np.max(data, axis=0) - min_j

    # np.multiply (not ``*``) keeps the product element-wise even when
    # ``data`` is an np.matrix; broadcasting stretches the per-column minimum
    # and range across the k rows, so no explicit tiling is needed.
    return min_j + np.multiply(rang_j, np.random.rand(k, n))

def kmeans(data, k, calculate_distance=euclidean_distance, create_centroids=rand_k_centroids):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Every point is assigned to its nearest centroid, the centroids are
    recomputed as the mean of their points, and the two steps repeat until
    no assignment changes.

    K-means only converges to a local minimum and the result depends on the
    initial centroids. Two refinements motivate bisecting k-means:
        1. splitting a cluster - either the one with the largest SSE (sum of
           squared errors), or the one whose split minimises the total SSE;
        2. merging two clusters - either the two nearest centroids, or the
           pair whose merge minimises the total SSE.

    Args:
        data: 2-D array or matrix, one point per row.
        k: number of clusters.
        calculate_distance: ``f(point, centroid)`` returning a scalar distance.
        create_centroids: ``f(data, k)`` returning the ``k`` initial
            centroids as a 2-D, row-indexable, writable array.

    Returns:
        ``(centroids, cluster_assignment)``. ``centroids`` is the object
        returned by ``create_centroids``, updated in place; the row of a
        cluster that ended up empty is NaN. ``cluster_assignment`` is an
        ``(m, 2)`` ``np.matrix``: column 0 is the index of the assigned
        centroid, column 1 the squared distance to it. A point whose distance
        to every centroid is NaN keeps index -2.
    """
    m, n = np.shape(data)
    # Cluster assignment matrix: [index of assigned centroid, squared error].
    # Filled with -1 so that the first pass always registers as a change.
    # np.asmatrix replaces np.mat, which no longer exists in NumPy >= 2.0.
    cluster_assignment = np.asmatrix(-np.ones((m, 2)))
    centroids = create_centroids(data, k)
    changed = True

    while changed:
        changed = False

        for i in range(m):  # assign each point to its closest centroid
            min_distance = np.nan
            min_index = -2

            for j in range(k):
                distance = calculate_distance(data[i, :], centroids[j, :])
                # A NaN distance comes from the centroid of an empty cluster;
                # it must never win, not even as the first candidate.
                if np.isnan(distance):
                    continue
                if min_index == -2 or distance < min_distance:
                    min_distance = distance
                    min_index = j

            if cluster_assignment[i, 0] != min_index:
                changed = True
            cluster_assignment[i, :] = (min_index, min_distance ** 2)

        assigned_to = np.asarray(cluster_assignment[:, 0]).ravel()
        for centre in range(k):  # recalculate centroids
            points_in_cluster = data[np.nonzero(assigned_to == centre)[0]]
            if len(points_in_cluster) == 0:
                # Empty cluster: mark its centroid as NaN explicitly (callers
                # test for NaN) instead of taking the mean of an empty slice.
                centroids[centre, :] = np.full(n, np.nan)
            else:
                centroids[centre, :] = np.mean(points_in_cluster, axis=0)
    return centroids, cluster_assignment

def bisecting_kmeans(data, k, calculate_distance=euclidean_distance):
    """Split ``data`` into up to ``k`` clusters by repeated 2-means bisection.

    Starting from a single cluster, each round runs ``kmeans(..., 2, ...)`` on
    every current cluster and keeps the one split that gives the lowest total
    SSE (sum of squared errors). The procedure is greedy: it does not
    guarantee a globally optimal clustering.

    Args:
        data: 2-D array-like (ndarray, np.matrix or nested lists), one point
            per row.
        k: desired number of clusters. Fewer are returned when no remaining
            cluster can be split (for example clusters with a single point).
        calculate_distance: ``f(a, b)`` returning the scalar distance between
            two points given as 1-D arrays.

    Returns:
        ``(centroids, cluster_assignment)`` as ``np.matrix`` objects:
        ``centroids`` has one row per cluster, ``cluster_assignment`` is
        ``(m, 2)`` with the cluster index in column 0 and the squared
        distance to that cluster's centroid in column 1.

    Raises:
        ValueError: if ``data`` is not 2-D.
    """
    # Work on a plain 2-D float array internally. With an ndarray input the
    # previous ``.tolist()[0]`` calls picked a single scalar instead of a
    # centroid row, so the returned centroids were wrong.
    data = np.asarray(data, dtype=float)
    m, _ = data.shape

    centroids = [data.mean(axis=0)]  # list with one centroid

    cluster_assignment = np.zeros((m, 2))
    for i in range(m):  # calculate initial SSE
        cluster_assignment[i, 1] = calculate_distance(centroids[0], data[i, :]) ** 2

    while len(centroids) < k:
        # Best split found this round:
        # (total SSE, cluster index, sub-centroids, sub-assignment, row mask)
        best = None
        for i in range(len(centroids)):
            members = cluster_assignment[:, 0] == i
            subcluster = data[members]
            if len(subcluster) < 2:
                # Fewer than two points can never be split into two clusters.
                continue

            subcentroids, subcluster_assignment = kmeans(subcluster, 2, calculate_distance)
            if np.isnan(np.asarray(subcentroids, dtype=float)).any():
                # One of the two clusters came out empty: retry once with
                # fresh random centroids, then give up on this cluster.
                subcentroids, subcluster_assignment = kmeans(subcluster, 2, calculate_distance)
                if np.isnan(np.asarray(subcentroids, dtype=float)).any():
                    continue

            subcentroids = np.asarray(subcentroids, dtype=float)
            subcluster_assignment = np.asarray(subcluster_assignment, dtype=float)

            total_sse = subcluster_assignment[:, 1].sum() + cluster_assignment[~members, 1].sum()
            if best is None or total_sse < best[0]:
                best = (total_sse, i, subcentroids, subcluster_assignment, members)

        if best is None:
            break  # no suitable split in centroids
        lowest_SSE, best_split_centre, best_subcentroids, best_subcluster_assignment, members = best
        print("{} centroids SSE: {}".format(len(centroids)+1, lowest_SSE))

        # Sub-cluster 0 keeps the index of the cluster being split;
        # sub-cluster 1 becomes a new cluster appended at the end.
        relabelled = best_subcluster_assignment.copy()
        relabelled[:, 0] = np.where(best_subcluster_assignment[:, 0] == 1,
                                    len(centroids), best_split_centre)
        cluster_assignment[members] = relabelled

        centroids[best_split_centre] = best_subcentroids[0]  # replace a centroid with two better centroids
        centroids.append(best_subcentroids[1])

    # np.asmatrix replaces np.mat (removed in NumPy 2.0); matrices are still
    # returned so that existing callers indexing them as 2-D keep working.
    return np.asmatrix(np.vstack(centroids)), np.asmatrix(cluster_assignment)

# Cluster the data using kmeans

In [None]:
# Split the label embeddings into up to 100 clusters. `ca` holds one row per
# label: [cluster index, squared distance to that cluster's centroid].
centroids, ca = bisecting_kmeans(label_embedding, 100)

print(f"centroids : {centroids}[{centroids.shape}], ca: {ca}[{ca.shape}]")

# Visualize
-> Hover over the points to see the name of the dataset

In [None]:
# Set notebook mode to work offline
pyo.init_notebook_mode()

# Scatter plot of the labels, coloured by cluster index.
# Only the first two of the embedding dimensions are used as x / y.
# `ca` is an np.matrix, so `reshape(-1)` yields a 1 x m matrix and
# `.tolist()[0]` extracts the flat list of cluster indices.
trace0 = px.scatter(x = label_embedding[:, 0], 
                    y = label_embedding[:, 1], 
                    color = ca[:, 0].astype(np.int32).reshape(-1).tolist()[0], 
                    hover_name=train_unique_label
)

trace0.show()

With a quick hover over the points, it looks like the clusters are good and similar domains are grouped together, with a few data points that are not clustered correctly. However, you can play around with it by trying:

* Different epochs for Word2Vec, (tried)
* Train for more Dimensions and reduce them using t-sne or PCA
* Play with different cluster distance metrics for DBSCAN clusters or different cluster algorithms (tried)

The label embedding is the average of all the word embeddings in the label. I don't know whether this average is a good representation of the label.

If the answer is yes, I think the conclusion is that the labels cannot be clustered.