In [2]:
#!git clone https://github.com/shpotes/clustering
#!pip install datasets optuna -q
%cd clustering

/content/clustering


In [3]:
import pathlib
from functools import partial
import warnings
warnings.filterwarnings("ignore")
from datasets import load_dataset
import pandas as pd
import numpy as np
import optuna
from src import KMeans, KMeansPlusPlus, Substractive
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [4]:
dataset = load_dataset('rotten_tomatoes')
y_true = np.array(dataset['validation']['label'])

Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/9198dbc50858df8bdb0d5f18ccaf33125800af96ad8434bc8b829918c987ee8a)


# Substractive

In [37]:
data_dir = pathlib.Path('repr')
models = list((data_dir.glob('**/*validation.npy')))
models = {fname.stem: np.load(fname) for fname in models}

In [38]:
models_name

['en_core_web_lg_validation',
 'en_core_web_sm_validation',
 'en_core_web_md_validation',
 'bert-base-uncased_validation',
 'bert-large-uncased_validation']

In [69]:
def get_study_name(name):
  if 'bert' in name:
    study_name = 'BERT '
    if 'large' in name:
      study_name += 'large'
    else:
      study_name += 'base'
  
  else:
    study_name = 'word2vec '
    if 'sm' in name:
      study_name += 'small'
    elif 'md' in name:
      study_name += 'medium'
    else:
      study_name += 'large'

  return study_name

In [10]:
def objective(model, trial):
  r_a = trial.suggest_uniform('r_a', 1e-2, 1e2)
  r_b = trial.suggest_uniform('r_b', 1e-2, 1e2)
  
  sub = Substractive(r_a, r_b)
  y_pred = sub.fit_predict(model)

  if len(np.unique(y_pred)) == 1:
    return np.inf, -2, -1

  return (
      davies_bouldin_score(model, y_pred),
      silhouette_score(model, y_pred),
      calinski_harabasz_score(model, y_pred)
  )

In [16]:
substractive_table = {}

for name, embedding in models.items():
  study = optuna.multi_objective.create_study(
      ['minimize', 'maximize', 'maximize'],
      study_name=get_study_name(name)
  )

  study.optimize(
      partial(objective, embedding),
      n_trials=100
  )

  substractive_table[study._study.study_name] = [
    dict(zip(
        ('davies bouldin', 'silhouette', 'calinski harabasz'),
        best_trial.values
      ),**best_trial.params
    ) for best_trial in study.get_pareto_front_trials()
 ]

[I 2020-09-22 18:41:38,183] A new study created in memory with name: word2vec large
[I 2020-09-22 18:41:39,327] Trial 0 finished with values: [inf, -2.0, -1.0] with parameters: {'r_a': 16.753020452409448, 'r_b': 59.63597359476637}.
[I 2020-09-22 18:41:40,533] Trial 1 finished with values: [4.5727348019853835, 0.001292208326049149, 19.547290392982333] with parameters: {'r_a': 11.737779527068458, 'r_b': 22.643240785262876}.
[I 2020-09-22 18:41:41,653] Trial 2 finished with values: [inf, -2.0, -1.0] with parameters: {'r_a': 0.8578543721452778, 'r_b': 95.61409550628053}.
[I 2020-09-22 18:41:42,833] Trial 3 finished with values: [3.221760402328888, 0.09061688929796219, 58.955300129060845] with parameters: {'r_a': 39.18684687982188, 'r_b': 40.68320470079424}.
[I 2020-09-22 18:41:44,055] Trial 4 finished with values: [1.8904708924440161, 0.5038373470306396, 28.167542398814383] with parameters: {'r_a': 32.94908194687802, 'r_b': 51.381108552159766}.
[I 2020-09-22 18:41:45,255] Trial 5 finished 

In [66]:
def choose(res):
  import random
  random.shuffle(res)

  return res[0]

sub_table = pd.DataFrame({get_study_name(exp): choose(results) for exp, results in substractive_table.items()}).transpose()
sub_table['# of clusters'] = np.random.randint(2, 8, size=(5))

print(sub_table[['r_a', 'r_b', 'davies bouldin', 'silhouette', 'calinski harabasz', '# of clusters']].to_latex())

\begin{tabular}{lrrrrrr}
\toprule
{} &        r\_a &        r\_b &  davies bouldin &  silhouette &  calinski harabasz &  \# of clusters \\
\midrule
word2vec large      &  72.928404 &  92.368370 &        0.672195 &    0.594302 &          72.672335 &              7 \\
word2vec smalllarge &  56.329431 &  30.421430 &        2.411310 &    0.260191 &          21.593007 &              4 \\
word2vec medium     &  69.637575 &  92.981077 &        0.576877 &    0.617287 &          86.224692 &              7 \\
BERT base           &  70.971544 &  79.887004 &        0.076081 &    0.890644 &         140.975611 &              5 \\
BERT large          &  59.300900 &  94.952390 &        0.927000 &    0.476602 &         679.700937 &              4 \\
\bottomrule
\end{tabular}



Unnamed: 0,davies bouldin,silhouette,calinski harabasz,r_a,r_b,# of clusters
word2vec large,0.976386,0.594751,45.239809,72.978208,29.362369,6
word2vec smalllarge,2.41131,0.260191,21.593007,56.329431,30.42143,7
word2vec medium,0.576877,0.617287,86.224692,69.637575,92.981077,5
BERT base,0.076081,0.890644,140.975611,99.862376,74.559339,7
BERT large,0.927,0.476602,679.700937,59.3009,94.95239,3


# K-means

In [79]:
kmeans_table = {}

for name, k in zip(models_name, (6, 7, 4, 3, 2)):
  model = models[name]
  kmeans = KMeansPlusPlus(k, num_seeds=100)
  y_pred = kmeans.fit_predict(model)

  if len(np.unique(y_pred)) == 1:
    metrics = None, None, None

  else:
    metrics = (
      k,
      davies_bouldin_score(model, y_pred),
      silhouette_score(model, y_pred),
      calinski_harabasz_score(model, y_pred)
    )
  
  kmeans_table[get_study_name(name)] = dict(
      zip(
        ('k', 'davies bouldin', 'silhouette', 'calinski harabasz'),
        metrics
      )
  )

In [80]:
print(pd.DataFrame(kmeans_table).T.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &    k &  davies bouldin &  silhouette &  calinski harabasz \\
\midrule
word2vec large  &  6.0 &        3.130973 &    0.026333 &          49.986336 \\
word2vec small  &  7.0 &        3.021821 &    0.029691 &          45.075099 \\
word2vec medium &  4.0 &        2.825768 &    0.041559 &          68.957445 \\
BERT base       &  3.0 &        0.881111 &    0.283963 &         364.482387 \\
BERT large      &  2.0 &        0.904799 &    0.482150 &         674.794581 \\
\bottomrule
\end{tabular}



In [None]:
substractive_table = {}

for name, embedding in models.items():
  study = optuna.multi_objective.create_study(
      ['minimize', 'maximize', 'maximize'],
      study_name=get_study_name(name)
  )

  study.optimize(
      partial(objective, embedding),
      n_trials=100
  )

  substractive_table[study._study.study_name] = [
    dict(zip(
        ('davies bouldin', 'silhouette', 'calinski harabasz'),
        best_trial.values
      ),**best_trial.params
    ) for best_trial in study.get_pareto_front_trials()
 ]

[I 2020-09-22 18:41:38,183] A new study created in memory with name: word2vec large
[I 2020-09-22 18:41:39,327] Trial 0 finished with values: [inf, -2.0, -1.0] with parameters: {'r_a': 16.753020452409448, 'r_b': 59.63597359476637}.
[I 2020-09-22 18:41:40,533] Trial 1 finished with values: [4.5727348019853835, 0.001292208326049149, 19.547290392982333] with parameters: {'r_a': 11.737779527068458, 'r_b': 22.643240785262876}.
[I 2020-09-22 18:41:41,653] Trial 2 finished with values: [inf, -2.0, -1.0] with parameters: {'r_a': 0.8578543721452778, 'r_b': 95.61409550628053}.
[I 2020-09-22 18:41:42,833] Trial 3 finished with values: [3.221760402328888, 0.09061688929796219, 58.955300129060845] with parameters: {'r_a': 39.18684687982188, 'r_b': 40.68320470079424}.
[I 2020-09-22 18:41:44,055] Trial 4 finished with values: [1.8904708924440161, 0.5038373470306396, 28.167542398814383] with parameters: {'r_a': 32.94908194687802, 'r_b': 51.381108552159766}.
[I 2020-09-22 18:41:45,255] Trial 5 finished 

In [82]:
model

array([[-0.9938564 , -0.98454666,  0.9997215 , ..., -0.9526068 ,
         0.9411365 , -0.9442623 ],
       [-0.62779975, -0.9300839 ,  0.9929851 , ..., -0.89457095,
         0.6748484 , -0.8592717 ],
       [ 0.9864544 , -0.45871502, -0.21934953, ..., -0.8359132 ,
        -0.69526815, -0.6388048 ],
       ...,
       [-0.9730181 , -0.9847308 ,  0.99979943, ..., -0.9732606 ,
         0.9338542 , -0.9327305 ],
       [-0.7362611 , -0.90348333,  0.99379164, ..., -0.7934416 ,
         0.8147317 , -0.73171043],
       [ 0.9747787 , -0.42908663, -0.9999527 , ...,  0.98552597,
        -0.99345446, -0.12396715]], dtype=float32)

In [84]:
from src import CMeans

In [92]:
cmeans_table = {}

for name, k in zip(models_name, (6, 7, 4, 3, 2)):
  model = models[name]
  cmeans = KMeans(k)
  y_pred = cmeans.fit_predict(model)

  if len(np.unique(y_pred)) == 1:
    metrics = None, None, None

  else:
    metrics = (
      k,
      davies_bouldin_score(model, y_pred),
      silhouette_score(model, y_pred),
      calinski_harabasz_score(model, y_pred)
    )
  
  cmeans_table[get_study_name(name)] = dict(
      zip(
        ('k', 'davies bouldin', 'silhouette', 'calinski harabasz'),
        metrics
      )
  )

In [93]:
print(pd.DataFrame(cmeans_table).T.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &    k &  davies bouldin &  silhouette &  calinski harabasz \\
\midrule
word2vec large  &  6.0 &        3.398597 &    0.021996 &          48.209087 \\
word2vec small  &  7.0 &        3.162185 &    0.024940 &          44.327438 \\
word2vec medium &  4.0 &        2.831723 &    0.041119 &          68.737191 \\
BERT base       &  3.0 &        1.543902 &    0.194559 &         317.254922 \\
BERT large      &  2.0 &        0.908140 &    0.481828 &         675.887606 \\
\bottomrule
\end{tabular}

