In [7]:
!git clone https://github.com/shpotes/clustering
!pip install datasets optuna -q
%cd clustering

Cloning into 'clustering'...
remote: Enumerating objects: 33, done.
remote: Counting objects: 100% (33/33), done.
remote: Compressing objects: 100% (25/25), done.
remote: Total 184 (delta 17), reused 19 (delta 8), pack-reused 151
Receiving objects: 100% (184/184), 161.85 MiB | 37.55 MiB/s, done.
Resolving deltas: 100% (60/60), done.
Checking out files: 100% (38/38), done.
/content/clustering/clustering


In [29]:
import pathlib
from datasets import load_dataset
import pandas as pd
import numpy as np
import optuna
from src import KMeans, KMeansPlusPlus
from sklearn.metrics import mutual_info_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [24]:
dataset = load_dataset('rotten_tomatoes')
y_true = np.array(dataset['validation']['label'])

Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/9198dbc50858df8bdb0d5f18ccaf33125800af96ad8434bc8b829918c987ee8a)


# Word2vec

In [10]:
data_dir = pathlib.Path('repr')
word2vec = list((data_dir / 'word2vec').glob('*validation.npy'))

models = {fname.stem: np.load(fname) for fname in word2vec}
models_name = list(models.keys())

In [38]:
def objective_david(trial):
    k = trial.suggest_int('k', 2, 8)
    seed = trial.suggest_int('seed', 1, 1000)
    name = trial.suggest_categorical('model', models_name)
    
    sub = KMeansPlusPlus(n_clusters=8, random_state=seed, num_seeds=100)
    y_pred = sub.fit_predict(models[name])

    return davies_bouldin_score(models[name], y_pred)

In [45]:
study_david = optuna.create_study(direction='minimize')
study_david.optimize(objective_david, n_trials=1000)

[I 2020-09-22 02:02:55,861] A new study created in memory with name: no-name-4128626c-1dbf-40ec-a493-21033dabb160
[I 2020-09-22 02:02:56,199] Trial 0 finished with value: 3.2897309969601487 and parameters: {'k': 6, 'seed': 617, 'model': 'en_core_web_lg_validation'}. Best is trial 0 with value: 3.2897309969601487.
[I 2020-09-22 02:02:56,499] Trial 1 finished with value: 3.194234196100649 and parameters: {'k': 7, 'seed': 952, 'model': 'en_core_web_md_validation'}. Best is trial 1 with value: 3.194234196100649.
[I 2020-09-22 02:02:56,582] Trial 2 finished with value: 2.9387820560507922 and parameters: {'k': 2, 'seed': 862, 'model': 'en_core_web_sm_validation'}. Best is trial 2 with value: 2.9387820560507922.
[I 2020-09-22 02:02:56,679] Trial 3 finished with value: 2.964790491922224 and parameters: {'k': 7, 'seed': 844, 'model': 'en_core_web_sm_validation'}. Best is trial 2 with value: 2.9387820560507922.
[I 2020-09-22 02:02:56,776] Trial 4 finished with value: 2.929911079391787 and parame

In [40]:
def objective_silhouette(trial):
    k = trial.suggest_int('k', 2, 8)
    seed = trial.suggest_int('seed', 1, 1000)
    name = trial.suggest_categorical('model', models_name)
    
    sub = KMeansPlusPlus(n_clusters=8, random_state=seed, num_seeds=100)
    y_pred = sub.fit_predict(models[name])

    return silhouette_score(models[name], y_pred)

In [42]:
study_silhouette = optuna.create_study(direction='maximize')
study_silhouette.optimize(objective_silhouette, n_trials=1000)

[I 2020-09-22 01:51:04,178] A new study created in memory with name: no-name-f2b69ebe-9d5a-43cb-9623-48bacfc2b84f
[I 2020-09-22 01:51:04,297] Trial 0 finished with value: 0.027063345536589622 and parameters: {'k': 8, 'seed': 61, 'model': 'en_core_web_sm_validation'}. Best is trial 0 with value: 0.027063345536589622.
[I 2020-09-22 01:51:04,607] Trial 1 finished with value: 0.011948038823902607 and parameters: {'k': 7, 'seed': 753, 'model': 'en_core_web_lg_validation'}. Best is trial 0 with value: 0.027063345536589622.
[I 2020-09-22 01:51:04,949] Trial 2 finished with value: 0.006970513612031937 and parameters: {'k': 4, 'seed': 158, 'model': 'en_core_web_lg_validation'}. Best is trial 0 with value: 0.027063345536589622.
[I 2020-09-22 01:51:05,228] Trial 3 finished with value: 0.012445405125617981 and parameters: {'k': 8, 'seed': 967, 'model': 'en_core_web_md_validation'}. Best is trial 0 with value: 0.027063345536589622.
[I 2020-09-22 01:51:05,553] Trial 4 finished with value: 0.01769112

In [43]:
def objective_calinski(trial):
    k = trial.suggest_int('k', 2, 8)
    seed = trial.suggest_int('seed', 1, 1000)
    name = trial.suggest_categorical('model', models_name)
    
    sub = KMeansPlusPlus(n_clusters=8, random_state=seed, num_seeds=100)
    y_pred = sub.fit_predict(models[name])

    return calinski_harabasz_score(models[name], y_pred)

In [44]:
study_calinski = optuna.create_study(direction='maximize')
study_calinski.optimize(objective_calinski, n_trials=1000)

[I 2020-09-22 01:58:06,633] A new study created in memory with name: no-name-e9cd4a61-466e-49dc-861b-ec4d776cdeed
[I 2020-09-22 01:58:06,869] Trial 0 finished with value: 40.248449676912635 and parameters: {'k': 3, 'seed': 982, 'model': 'en_core_web_lg_validation'}. Best is trial 0 with value: 40.248449676912635.
[I 2020-09-22 01:58:07,124] Trial 1 finished with value: 39.88634054871643 and parameters: {'k': 3, 'seed': 659, 'model': 'en_core_web_lg_validation'}. Best is trial 0 with value: 40.248449676912635.
[I 2020-09-22 01:58:07,225] Trial 2 finished with value: 41.90969418011952 and parameters: {'k': 7, 'seed': 95, 'model': 'en_core_web_sm_validation'}. Best is trial 2 with value: 41.90969418011952.
[I 2020-09-22 01:58:07,489] Trial 3 finished with value: 40.34200086077329 and parameters: {'k': 3, 'seed': 773, 'model': 'en_core_web_lg_validation'}. Best is trial 2 with value: 41.90969418011952.
[I 2020-09-22 01:58:07,570] Trial 4 finished with value: 41.8967784997461 and parameters

# Transformer based models

In [47]:
transformers = list((data_dir / 'transformers').glob('*validation.npy'))

models = {fname.stem: np.load(fname) for fname in transformers}
models_name = list(models.keys())

models_name

['roberta-large_validation',
 'roberta-base_validation',
 'bert-base-uncased_validation',
 'bert-large-uncased_validation']

In [48]:
def objective_david(trial):
    k = trial.suggest_int('k', 2, 8)
    seed = trial.suggest_int('seed', 1, 1000)
    name = trial.suggest_categorical('model', models_name)
    
    sub = KMeansPlusPlus(n_clusters=8, random_state=seed, num_seeds=100)
    y_pred = sub.fit_predict(models[name])

    return davies_bouldin_score(models[name], y_pred)

In [49]:
study_david = optuna.create_study(direction='minimize')
study_david.optimize(objective_david, n_trials=1000)

[I 2020-09-22 02:08:15,061] A new study created in memory with name: no-name-e6cff9a7-121c-4ac4-9b12-29cdf11e740a
[I 2020-09-22 02:08:16,840] Trial 0 finished with value: 1.4411720392162504 and parameters: {'k': 7, 'seed': 224, 'model': 'bert-base-uncased_validation'}. Best is trial 0 with value: 1.4411720392162504.
[I 2020-09-22 02:08:17,203] Trial 1 finished with value: 1.9624709229899657 and parameters: {'k': 5, 'seed': 58, 'model': 'roberta-base_validation'}. Best is trial 0 with value: 1.4411720392162504.
[I 2020-09-22 02:08:18,991] Trial 2 finished with value: 2.4384503847171994 and parameters: {'k': 6, 'seed': 940, 'model': 'roberta-large_validation'}. Best is trial 0 with value: 1.4411720392162504.
[I 2020-09-22 02:08:19,266] Trial 3 finished with value: 1.966066764585782 and parameters: {'k': 4, 'seed': 317, 'model': 'roberta-base_validation'}. Best is trial 0 with value: 1.4411720392162504.
[I 2020-09-22 02:08:20,461] Trial 4 finished with value: 1.266255750886356 and paramet

In [50]:
def objective_silhouette(trial):
    k = trial.suggest_int('k', 2, 8)
    seed = trial.suggest_int('seed', 1, 1000)
    name = trial.suggest_categorical('model', models_name)
    
    sub = KMeansPlusPlus(n_clusters=8, random_state=seed, num_seeds=100)
    y_pred = sub.fit_predict(models[name])

    return silhouette_score(models[name], y_pred)

In [51]:
study_silhouette = optuna.create_study(direction='maximize')
study_silhouette.optimize(objective_silhouette, n_trials=1000)

[I 2020-09-22 02:27:25,106] A new study created in memory with name: no-name-67f1db5a-ed81-428b-b0d4-5ab2da15f8fb
[I 2020-09-22 02:27:25,577] Trial 0 finished with value: 0.12089690566062927 and parameters: {'k': 2, 'seed': 519, 'model': 'roberta-base_validation'}. Best is trial 0 with value: 0.12089690566062927.
[I 2020-09-22 02:27:26,917] Trial 1 finished with value: 0.2574377655982971 and parameters: {'k': 2, 'seed': 299, 'model': 'bert-large-uncased_validation'}. Best is trial 1 with value: 0.2574377655982971.
[I 2020-09-22 02:27:27,429] Trial 2 finished with value: 0.15312862396240234 and parameters: {'k': 4, 'seed': 174, 'model': 'bert-base-uncased_validation'}. Best is trial 1 with value: 0.2574377655982971.
[I 2020-09-22 02:27:27,839] Trial 3 finished with value: 0.12305694073438644 and parameters: {'k': 3, 'seed': 571, 'model': 'roberta-base_validation'}. Best is trial 1 with value: 0.2574377655982971.
[I 2020-09-22 02:27:28,354] Trial 4 finished with value: 0.1330704391002655

In [52]:
def objective_calinski(trial):
    k = trial.suggest_int('k', 2, 8)
    seed = trial.suggest_int('seed', 1, 1000)
    name = trial.suggest_categorical('model', models_name)
    
    sub = KMeansPlusPlus(n_clusters=8, random_state=seed, num_seeds=100)
    y_pred = sub.fit_predict(models[name])

    return calinski_harabasz_score(models[name], y_pred)

In [53]:
study_calinski = optuna.create_study(direction='maximize')
study_calinski.optimize(objective_calinski, n_trials=1000)

[I 2020-09-22 02:48:03,459] A new study created in memory with name: no-name-03893648-60be-449e-8cf5-d066a43cc784
[I 2020-09-22 02:48:03,790] Trial 0 finished with value: 95.98771491006711 and parameters: {'k': 8, 'seed': 883, 'model': 'roberta-large_validation'}. Best is trial 0 with value: 95.98771491006711.
[I 2020-09-22 02:48:04,102] Trial 1 finished with value: 260.0532233448073 and parameters: {'k': 4, 'seed': 818, 'model': 'roberta-base_validation'}. Best is trial 1 with value: 260.0532233448073.
[I 2020-09-22 02:48:04,425] Trial 2 finished with value: 260.4954405623195 and parameters: {'k': 3, 'seed': 880, 'model': 'roberta-base_validation'}. Best is trial 2 with value: 260.4954405623195.
[I 2020-09-22 02:48:04,695] Trial 3 finished with value: 95.08843268690481 and parameters: {'k': 6, 'seed': 893, 'model': 'roberta-large_validation'}. Best is trial 2 with value: 260.4954405623195.
[I 2020-09-22 02:48:04,964] Trial 4 finished with value: 95.30907097752255 and parameters: {'k':