In [None]:
import sklearn

import weaviate
from weaviate.classes.config import Configure
import weaviate.classes.config as wc
import weaviate.classes.query as wq
from weaviate.classes.query import MetadataQuery, Filter
import numpy as np
from tqdm.auto import tqdm
import faiss
import torch
from sklearn.cluster import MiniBatchKMeans
import json
from collections import defaultdict, Counter

In [None]:
# Open a connection to a locally running Weaviate instance using the client's
# default local settings; the connection is closed in the notebook's last cell.
client = weaviate.connect_to_local()

In [None]:
# Handle to the "Triples" collection; used below both to read every object
# (with its vector) and to write the cluster assignments back.
triples = client.collections.get("Triples")

In [None]:
# Stream every object out of the collection, keeping three parallel lists:
# the object's uuid, its "text" property and its "default" vector.
all_ids = []
all_texts = []
all_vectors = []
count = 0  # stays 0 when the collection is empty
for count, obj in enumerate(tqdm(triples.iterator(include_vector=True)), start=1):
    all_ids.append(obj.uuid)
    all_texts.append(obj.properties["text"])
    all_vectors.append(obj.vector["default"])

In [None]:
# Stack the collected vectors into one array (one row per object) so it can be
# handed to the clustering cells below.
x = np.asarray(all_vectors)

In [None]:
# 50k clusters: mini-batch k-means over the vectors in `x`.
# NOTE(review): this run uses max_iter=5 while the 100k/200k runs use 10 --
# confirm the difference is intentional.
kmeans2 = MiniBatchKMeans(
    n_clusters=50_000,
    random_state=42,
    batch_size=256 * 32,
    max_iter=5,
    n_init="auto",
).fit(x)

In [None]:
# 100k clusters: mini-batch k-means over the vectors in `x`.
kmeans3 = MiniBatchKMeans(
    n_clusters=100_000,
    random_state=42,
    batch_size=256 * 32,
    max_iter=10,
    n_init="auto",
).fit(x)

In [None]:
# 200k clusters: mini-batch k-means over the vectors in `x`.
kmeans4 = MiniBatchKMeans(
    n_clusters=200_000,
    random_state=42,
    batch_size=256 * 32,
    max_iter=10,
    n_init="auto",
).fit(x)

In [None]:
# Persist the 50k-cluster centroids as a JSON list of coordinate lists.
with open("data/clusters/50k.json", mode="w") as centroid_file:
    json.dump(obj=kmeans2.cluster_centers_.tolist(), fp=centroid_file, indent=1)

In [None]:
# Persist the 100k-cluster centroids as a JSON list of coordinate lists.
with open("data/clusters/100k.json", mode="w") as centroid_file:
    json.dump(obj=kmeans3.cluster_centers_.tolist(), fp=centroid_file, indent=1)

In [None]:
# Persist the 200k-cluster centroids as a JSON list of coordinate lists.
with open("data/clusters/200k.json", mode="w") as centroid_file:
    json.dump(obj=kmeans4.cluster_centers_.tolist(), fp=centroid_file, indent=1)

In [None]:
# Per-cluster member counts for the 50k clustering, written as a JSON list
# whose position is the cluster id.
# Counting only the labels that occur would silently drop clusters that
# received no points, leaving the list shorter than n_clusters and shifting
# every later entry; minlength keeps those clusters as explicit zeros.
labels = np.bincount(kmeans2.labels_, minlength=kmeans2.n_clusters).tolist()
with open("data/clusters/50k_counts.json", 'w') as out:
    json.dump(labels, out, indent=1)

In [None]:
# Per-cluster member counts for the 100k clustering, written as a JSON list
# whose position is the cluster id. minlength keeps clusters that received no
# points as explicit zeros, so the list always has n_clusters entries.
labels = np.bincount(kmeans3.labels_, minlength=kmeans3.n_clusters).tolist()
# Written next to the other count files in data/clusters/ (the previous path
# repeated the "clusters" segment several times).
with open("data/clusters/100k_counts.json", 'w') as out:
    json.dump(labels, out, indent=1)

In [None]:
# Tally how many points landed in each of the 200k clusters. Every cluster id
# is seeded with a zero first so that clusters with no members still appear
# in the output list at their own position.
c = Counter(dict.fromkeys(range(200000), 0))
c.update(kmeans4.labels_)
labels = [c[cluster_id] for cluster_id in sorted(c)]
with open("data/clusters/200k_counts.json", mode='w') as counts_file:
    json.dump(labels, counts_file, indent=1)

In [None]:
# Write each object's three cluster ids (50k / 100k / 200k clusterings) back
# onto its record in the collection, one update call per object.
# all_ids and the labels_ arrays are aligned by position: both follow the
# order in which the vectors were read from the collection.
cluster_assignments = zip(all_ids, kmeans2.labels_, kmeans3.labels_, kmeans4.labels_)
for object_uuid, label_50k, label_100k, label_200k in tqdm(cluster_assignments, total=len(all_ids)):
    triples.data.update(
        uuid=object_uuid,
        properties={
            # Cast NumPy integer scalars to plain Python ints before sending.
            "fiftyk": int(label_50k),
            "hundredk": int(label_100k),
            "twohundredk": int(label_200k),
        },
    )

In [None]:
# Release the connection opened at the top of the notebook.
client.close()