### General

In [1]:
import os
import sys

from google.colab import drive

# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount("/content/drive")

# Root directory of the project inside Google Drive; adjust to your own layout.
path = "/content/drive/My Drive/Project"

# Make project modules importable and run everything from the project root.
sys.path.append(path)
os.chdir(path)

Mounted at /content/drive


In [2]:
import copy
import json
import math
import random

import h5py
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
tqdm.pandas()

In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.4 MB/s[0m eta [36m0:00:0

In [5]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


### CODER embed entity

In [None]:
from transformers import AutoTokenizer, AutoModel

# `device` was previously never defined in this notebook; pick the GPU when
# one is attached and fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the CODER checkpoint and its matching tokenizer from the Hugging Face hub.
coder_tokenizer = AutoTokenizer.from_pretrained("GanjinZero/coder_eng_pp")
coder_model = AutoModel.from_pretrained("GanjinZero/coder_eng_pp").to(device)
# Only the final-layer outputs are consumed below. The switch lives on the
# model's config; setting it on the model object itself has no effect.
coder_model.config.output_hidden_states = False

In [None]:
batch_size = 160  # number of phrases pushed through the model per forward pass

# Best CODER results are with [CLS] representations and normalization (default)
def get_bert_embed(phrase_list, model, tokenizer, normalize=True, summary_method="CLS"):
  """Embed a list of phrases with a BERT-style model, in batches.

  Args:
    phrase_list: iterable of strings to embed.
    model: torch module called as ``model(input_ids)``; element ``[0]`` of its
      output is used for "MEAN" and element ``[1]`` for "CLS".
    tokenizer: callable returning a mapping with an ``'input_ids'`` entry.
    normalize: when True, every embedding is scaled to unit L2 norm.
    summary_method: "CLS" (use output ``[1]``) or "MEAN" (average output
      ``[0]`` over the sequence axis).

  Returns:
    A 2-D numpy array with one row per phrase. For an empty ``phrase_list``
    an array with zero rows is returned.

  Raises:
    ValueError: if ``summary_method`` is neither "CLS" nor "MEAN".

  Note:
    Only the token ids are passed to the model (no attention mask), and
    "MEAN" averages over all 32 positions, padding included.
  """
  if summary_method not in ("CLS", "MEAN"):
    raise ValueError(
        f"summary_method must be 'CLS' or 'MEAN', got {summary_method!r}")

  # Send the inputs to wherever the model's weights already live instead of
  # depending on a notebook-level `device` variable.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  # TOKENIZATION
  # Each phrase is tokenized with special tokens added, then truncated or
  # padded to exactly `max_length` ids so the batches stack into a tensor.
  input_ids = [
      tokenizer(
          phrase,
          max_length=32, # UMLS terms are short
          add_special_tokens=True,
          truncation=True,
          padding="max_length")['input_ids']  # replaces deprecated pad_to_max_length
      for phrase in phrase_list
  ]

  # INFERENCE MODE ON
  model.eval()

  count = len(input_ids) # number of phrases
  if count == 0:
    # Nothing to embed: return an empty matrix rather than failing on an
    # unbound result variable.
    hidden = getattr(getattr(model, "config", None), "hidden_size", 0)
    return np.empty((0, hidden), dtype=np.float32)

  # COMPUTE EMBEDDINGS, `batch_size` PHRASES AT A TIME
  chunks = []
  with torch.no_grad():
    for start in range(0, count, batch_size):
      batch_input = torch.LongTensor(
          input_ids[start:start + batch_size]).to(run_device)
      if summary_method == "CLS":
        embed = model(batch_input)[1]
      else:
        embed = torch.mean(model(batch_input)[0], dim=1)
      if normalize:
        # Clamp guards against division by zero for all-zero embeddings
        embed_norm = torch.norm(
            embed, p=2, dim=1, keepdim=True).clamp(min=1e-12)
        embed = embed / embed_norm
      # Move the batch to the CPU as a numpy array; joined once at the end
      chunks.append(embed.cpu().numpy())
  return np.concatenate(chunks, axis=0)

#### save result

In [None]:
# `path` has no trailing separator, so plain string concatenation produced
# ".../Projectdata/..."; join the components instead.
entities_path = os.path.join(path, "data/build_tri/build_entities.json")
with open(entities_path, "r", encoding="utf-8") as json_file:
    build_entities = json.load(json_file)

# Keep only the dictionary values; presumably these are the entity names
# (verify against the file that produced build_entities.json).
entities = list(build_entities.values())

In [None]:
# Embed every entity with CODER using the function defaults
# (summary_method="CLS", normalize=True).
entities_feat = get_bert_embed(entities, coder_model, coder_tokenizer)

In [None]:
embed_file = '/data/coder_embed_entity.h5'
# write
with h5py.File(path + embed_file, 'w') as hf:
    # `entities` is a plain list (it has no .to_numpy()), and HDF5 cannot
    # store numpy unicode arrays, so write the names as variable-length
    # UTF-8 strings. They read back as bytes.
    hf.create_dataset('entity',
                      data=np.array(entities, dtype=object),
                      dtype=h5py.string_dtype(encoding='utf-8'))
    # One embedding row per entity, in the same order as 'entity'.
    hf.create_dataset('embedding', data=np.array(entities_feat, dtype=np.float64))

### Cluster

In [7]:
# Reload the entity names and their embeddings written by the previous section.
with h5py.File(path + embed_file, mode='r') as embed_h5:
    strings_data = embed_h5['entity'][:]
    arrays_data = embed_h5['embedding'][:]

In [None]:
import faiss
import matplotlib.pyplot as plt

class FaissKMeans:
    """Thin scikit-learn-style wrapper around ``faiss.Kmeans``.

    Args:
        n_clusters: number of centroids to fit.
        n_iter: number of k-means iterations per run.
        n_redo: number of runs passed to faiss as ``nredo``.
        gpu: forwarded to ``faiss.Kmeans``; defaults to True as before, set
            to False to train on the CPU.

    After ``fit``, ``cluster_centers_`` holds the centroids and ``inertia_``
    the last value of the faiss objective.
    """

    def __init__(self, n_clusters=8, n_iter=20, n_redo=1, gpu=True):
        self.n_clusters = n_clusters
        self.n_iter = n_iter
        self.n_redo = n_redo
        self.gpu = gpu
        self.kmeans = None
        self.cluster_centers_ = None
        self.inertia_ = None

    def fit(self, X):
        """Train k-means on the rows of ``X`` and return ``self``.

        Raises:
            ValueError: if ``X`` is not two-dimensional.
        """
        # faiss expects C-contiguous float32 input.
        X = np.ascontiguousarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, dim), got ndim={X.ndim}")
        self.kmeans = faiss.Kmeans(d=X.shape[1],
                                   k=self.n_clusters,
                                   niter=self.n_iter,
                                   nredo=self.n_redo, gpu=self.gpu)
        self.kmeans.train(X)
        self.cluster_centers_ = self.kmeans.centroids
        self.inertia_ = self.kmeans.obj[-1]
        return self

    def predict(self, X):
        """Return the index of the nearest centroid for each row, shape (n, 1).

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.kmeans is None:
            raise RuntimeError("FaissKMeans.predict called before fit")
        X = np.ascontiguousarray(X, dtype=np.float32)
        return self.kmeans.index.search(X, 1)[1]

def get_k(x, n_points=None):
    """Return how many clusters give roughly ``x`` points per cluster.

    Args:
        x: target number of points per cluster; must be positive.
        n_points: total number of points. Defaults to ``len(arrays_data)``,
            the embeddings loaded earlier in the notebook.

    Returns:
        ``n_points // x``, but never less than 1 so the result is always a
        usable cluster count.

    Raises:
        ValueError: if ``x`` is not positive.
    """
    if x <= 0:
        raise ValueError(f"x (points per cluster) must be positive, got {x}")
    if n_points is None:
        n_points = len(arrays_data)
    return max(1, n_points // x)

In [None]:
# Grid search over the cluster count and the number of k-means iterations.

# Target number of nodes per cluster; each entry is turned into a cluster count.
Q = [20, 15, 10, 5]
k_values = list(map(get_k, Q))
niter_values = [5, 10, 20, 50, 100]

# Lowest-inertia configuration seen so far.
best_k, best_niter, best_model = None, None, None
best_inertia = float("inf")

# Inertia of every run, in (k outer, niter inner) order.
inertia_results = []

param_grid = [(k, niter) for k in k_values for niter in niter_values]
for k, niter in param_grid:
    f_cluster = FaissKMeans(k, niter)
    f_cluster.fit(arrays_data)

    inertia = f_cluster.inertia_
    inertia_results.append(inertia)

    print(f"clusters: {k}")
    print(f"niter: {niter}")
    print(f"inertia: {inertia}")

    # Strictly better runs replace the current best.
    if inertia < best_inertia:
        best_model = f_cluster.kmeans
        best_k = k
        best_niter = niter
        best_inertia = inertia

In [None]:
# Inertia per (k, niter) pair, in the order the grid search ran them.
combo_positions = range(len(inertia_results))
combo_labels = [f'({k},{niter})' for k in k_values for niter in niter_values]

plt.plot(combo_positions, inertia_results, marker='o')
plt.title('Inertia for Different Parameter Combinations')
plt.xlabel('Parameter Combination')
plt.ylabel('Inertia')
plt.xticks(combo_positions, combo_labels, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Report the configuration selected by the grid search: cluster count, then iterations.
for chosen_value in (best_k, best_niter):
    print(chosen_value)

In [None]:
# Improve stability: re-run the best (k, niter) configuration with several
# values of nredo and keep the run with the lowest inertia.

nredo_values = [1, 5, 10, 15]
best_nredo = None
# Tracked separately from `best_inertia` so `best_nredo` is always set:
# comparing only against the grid-search best left it as None whenever no
# run was strictly better than that earlier result.
best_nredo_inertia = float("inf")

for nredo in nredo_values:

    f_cluster = FaissKMeans(best_k, best_niter, nredo)
    f_cluster.fit(arrays_data)

    inertia = f_cluster.inertia_
    # Appended to the shared list because the plotting cell reads its tail.
    inertia_results.append(inertia)

    print(f"nredo: {nredo}")
    print(f"inertia: {inertia}")

    # Best nredo within this sweep.
    if inertia < best_nredo_inertia:
        best_nredo_inertia = inertia
        best_nredo = nredo

    # Overall best model: replaced only by a strictly better run.
    if inertia < best_inertia:
        best_inertia = inertia
        best_model = f_cluster.kmeans

In [None]:
# plot
# The nredo sweep appended one inertia per value to the end of
# `inertia_results`; slice by the sweep length instead of a hard-coded 4.
nredo_inertias = inertia_results[-len(nredo_values):]
# The line needs a label, otherwise plt.legend() has nothing to show.
plt.plot(nredo_values, nredo_inertias, marker='o', label='inertia')

plt.xlabel('nredo')
plt.ylabel('Inertia')
plt.title('Inertia for Different nredo Values')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Put every entity embedding into a flat L2 FAISS index on the GPU.
res = faiss.StandardGpuResources()
dim = arrays_data.shape[1]
index = faiss.GpuIndexFlatL2(res, dim)

# The embeddings were stored as float64; the index is fed float32.
index.add(arrays_data.astype(np.float32))

In [None]:
# For every centroid of the best model, look up its nearest entity embeddings;
# `I` holds the row indices of those neighbours, one row per centroid.
neighbors_per_centroid = 5
_, I = index.search(best_model.centroids, neighbors_per_centroid)

In [None]:
# Translate neighbour indices back into entity strings.
# Each cluster is the list of entities nearest to one centroid; the cluster's
# representative ("centroid") is the first, i.e. closest, of those entities.
clusters = [[strings_data[x] for x in neighbor_ids] for neighbor_ids in I]
centroids = [strings_data[neighbor_ids[0]] for neighbor_ids in I]

In [None]:
# save to h5
cluster_file = '/data/faiss_clusters.h5'

# The file name is held in `cluster_file`; the previously referenced
# `hdf5_file` is not defined anywhere in this notebook.
with h5py.File(path + cluster_file, 'w') as hf:
    # One representative entity per cluster.
    hf.create_dataset('centroid', data=centroids)
    # The member entities of each cluster, one row per cluster.
    hf.create_dataset('cluster', data=clusters)

### Convert clusters to the memory format

Flatten each cluster into a single string.

In [9]:
# Reload the saved clusters: one representative entity and one member list per cluster.
with h5py.File(path + cluster_file, mode='r') as cluster_h5:
    centroids = cluster_h5['centroid'][:]
    clusters = cluster_h5['cluster'][:]

In [13]:
# Collapse every cluster into one line of text: decode each member from bytes
# and join them with commas behind a fixed prefix.
clusters = [
    "related entities: " + ", ".join(member.decode() for member in group)
    for group in clusters
]

In [15]:
# Centroid names come back from HDF5 as bytes; decode them so the CSV holds
# plain text instead of b'...' representations.
centroid_names = [c.decode() if isinstance(c, bytes) else c for c in centroids]

# One row per cluster: its representative entity and its flattened member list.
c_df = pd.DataFrame({'centroid': centroid_names, 'cluster': clusters})

# NOTE(review): index_label=False still writes the row index, only without a
# header field for it. Confirm downstream readers expect that layout.
c_df.to_csv(os.path.join(path,'data/memories/clusters_memory.csv'), index_label=False)