In [1]:
%load_ext autoreload
%autoreload 2

# Compress cluster and super-cluster files

Read the existing cluster and super-cluster files, reduce the number of decimal places in the centroids, and write them in gzip format in order to keep them below 100 MB.

In [4]:
import gzip
import json

import numpy as np

from nama.data.filesystem import download_file_from_s3, save_file

In [3]:
# config
# TODO run both given and surname
# Name type to process; swap the two assignments below to run the other one.
given_surname = "given"
# given_surname = "surname"

# Number of decimal places kept when the centroids are rounded.
n_decimals = 8

# The remaining settings are only used to build the input file names below.
linkage = "average"
if given_surname == "given":
    similarity_threshold = 0.10
else:
    similarity_threshold = 0.25
cluster_freq_normalizer = "none"

# Input locations; the compressed copies are written next to them with a ".gz" suffix.
augmented_clusters_path = f"s3://fs-nama-data/2024/nama-data/data/processed/clusters_{given_surname}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}-augmented.json"
super_clusters_path = f"s3://fs-nama-data/2024/nama-data/data/processed/super_clusters_{given_surname}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"

## Load data

In [5]:
# Load the augmented clusters. An "s3://" location is passed through
# download_file_from_s3 and the returned path is opened; anything else is
# opened directly as a local path.
path = download_file_from_s3(augmented_clusters_path) if augmented_clusters_path.startswith("s3://") else augmented_clusters_path
# Decode explicitly as UTF-8 (the encoding the gzipped copies are written with)
# instead of relying on the platform's default text encoding.
with open(path, 'r', encoding='utf-8') as f:
    clusters = json.load(f)  # cluster label -> names, centroid

# Load the super-clusters the same way.
path = download_file_from_s3(super_clusters_path) if super_clusters_path.startswith("s3://") else super_clusters_path
with open(path, 'r', encoding='utf-8') as f:
    # NOTE(review): the original comment here repeated "cluster label -> names, centroid";
    # the super-cluster schema is not visible in this notebook -- confirm.
    super_clusters = json.load(f)

## Reduce centroid decimal places

In [6]:
# Round every cluster centroid to n_decimals decimal places and store it back
# as a plain list, so the cluster dicts are updated in place.
# NOTE(review): super_clusters is not touched here -- confirm it holds no centroids.
for cluster in clusters.values():
    centroid = np.array(cluster['centroid'])
    cluster['centroid'] = centroid.round(decimals=n_decimals).tolist()

## Save gzipped

In [7]:
def _write_json_gz(obj, local_out_path):
    """Serialize ``obj`` as JSON into a gzip-compressed UTF-8 text file.

    The file is opened in a ``with`` block so the gzip stream is flushed and
    closed before this function returns, rather than whenever the file object
    happens to be garbage-collected.
    """
    with gzip.open(local_out_path, 'wt', encoding='UTF-8') as f:
        json.dump(obj, f)


# save_file is given the destination and a callback that receives the path to write to.
# NOTE(review): presumably save_file uploads that file when the destination is on S3,
# which is why the file must be fully closed by then -- confirm in nama.data.filesystem.
save_file(f"{augmented_clusters_path}.gz",
          lambda local_out_path: _write_json_gz(clusters, local_out_path))

save_file(f"{super_clusters_path}.gz",
          lambda local_out_path: _write_json_gz(super_clusters, local_out_path))