In [3]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory the generated CSV lives in. Keep the trailing slash: the notebook
# builds file paths by concatenating a file name onto this string.
DATA_DIR = './data/'

In [129]:
def hamming_distance(v_1, v_2):
    """Count the positions at which two equally-shaped vectors differ.

    Parameters
    ----------
    v_1, v_2 : array-like
        Sequences (or arrays) to compare element by element. Both are
        converted with ``np.asarray`` and must have the same shape.

    Returns
    -------
    numpy integer
        Number of positions where ``v_1`` and ``v_2`` are not equal.

    Raises
    ------
    ValueError
        If the two inputs have different shapes. Without this check NumPy
        would silently broadcast (e.g. a length-1 vector against a length-n
        vector) and return a count that is not a Hamming distance.
    """
    a = np.asarray(v_1)
    b = np.asarray(v_2)
    if a.shape != b.shape:
        raise ValueError(
            f'hamming_distance requires equal shapes, got {a.shape} and {b.shape}'
        )
    return np.sum(a != b)

def hamming_matrix(vs):
    """Build the symmetric matrix of pairwise Hamming distances.

    Parameters
    ----------
    vs : indexable collection of vectors
        ``vs[k]`` is the k-th vector; ``len(vs)`` gives the matrix size.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(vs), len(vs))`` whose ``[r, c]`` entry is
        ``hamming_distance(vs[r], vs[c])``.
    """
    size = len(vs)
    distances = np.zeros((size, size))
    for row in range(size):
        # Only the upper triangle (diagonal included) is computed; each value
        # is mirrored into the lower triangle.
        for col in range(row, size):
            distances[row, col] = distances[col, row] = hamming_distance(vs[row], vs[col])
    return distances

In [10]:
def cluster_data(
        num_cats,
        cat_dims,
        num_centroids,
        centroid_multiplicity_min,
        centroid_multiplicity_max,
):
    """Generate random categorical centroids and a cluster of copies of each.

    Parameters
    ----------
    num_cats : int
        Number of categorical features per centroid.
    cat_dims : sequence of int
        Cardinality of each feature; feature ``i`` is drawn from
        ``[0, cat_dims[i])``. Only the first ``num_cats`` entries are read.
    num_centroids : int
        Number of centroids (and therefore clusters) to generate.
    centroid_multiplicity_min, centroid_multiplicity_max : int
        The number of copies placed in each cluster is drawn from
        ``[centroid_multiplicity_min, centroid_multiplicity_max)``; the upper
        bound is exclusive, following ``np.random.randint``.

    Returns
    -------
    centroids : list of list
        One list of ``num_cats`` category values per centroid.
    clusters : list of list of list
        ``clusters[k]`` holds the copies of ``centroids[k]``. Every copy is
        its own list object, so editing one row never changes the centroid
        or its siblings.

    Notes
    -----
    Values are drawn from NumPy's global random state (``np.random``), so
    seed it beforehand for reproducible output. ``np.random.randint`` raises
    ``ValueError`` when a lower bound is not strictly below its upper bound.
    """
    centroids = []
    clusters = []
    for _ in range(num_centroids):
        centroid = [
            np.random.randint(low=0, high=cat_dims[cat_idx])
            for cat_idx in range(num_cats)
        ]
        centroids.append(centroid)
        centroid_multiplicity = np.random.randint(centroid_multiplicity_min, centroid_multiplicity_max)
        # Copy the centroid for every member instead of appending the same
        # list object repeatedly, which would alias all rows to one another.
        clusters.append([list(centroid) for _ in range(centroid_multiplicity)])
    return centroids, clusters

def fanout_cluster_data(
        centroids,
        clusters,
        num_cats,
        cats_dims,
        num_fanouts_max,
        radius
):
    """Extend each cluster with randomly perturbed variants of its centroid.

    Parameters
    ----------
    centroids : list of list
        One centroid per cluster; each must support ``.copy()``.
    clusters : list of list
        Existing members of each cluster, aligned with ``centroids``.
        This argument is not modified.
    num_cats : int
        Number of categorical features per row.
    cats_dims : sequence of int
        Cardinality of each feature; a re-drawn feature ``i`` takes a value
        in ``[0, cats_dims[i])``.
    num_fanouts_max : int
        The number of fanouts added to each cluster is drawn from
        ``[1, num_fanouts_max)`` (exclusive upper bound).
    radius : int
        For each fanout, the number of features to re-draw is taken from
        ``[1, radius)`` (exclusive upper bound).

    Returns
    -------
    list of list
        New cluster lists: the original members followed by the fanouts.

    Notes
    -----
    A re-drawn feature may receive its previous value, so the realised
    Hamming distance between a fanout and its centroid can be smaller than
    the number of features re-drawn (possibly zero). Values are drawn from
    NumPy's global random state. ``np.random.randint`` raises ``ValueError``
    when ``num_fanouts_max <= 1`` or ``radius <= 1``.
    """
    # Copy each inner list: a shallow ``clusters.copy()`` would share them,
    # and appending fanouts would then grow the caller's clusters in place.
    fanout_clusters = [list(cluster) for cluster in clusters]
    for centroid, fanout_cluster in zip(centroids, fanout_clusters):
        num_fanouts = np.random.randint(low=1, high=num_fanouts_max)
        for _ in range(num_fanouts):
            fanout = centroid.copy()
            # Named num_changes so the sibling hamming_distance() function is
            # not shadowed inside this scope.
            num_changes = np.random.randint(low=1, high=radius)
            change_indices = np.random.choice(num_cats, num_changes, replace=False)
            for cat_idx in change_indices:
                fanout[cat_idx] = np.random.randint(low=0, high=cats_dims[cat_idx])
            fanout_cluster.append(fanout)
    return fanout_clusters
           


In [110]:
# Seed NumPy's global random state so the synthetic data set is reproducible
# under Restart & Run All; the generator functions draw from np.random.
SEED = 42
np.random.seed(SEED)

num_cats = 10  # categorical features per row
cat_dims = np.random.randint(10, 1000, num_cats)  # per-feature cardinality, each in [10, 1000)
num_centroids = 100
# Copies per centroid are drawn from [min, max): the upper bound is exclusive.
centroid_multiplicity_min = 5
centroid_multiplicity_max = 15

# Fanouts per cluster are drawn from [1, num_fanouts_max).
num_fanouts_max = 500
# Each fanout re-draws between 1 and radius - 1 features of its centroid.
radius = 3

In [111]:
# Draw the random centroids and, for each one, a cluster of identical copies.
centroids, clusters = cluster_data(
    num_cats=num_cats,
    cat_dims=cat_dims,
    num_centroids=num_centroids,
    centroid_multiplicity_min=centroid_multiplicity_min,
    centroid_multiplicity_max=centroid_multiplicity_max,
)

In [112]:
# Add randomly perturbed variants ("fanouts") of each centroid to its cluster.
fanout_clusters = fanout_cluster_data(
    centroids=centroids,
    clusters=clusters,
    num_cats=num_cats,
    cats_dims=cat_dims,
    num_fanouts_max=num_fanouts_max,
    radius=radius,
)

In [113]:
# Flatten the clusters into rows, appending each row's cluster index as its label.
labelled_data = [
    member + [label]
    for label, members in enumerate(fanout_clusters)
    for member in members
]

In [114]:
# One column per categorical feature plus the ground-truth cluster label.
columns = [f'cat_{i}' for i in range(num_cats)] + ['cluster_idx']
df = pd.DataFrame(labelled_data, columns=columns)
# Shuffle all rows so members of a cluster are not contiguous in the file.
df = df.sample(frac=1)
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(DATA_DIR, exist_ok=True)
df.to_csv(os.path.join(DATA_DIR, 'fanout_data.csv'), index=False)

In [115]:
# Preview the shuffled frame; the index still shows each row's pre-shuffle position.
df.head()

Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cluster_idx
14945,155,482,541,24,462,46,239,143,250,312,55
26381,328,469,80,189,171,34,283,533,81,377,98
7942,75,753,272,7,396,24,258,393,137,47,28
15428,287,330,306,239,324,67,148,582,253,151,57
26460,328,469,80,189,394,34,283,328,81,5,98


In [116]:
# Reload the CSV written earlier and display its (rows, columns) shape as a
# quick check that the file round-trips.
df = pd.read_csv(DATA_DIR + 'fanout_data.csv')
df.shape

(27143, 11)