In [10]:
import itertools
import math
import pickle

import numpy as np

In [11]:
# From https://stackoverflow.com/questions/64890117/what-is-the-best-way-to-generate-all-binary-strings-of-the-given-length-in-pytho
def generate_binary(bit_count):
    binary_strings = []
    def genbin(n, bs=''):
        if len(bs) == n:
            binary_strings.append(bs)
        else:
            genbin(n, bs + '0')
            genbin(n, bs + '1')


    genbin(bit_count)
    return [bitstr_to_tuple(bs) for bs in binary_strings]

def bitstr_to_tuple(bitstr):
    return tuple(int(elem) for elem in bitstr)

In [16]:
# Number of data points
NUM_DATA = 1000
# Number of samples from the same concept Gaussian ( i.e. 'modalities' )
NUM_SAMPLES = 5 # For now?
# Standard deviation of generated Gaussian distributions
SEP = 0.5 # 0.5,1.0,2.0 with 100 concepts
# Number of concepts
NUM_CONCEPTS = 64 # 2,10,100,1000 with 1.0 std here
# Number of paired data per concept
NUM_PAIRED_DATA = 5

DIMENSION = 32
CONCEPT_DIM = 32

# Generate Concept Means
concept_dim = 7
concept_means = generate_binary(concept_dim)

# Generate means of concepts, so that they're spaced out evenly a la sklearn.datasets.make_classification [
# I. Guyon, 'Design of experiments for the NIPS 2003 variable selection benchmark', 2003.]
TFs = []
for i in range(NUM_SAMPLES):
    TFs.append(np.random.normal(0.0,1.0,(CONCEPT_DIM, DIMENSION)))

# Transform from concepts to concept_means
TFSTAR = np.random.uniform(0.0,1.0,(concept_dim, CONCEPT_DIM))


total_raw_data = []

for modality_idx in range(NUM_SAMPLES):
    temp_list = []
    for concept_idx in range(NUM_CONCEPTS):
        sample_set = np.random.multivariate_normal(concept_means[concept_idx],np.eye(concept_dim)*SEP, (NUM_DATA,))
        temp_list.append(sample_set)
    total_raw_data.append(temp_list)
X_train = np.array(total_raw_data).transpose((2,1,0,3)).reshape((-1,NUM_SAMPLES, concept_dim))
y_train = np.tile([i for i in range(NUM_CONCEPTS)], NUM_DATA)

total_test_data = []

for modality_idx in range(NUM_SAMPLES):
    temp_list = []
    for concept_idx in range(NUM_CONCEPTS):
        sample_set = np.random.multivariate_normal(concept_means[concept_idx],np.eye(concept_dim)*SEP, (NUM_DATA,))
        temp_list.append(sample_set)
    total_test_data.append(temp_list)
X_test = np.array(total_test_data).transpose((2,1,0,3)).reshape((-1,NUM_SAMPLES, concept_dim))
y_test = np.tile([i for i in range(NUM_CONCEPTS)], NUM_DATA)


data = dict()
data['train'] = dict()
data['test'] = dict()
keys = ['a','b','c','d','e']
for i, k in enumerate(keys):
    data['train'][k] = X_train[:,i]@TFSTAR@TFs[i]
    data['test'][k] = X_test[:,i]@TFSTAR@TFs[i]
data['train']['label'] = y_train
data['test']['label'] = y_test


with open(f"REDUNDANT_CONCEPTS_{NUM_CONCEPTS}.pickle", 'wb') as f:
  print(f"REDUNDANT_CONCEPTS_{NUM_CONCEPTS}.pickle")
  pickle.dump(data, f)

REDUNDANT_CONCEPTS_64.pickle
