In [1]:
import sys
import os

import os
# Working directory the kernel was started in.
cwd = os.getcwd()

# Parent of the current working directory, resolved to an absolute path.
# NOTE(review): presumably the project root that holds the `utils` package — confirm.
root_path = os.path.abspath('..')

# Put root_path first on the import search path. Any copies left by an earlier
# run of this cell are dropped first, so re-running keeps exactly one entry
# and it stays at index 0.
sys.path[:] = [entry for entry in sys.path if entry != root_path]
sys.path.insert(0, root_path)

In [2]:
import numpy as np

def assign_to_buckets(input_lengths, bucket_boundaries):
    """
    Assign each input length to a bucket.

    A length goes to the first bucket whose boundary is greater than or equal
    to it. Lengths that fit no boundary (larger than every boundary, or no
    boundaries given at all) go to an extra overflow bucket whose index is
    ``len(bucket_boundaries)``.

    Args:
        input_lengths: A list or array of input lengths.
        bucket_boundaries: A list of inclusive upper limits for each bucket,
                           expected in ascending order.
                           For example: [10, 20, 30] means buckets for lengths
                           <= 10, 11-20, 21-30, plus an overflow bucket (index 3)
                           for lengths > 30.

    Returns:
        A list of bucket indices, exactly one per input length, in input order.
    """
    overflow_bucket = len(bucket_boundaries)
    bucket_ids = []
    for length in input_lengths:
        for i, boundary in enumerate(bucket_boundaries):
            if length <= boundary:
                bucket_ids.append(i)
                break
        else:
            # No boundary matched: exactly one id is still emitted, so the
            # result always lines up one-to-one with input_lengths.
            bucket_ids.append(overflow_bucket)

    return bucket_ids

# Demo: six sample lengths bucketed against three inclusive upper bounds
input_lengths = [5, 12, 18, 25, 30, 35]
bucket_boundaries = [10, 20, 30]  # upper limit of each bucket
bucket_ids = assign_to_buckets(
    input_lengths=input_lengths,
    bucket_boundaries=bucket_boundaries,
)
print(f"Bucket assignments: {bucket_ids}")

Bucket assignments: [0, 1, 1, 2, 2, 3]


In [3]:
import numpy as np

# Three float32 vectors of 20 values each
arrays = [
    np.array(row, dtype=np.float32)
    for row in (
        (0.03655883, 0.02535131, 0.03378846, 0.00381433, 0.03175445,
         -0.01702683, -0.00473201, 0.02884287, -0.03760819, -0.01968052,
         -0.03755791, -0.00465021, 0.04769059, -0.03659583, -0.01166884,
         -0.00968871, 0.04038718, -0.02965448, 0.00022581, -0.02376867),
        (-0.04801775, 0.02503647, -0.04379793, -0.02195913, -0.0001755,
         -0.00148091, -0.0383062, 0.04807372, 0.02491029, 0.04616572,
         -0.04078959, 0.02247899, -0.02068538, 0.00412268, 0.0424931,
         -0.02231088, 0.0225875, -0.0339348, -0.01774244, 0.04699254),
        (-0.00788826, 0.00160686, -0.02070315, -0.03841344, -0.00754004,
         0.01234897, -0.00444013, 0.02766831, -0.01371489, 0.01130033,
         0.02727897, 0.04172977, -0.0072687, -0.04604071, 0.02185276,
         0.00285892, 0.03720954, -0.00406641, -0.01319207, -0.04376505),
    )
]

# One row per input vector -> shape (3, 20)
stacked_arrays = np.array(arrays)

# Element-wise mean over the three vectors (collapses axis 0)
averaged_values = stacked_arrays.mean(axis=0)

print(averaged_values)

[-0.00644906  0.01733155 -0.01023754 -0.01885275  0.00801297 -0.00205292
 -0.01582611  0.03486164 -0.00880426  0.01259518 -0.01702284  0.01985285
  0.00657884 -0.02617129  0.01755901 -0.00971356  0.03339474 -0.0225519
 -0.01023623 -0.00684706]


In [4]:
# Target dimensions of the demo array
shape = (2, 3, 4)

# Object-dtype container so every cell can hold a Python string
array = np.empty(shape, dtype=object)

# Label every cell with its own (i,j,k) coordinates; ndindex walks the
# index tuples in the same row-major order as three nested range loops
for i, j, k in np.ndindex(*shape):
    array[i, j, k] = f'({i},{j},{k})'

# Show the labelled array
print(array)

[[['(0,0,0)' '(0,0,1)' '(0,0,2)' '(0,0,3)']
  ['(0,1,0)' '(0,1,1)' '(0,1,2)' '(0,1,3)']
  ['(0,2,0)' '(0,2,1)' '(0,2,2)' '(0,2,3)']]

 [['(1,0,0)' '(1,0,1)' '(1,0,2)' '(1,0,3)']
  ['(1,1,0)' '(1,1,1)' '(1,1,2)' '(1,1,3)']
  ['(1,2,0)' '(1,2,1)' '(1,2,2)' '(1,2,3)']]]


In [5]:
# [ 
#     [ 
#         ['(0,0,0)' '(0,0,1)' '(0,0,2)' '(0,0,3)']  # each row is array[i, j, :]
#         ['(0,1,0)' '(0,1,1)' '(0,1,2)' '(0,1,3)']
#         ['(0,2,0)' '(0,2,1)' '(0,2,2)' '(0,2,3)']
#     ]
#     [
#         ['(1,0,0)' '(1,0,1)' '(1,0,2)' '(1,0,3)']
#         ['(1,1,0)' '(1,1,1)' '(1,1,2)' '(1,1,3)']
#         ['(1,2,0)' '(1,2,1)' '(1,2,2)' '(1,2,3)']
#     ]
# ]

# Target layout: shape (dim1, dim0 * dim2),
# so the output should be (3, 2*4) = (3, 8), with row j holding array[0, j, :] followed by array[1, j, :]:
# '(0,0,0)' '(0,0,1)' '(0,0,2)' '(0,0,3)' '(1,0,0)' '(1,0,1)' '(1,0,2)' '(1,0,3)'
# '(0,1,0)' '(0,1,1)' '(0,1,2)' '(0,1,3)' '(1,1,0)' '(1,1,1)' '(1,1,2)' '(1,1,3)'
# '(0,2,0)' '(0,2,1)' '(0,2,2)' '(0,2,3)' '(1,2,0)' '(1,2,1)' '(1,2,2)' '(1,2,3)'

In [6]:
# First attempt: one Fortran-order reshape straight to (dim1, dim0 * dim2).
# Column-major traversal varies the first axis fastest, so the rows come out
# interleaved (see the printed result) rather than grouped per dim1 index.
dim0, dim1, dim2 = array.shape
flat_numeric_features = array.reshape((dim1, dim0 * dim2), order='F')

print(flat_numeric_features)

[['(0,0,0)' '(1,1,0)' '(0,0,1)' '(1,1,1)' '(0,0,2)' '(1,1,2)' '(0,0,3)'
  '(1,1,3)']
 ['(1,0,0)' '(0,2,0)' '(1,0,1)' '(0,2,1)' '(1,0,2)' '(0,2,2)' '(1,0,3)'
  '(0,2,3)']
 ['(0,1,0)' '(1,2,0)' '(0,1,1)' '(1,2,1)' '(0,1,2)' '(1,2,2)' '(0,1,3)'
  '(1,2,3)']]


In [7]:
# Bring dim1 to the front: (dim0, dim1, dim2) -> (dim1, dim0, dim2)
transposed_array = array.transpose(1, 0, 2)

# Collapse the two trailing axes; the default row-major order keeps each
# dim0 slice contiguous within a row
flat_numeric_features = transposed_array.reshape(dim1, dim0 * dim2)

print(flat_numeric_features)

[['(0,0,0)' '(0,0,1)' '(0,0,2)' '(0,0,3)' '(1,0,0)' '(1,0,1)' '(1,0,2)'
  '(1,0,3)']
 ['(0,1,0)' '(0,1,1)' '(0,1,2)' '(0,1,3)' '(1,1,0)' '(1,1,1)' '(1,1,2)'
  '(1,1,3)']
 ['(0,2,0)' '(0,2,1)' '(0,2,2)' '(0,2,3)' '(1,2,0)' '(1,2,1)' '(1,2,2)'
  '(1,2,3)']]


In [8]:
# 24 distinct integers, 0..23 (2 * 3 * 4 = 24)
unique_values = np.arange(2 * 3 * 4)

# View them as a (2, 3, 4) block
array = np.reshape(unique_values, (2, 3, 4))

In [9]:
# Zero-filled float array of shape (5, 3, 4)
array = np.zeros(shape=(5, 3, 4))

# Fill every (3, 4) slice along axis 0 with its own index
for i in range(len(array)):
    array[i] = i

In [10]:
array

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.]],

       [[4., 4., 4., 4.],
        [4., 4., 4., 4.],
        [4., 4., 4., 4.]]])

In [11]:
array.shape

(5, 3, 4)

In [12]:
# Swap the first two axes: (5, 3, 4) -> (3, 5, 4)
transposed_array = array.transpose(1, 0, 2)

In [13]:
transposed_array

array([[[0., 0., 0., 0.],
        [1., 1., 1., 1.],
        [2., 2., 2., 2.],
        [3., 3., 3., 3.],
        [4., 4., 4., 4.]],

       [[0., 0., 0., 0.],
        [1., 1., 1., 1.],
        [2., 2., 2., 2.],
        [3., 3., 3., 3.],
        [4., 4., 4., 4.]],

       [[0., 0., 0., 0.],
        [1., 1., 1., 1.],
        [2., 2., 2., 2.],
        [3., 3., 3., 3.],
        [4., 4., 4., 4.]]])

In [14]:
transposed_array.shape

(3, 5, 4)

In [15]:
# Flatten the two trailing axes of the transposed array into one.
# Fortran order steps through the middle axis fastest, which is why each
# output row cycles 0..4 instead of repeating every value four times.
dim0, dim1, dim2 = transposed_array.shape
flat_transposed_array = transposed_array.reshape((dim0, dim1 * dim2), order='F')

In [16]:
flat_transposed_array

array([[0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0.,
        1., 2., 3., 4.],
       [0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0.,
        1., 2., 3., 4.],
       [0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0.,
        1., 2., 3., 4.]])

In [17]:
# 6 distinct integers, 0..5 (3 * 2 = 6)
unique_values = np.arange(3 * 2)

# Arrange them as 3 rows of 2 columns
array_2 = np.reshape(unique_values, (3, 2))

In [18]:
array_2

array([[0, 1],
       [2, 3],
       [4, 5]])

In [19]:
# Append array_2's columns to the right of the flattened features (row counts must match)
array_concatinate = np.hstack((flat_transposed_array, array_2))

In [20]:
array_concatinate

array([[0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0.,
        1., 2., 3., 4., 0., 1.],
       [0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0.,
        1., 2., 3., 4., 2., 3.],
       [0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0., 1., 2., 3., 4., 0.,
        1., 2., 3., 4., 4., 5.]])

In [21]:
# Zero-filled float array of shape (5, 3, 4)
array = np.zeros(shape=(5, 3, 4))

# Fill every (3, 4) slice along axis 0 with its own index
for i in range(len(array)):
    array[i] = i

print(array)

[[[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[2. 2. 2. 2.]
  [2. 2. 2. 2.]
  [2. 2. 2. 2.]]

 [[3. 3. 3. 3.]
  [3. 3. 3. 3.]
  [3. 3. 3. 3.]]

 [[4. 4. 4. 4.]
  [4. 4. 4. 4.]
  [4. 4. 4. 4.]]]


In [22]:
# Boundaries start at 3 and step by 4 while staying below 10 -> [3, 7]
bucket_boundaries = [*range(3, 10, 4)]

In [23]:
bucket_boundaries

[3, 7]

In [24]:
from utils.enums import EncodingCategorical

# Pick the WORD_2_VEC_ATC attribute of EncodingCategorical.
enc = EncodingCategorical.WORD_2_VEC_ATC
# Index the object returned by items_short() with that value.
# NOTE(review): items_short() lives in utils.enums and is not shown here —
# presumably it maps each encoding to a short display label; confirm.
categorical_encoding = EncodingCategorical.items_short()[enc]

In [25]:
categorical_encoding

'W2V-ATC'