In [1]:
import random

import numpy as np
import pandas as pd
from pycleora import SparseMatrix


In [2]:

# Generate example data: 100 random (customer, product) purchase pairs.
# Seed the RNG first so that "Restart Kernel -> Run All" reproduces the same
# DataFrame (and therefore the same hyperedges and embeddings) on every run.
SEED = 42
random.seed(SEED)

# range(1, 20) yields ids 1..19, i.e. 19 customers and 19 products.
customers = [f"Customer_{i}" for i in range(1, 20)]
products = [f"Product_{j}" for j in range(1, 20)]

data = {
    "customer": random.choices(customers, k=100),
    "product": random.choices(products, k=100),
}

# Create DataFrame with one row per purchase
df = pd.DataFrame(data)

# Create hyperedges: one list of purchased products per customer.
# The result is an object array whose elements are Python lists; duplicates
# are kept when a customer was drawn with the same product more than once.
customer_products = df.groupby("customer")["product"].apply(list).to_numpy()


In [10]:
# Preview the first five generated (customer, product) rows.
df.head(n=5)

Unnamed: 0,customer,product
0,Customer_17,Product_14
1,Customer_15,Product_2
2,Customer_7,Product_14
3,Customer_16,Product_6
4,Customer_8,Product_16


In [3]:
# Show every purchase row recorded for Customer_1.
df.loc[df["customer"] == "Customer_1"]

Unnamed: 0,customer,product
61,Customer_1,Product_9
63,Customer_1,Product_11
93,Customer_1,Product_16
94,Customer_1,Product_10
98,Customer_1,Product_5


In [4]:
# Inspect the first hyperedge: the list of products for the first customer group.
# NOTE(review): groupby presumably orders groups by key, which would make this
# Customer_1's basket whenever Customer_1 appears in the data — confirm.
customer_products[0]

['Product_9', 'Product_11', 'Product_16', 'Product_10', 'Product_5']

In [5]:

# Convert to Cleora input format: each hyperedge becomes one string of
# space-separated product names.
# This is a lazy, one-shot iterator: once it has been consumed it is empty,
# so re-run this cell before building the matrix again.
cleora_input = (" ".join(basket) for basket in customer_products)


In [6]:

# Create Markov transition matrix for the hypergraph.
# NOTE(review): the column spec is assumed to declare a single "product" column
# holding several space-separated entities per row ("complex") that is related
# to itself ("reflexive") — confirm against the pycleora documentation.
# from_iterator consumes cleora_input; re-create it before re-running this cell.
mat = SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product')

# Look at entity ids in the matrix, corresponding to embedding vectors.
# Prints the list of product names found in the input; the exact contents and
# order depend on the generated data.
print(mat.entity_ids)

# Initialize embedding vectors externally, using text, image, random vectors
# embeddings = ...

# Or use built-in random deterministic initialization
EMBEDDING_DIM = 1024  # size passed to the initializer (presumably the length of each embedding vector)
embeddings = mat.initialize_deterministically(EMBEDDING_DIM)



['Product_9', 'Product_11', 'Product_16', 'Product_10', 'Product_5', 'Product_2', 'Product_13', 'Product_17', 'Product_3', 'Product_1', 'Product_19', 'Product_14', 'Product_7', 'Product_18', 'Product_6', 'Product_4', 'Product_15', 'Product_8', 'Product_12']


In [7]:
# Propagate the embeddings through the Markov matrix and re-normalize after
# every step, repeating as many times as we want.
# Caveat: this cell overwrites `embeddings`, so re-running it on its own
# continues from the already-propagated vectors; re-run the initialization
# cell first to start from scratch.

NUM_WALKS = 3   # The optimal number depends on the graph, typically between 3 and 7 yields good results
                # lower values tend to capture co-occurrence, higher iterations capture substitutability in a context

for _step in range(NUM_WALKS):
    # A symmetric matrix can be used for propagation too; left Markov is a good default.
    embeddings = mat.left_markov_propagate(embeddings)
    # Scale every row to unit L2 length so the embeddings lie on a hypersphere.
    # (Standardization is a possible alternative.)
    row_norms = np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)
    embeddings /= row_norms

# Done: print each entity id next to its embedding vector.

for name, vector in zip(mat.entity_ids, embeddings):
    print(name, vector)

# Because the rows are L2 normalized, a plain dot product can be used to compare embeddings.


Product_9 [-0.06257404  0.06375309 -0.00484251 ...  0.09930901 -0.02396393
 -0.01025797]
Product_11 [-0.04680848  0.05156246 -0.00165294 ...  0.09317391 -0.01833735
  0.00064596]
Product_16 [-0.03287907  0.03268932  0.01764641 ...  0.09368931 -0.02287238
  0.00015796]
Product_10 [-0.02613788  0.02963978  0.02689312 ...  0.08956334 -0.02288911
  0.00383349]
Product_5 [-0.04588095  0.0446549  -0.00884243 ...  0.09252467 -0.01239008
 -0.00271966]
Product_2 [-0.03039401  0.03658994  0.00766213 ...  0.0897934  -0.01633884
  0.00652312]
Product_13 [-0.0225957   0.03074767  0.02951849 ...  0.08883275 -0.02475371
  0.00871953]
Product_17 [-0.01534201  0.02004776  0.03644207 ...  0.08440638 -0.02676416
  0.01135574]
Product_3 [-0.01789637  0.02054818  0.01917115 ...  0.08761455 -0.01536821
  0.00852353]
Product_1 [-0.00723398  0.01748507  0.02216135 ...  0.08479065 -0.01095192
  0.00181495]
Product_19 [-0.01972033  0.02514308  0.01831969 ...  0.08596507 -0.01668355
  0.01013268]
Product_14 [-0.

In [8]:

# Dot product of the first entity's embedding with each of the next three.
anchor = embeddings[0]
for other_idx in (1, 2, 3):
    print(np.dot(anchor, embeddings[other_idx]))

0.953177
0.8523312
0.84724545
