## Task 1: Maximum Path Length of Association (MPLA)

### A. Generate Graph

In [5]:
import networkx as nx
import matplotlib.pyplot as plt
import random

# Settings for the example graph built in this cell: the number of concept
# nodes and the probability passed to generate_graph for adding an edge.
NUM_CONCEPTS = 20
EDGE_PROB = 0.3 

def generate_graph(NUM_CONCEPTS=20, EDGE_PROB=0.3, plot_graph=False):
    """
    Generate a random directed concept graph.

    Nodes are named "C1" ... "C<NUM_CONCEPTS>". For every pair (Ci, Cj) with
    i < j, the edge Ci -> Cj is added independently with probability
    EDGE_PROB. Edges therefore only ever point from a lower-numbered concept
    to a higher-numbered one, so the graph contains no cycles.

    Args:
        NUM_CONCEPTS: Number of concept nodes to create.
        EDGE_PROB: Probability of adding each candidate edge.
        plot_graph: If True, draw the graph with matplotlib and show it.

    Returns:
        A tuple (G, total_edges) with the networkx DiGraph and the number of
        edges that were added.
    """
    # The arguments are used as given; they must not be rebound to fixed
    # values here, otherwise callers cannot control the graph size/density.
    G = nx.DiGraph()

    # Add concepts to graph
    concepts = [f"C{i+1}" for i in range(NUM_CONCEPTS)]
    G.add_nodes_from(concepts)

    # Add edges (relations) to graph, visiting pairs in (i, j) order with
    # i < j so one random draw is consumed per candidate edge.
    total_edges = 0
    for i, tail in enumerate(concepts):
        for head in concepts[i + 1:]:
            if random.random() < EDGE_PROB:
                G.add_edge(tail, head)
                total_edges += 1

    if plot_graph:
        # Plot graph
        pos = nx.spring_layout(G)
        nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray')
        plt.title("MPLA Concept Graph Example")
        plt.show()

    return G, total_edges

# Build the example graph once and keep flat lists of its nodes and edges
G, total_edges = generate_graph(NUM_CONCEPTS, EDGE_PROB, plot_graph=False)
concepts = [*G.nodes]
relations = [*G.edges]

In [33]:
def get_longest_path(G, source, target):
    """
    Find the longest simple (cycle-free) path from source to target.

    Args:
        G: Graph accepted by nx.all_simple_paths.
        source: Starting node.
        target: Ending node.

    Returns:
        A tuple (path, length). path is the list of nodes on the longest
        simple path and length is its number of edges. If no path exists,
        returns (None, 0). When several paths share the maximum length, the
        first one produced by nx.all_simple_paths is returned.
    """
    # Consume the path generator directly instead of building a list of every
    # simple path first; only the current best path is kept in memory.
    longest_path = max(
        nx.all_simple_paths(G, source=source, target=target),
        key=len,
        default=None,
    )

    # No valid path
    if longest_path is None:
        return None, 0

    return longest_path, len(longest_path) - 1  # length in edges


def is_valid_path(G, path):
    """
    Check that every consecutive pair of nodes in path is an edge of G.

    Args:
        G: Object exposing has_edge(u, v).
        path: Sequence of nodes.

    Returns:
        True if each step path[k] -> path[k + 1] is an edge of G (including
        the trivial cases of an empty or single-node path), otherwise False.
    """
    steps = zip(path, path[1:])
    return all(G.has_edge(tail, head) for tail, head in steps)

In [40]:
# Endpoints of the example query
source, target = "C1", "C10"

path, length = get_longest_path(G, source, target)

# Report the result; the validity check re-walks the returned path edge by edge
if not path:
    print(f"No valid path from {source} to {target}")
else:
    print(f"Longest path from {source} to {target}: {path}")
    print(f"Path length: {length}")
    print(f"Is this a valid path? {is_valid_path(G, path)}")

Longest path from C1 to C10: ['C1', 'C2', 'C3', 'C8', 'C10']
Path length: 4
Is this a valid path? True


### B. Generate Data

In [None]:
import string
import tqdm
import numpy as np

# NOTE(review): absolute path on one developer's machine; presumably the
# directory generated data is written to, but nothing visible here uses it.
DATA_ROOT = "/Users/samuelschapiro/Desktop/Spiral Works/theory-for-the-emergence-of-creativity-in-llms/graph-paths-novelty/ntp/creativity_data"

# These rebind NUM_CONCEPTS and EDGE_PROB from the graph-generation cell.
# Presumably intended as the arguments to
# build_dataset(num_concepts, edge_prob, hash_str_len) -- TODO confirm.
NUM_CONCEPTS = 10
HASH_STR_LEN = 10
EDGE_PROB = 0.3

# AI Generated - need to test
def build_dataset(num_concepts, edge_prob, hash_str_len):
    """
    Build train/test sets of random concept-token sequences.

    A graph is generated via generate_graph(num_concepts, edge_prob), but the
    sequences below are sampled independently of it, and hash_str_len is
    currently not used.

    Args:
        num_concepts: Number of concept tokens "<C1>" ... "<Cn>".
        edge_prob: Edge probability forwarded to generate_graph.
        hash_str_len: Unused.

    Returns:
        A tuple (concept_vocab, train_sequences, test_sequences,
        entities_b1_dict, entities_b2_dict): concept_vocab maps each concept
        token to its 0-based index; train_sequences has 1000 and
        test_sequences has 200 numpy arrays of concept tokens; the two
        entity dicts map each "<b1_i>" / "<b2_i>" key to itself.
    """
    # Graph is created but its result is not consumed by the sampling below
    _graph, _edge_count = generate_graph(num_concepts, edge_prob, plot_graph=False)

    # Concept tokens and their integer ids
    concepts = [f"<C{idx+1}>" for idx in range(num_concepts)]
    concept_vocab = dict(zip(concepts, range(num_concepts)))

    def _random_sequence():
        # Length is drawn from [5, 15), then tokens are drawn with replacement
        length = np.random.randint(5, 15)
        return np.random.choice(concepts, size=length, replace=True)

    train_sequences = [_random_sequence() for _ in tqdm.tqdm(range(1000))]
    test_sequences = [_random_sequence() for _ in tqdm.tqdm(range(200))]

    # Identity mappings over the b1 / b2 entity names
    b1_names = [f"<b1_{idx}>" for idx in range(num_concepts)]
    b2_names = [f"<b2_{idx}>" for idx in range(num_concepts)]
    entities_b1_dict = {name: name for name in b1_names}
    entities_b2_dict = {name: name for name in b2_names}

    return concept_vocab, train_sequences, test_sequences, entities_b1_dict, entities_b2_dict

In [None]:
import string
import tqdm
import numpy as np

# NOTE(review): absolute path on one developer's machine; presumably the
# directory generated data is written to, but nothing visible here uses it.
DATA_ROOT = "/Users/samuelschapiro/Desktop/Spiral Works/theory-for-the-emergence-of-creativity-in-llms/graph-paths-novelty/ntp/creativity_data"

# OLD CODE FROM SIBLING DISCOVERY
# Arguments for the build_dataset call at the end of this cell: number of
# "a" entities, number of b1/b2 entities per "a", and hash string length.
NUM_A = 10
NUM_B_PER_A = 1000
HASH_STR_LEN = 10
def build_dataset(num_a, num_b_per_a, hash_str_len):
    """
    Build train/test sequences over (a, b1, b2) entity triples.

    Each of the num_a "a" entities owns num_b_per_a "b1" entities and
    num_b_per_a "b2" entities. For every "a", every (b1, b2) combination of
    its own entities is visited, tagged with a unique random hash string,
    and routed to the training set (when a uniform draw exceeds 0.005) or
    otherwise to the test set.

    Args:
        num_a: Number of "<a_i>" entities.
        num_b_per_a: Number of "<b1_i>" and of "<b2_i>" entities per "a".
        hash_str_len: Length of the random lowercase/digit hash string; 0
            means an empty hash string is used for every triple.

    Returns:
        A tuple (entity_vocab, train_sequences, test_sequences,
        entities_b1_dict, entities_b2_dict). entity_vocab lists all a, b1
        and b2 entity names; the two dicts map each "a" entity to the list
        of its b1 / b2 entities.

    Raises:
        ValueError: If hash_str_len is non-zero but there are fewer distinct
            hash strings of that length than triples to generate.
    """
    entities_a = ["<a_{}>".format(i) for i in range(num_a)]

    num_b_total = num_b_per_a * num_a
    entities_b1 = ["<b1_{}>".format(i) for i in range(num_b_total)]
    entities_b2 = ["<b2_{}>".format(i) for i in range(num_b_total)]

    entity_vocab = entities_a + entities_b1 + entities_b2

    # The i-th "a" entity owns the i-th contiguous chunk of b1 / b2 entities.
    entities_b1_dict = {
        entity_a: entities_b1[i * num_b_per_a:(i + 1) * num_b_per_a]
        for i, entity_a in enumerate(entities_a)
    }
    entities_b2_dict = {
        entity_a: entities_b2[i * num_b_per_a:(i + 1) * num_b_per_a]
        for i, entity_a in enumerate(entities_a)
    }

    # Instead of generating all indices at once, generate hash strings directly
    chars = string.ascii_lowercase + string.digits
    base = len(chars)
    used_hashes = set()  # Keep track of used hash strings

    # The rejection loop below can only finish if enough distinct hashes
    # exist; fail up front instead of spinning forever.
    num_triples = num_a * num_b_per_a * num_b_per_a
    if hash_str_len != 0 and base ** hash_str_len < num_triples:
        raise ValueError(
            f"hash_str_len={hash_str_len} allows fewer unique hash strings "
            f"than the {num_triples} sequences to generate"
        )

    train_sequences, test_sequences = [], []
    # `tqdm` is imported as a module in this file, so the progress-bar class
    # must be referenced as tqdm.tqdm (calling the module raises TypeError).
    for entity_a in tqdm.tqdm(entities_a):
        group_b1 = entities_b1_dict[entity_a]
        group_b2 = entities_b2_dict[entity_a]
        for b1 in tqdm.tqdm(group_b1):
            for b2 in group_b2:
                # Generate a unique hash string
                if hash_str_len == 0:
                    hash_str = ""
                else:
                    while True:
                        # Generate random digits and convert to hash string
                        hash_digits = [random.randint(0, base - 1) for _ in range(hash_str_len)]
                        hash_str = ''.join(chars[d] for d in hash_digits)
                        if hash_str not in used_hashes:
                            used_hashes.add(hash_str)
                            break

                # form_creativity / form_creativity_test are not defined in
                # the visible part of the file; they are expected to be in
                # scope when this runs.
                if np.random.uniform() > 0.005:
                    train_sequences.append(form_creativity(hash_str, entity_a, b1, b2))
                else:
                    test_sequences.append(form_creativity_test(hash_str, entity_a, b1, b2))

    return entity_vocab, train_sequences, test_sequences, entities_b1_dict, entities_b2_dict



# Build the dataset at module level. build_dataset visits
# num_a * num_b_per_a * num_b_per_a triples, i.e. 10 * 1000 * 1000 with the
# constants defined above.
entity_vocab, train_sequences, test_sequences, entities_b1_dict, entities_b2_dict = build_dataset(NUM_A, NUM_B_PER_A, HASH_STR_LEN)