In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Colab Notebooks/TransformerECA

/content/drive/MyDrive/Colab Notebooks/TransformerECA


# CA generation

## Explore CA

In [None]:
! pip install cellpylib

In [None]:
import cellpylib as cpl
import matplotlib.pyplot as plt
import numpy as np

def generate_rule_array(r):
    # Length of the binary rule array
    rule_length = 2 ** (2 * r + 1)
    # Generating a random rule as a binary array
    rule_array = np.random.choice([0, 1], rule_length)
    return rule_array

r = 1
size = 250
T = 200

cellular_automaton = cpl.init_random(size)
rule_number = generate_rule_array(r)
rule_number = [1,1,0,0,1,1,0,0]
#rule_number = [0,1,1,0,1,1,0,0]

# rule_number = np.random.randint(2**2**(2*r+1))
# rule_number = 1349837757 # 1196030057 # 1110475378
# print(bin(rule_number)[2:])

# evolve the CA, setting r to 2, for a neighbourhood size of 5
cellular_automaton = cpl.evolve(cellular_automaton, timesteps=T, memoize=True,
                                apply_rule=lambda n, c, t: cpl.binary_rule(n, rule_number),
                                r = r)

scale = 20
plt.figure(figsize=((int) (size/scale), (int) (T/scale)))
plt.imshow(cellular_automaton, cmap='cividis', aspect='auto')
plt.show()

In [None]:
rule_number

## CA Dataset generation

In [None]:
! pip install datasets
! pip install cellpylib

/bin/bash: line 1: pip: command not found
/bin/bash: line 1: pip: command not found


In [None]:
import cellpylib as cpl
import numpy as np
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

def generate_ca_dataset(r, size, T, num_samples):
    """
    Generate a dataset for 1D cellular automata.

    Parameters:
    r (int): The radius of the CA rule.
    size (int): The number of cells in the CA.
    T (int): The number of steps for the CA to evolve.
    num_samples (int): The number of samples in the dataset.

    Returns:
    DataFrame: A pandas DataFrame containing the dataset.
    """
    dataset = []

    for _ in tqdm(range(num_samples), desc="Generating dataset"):
        # Generate the initial state and the rule number
        initial_state = cpl.init_random(size)
        #rule_number = np.random.randint(2 ** (2 ** (2 * r + 1)))
        rule_number = np.random.choice([0, 1], size=(2 ** (2 * r + 1))).tolist()
        #rule_number = [1,1,0,0,1,1,0,0] # rule 51
        #rule_bin = format(rule_number, f'0{2 ** (2 * r + 1)}b')
        rule_bin = ''.join(str(bit) for bit in rule_number)

        # Evolve the CA
        ca = cpl.evolve(initial_state, timesteps=T, memoize=True,
                        apply_rule=lambda n, c, t: cpl.binary_rule(n, rule_number), r=r)

        # Convert CA states to strings and add to dataset
        states = [''.join(map(str, state)) for state in ca]
        dataset.append([rule_bin, *states])

    # Create DataFrame
    columns = ['rule'] + [f't={t}' for t in range(T)]
    return pd.DataFrame(dataset, columns=columns)

In [None]:
def generate_ca_dataset_by_rules(r, size, T, num_samples, rules):
    dataset = []
    num_rules = len(rules)

    for _ in tqdm(range(num_samples), desc="Generating dataset"):
        initial_state = cpl.init_random(size)
        rule_number = np.random.choice(rules)  # Select from the specified set of rules
        rule_bin = format(rule_number, f'0{2 ** (2 * r + 1)}b')

        # Evolve the CA
        ca = cpl.evolve(initial_state, timesteps=T, memoize=True,
                        apply_rule=lambda n, c, t: cpl.binary_rule(n, rule_number), r=r)

        # Convert CA states to strings and add to dataset
        states = [''.join(map(str, state)) for state in ca]
        dataset.append([rule_bin, *states])

    # Create DataFrame
    columns = ['rule'] + [f't={t}' for t in range(T)]
    return pd.DataFrame(dataset, columns=columns)

In [None]:
def partition_rules(r, train_samples, test_samples):
    """
    Uniformly samples a subset of rules from the entire space of possible rules for a given radius 'r'.
    The number of rules sampled is limited to 'num_samples' and then partitioned for training/validation and testing.

    Parameters:
    - r (int): The radius of the CA rule, determining the rule space size.
    - num_samples (int): The upper limit on the number of unique rules to consider.
    - test_prop (float): Proportion of the rules to be used for testing.

    Returns:
    - tuple: Two numpy arrays containing rule numbers for training/validation and testing.
    """
    # Calculate the total number of possible rules
    total_rules = 2 ** (2 ** (2 * r + 1))

    # Determine the number of unique rules to sample
    num_rules = min((num_samples+test_samples), total_rules)

    # Randomly sample unique rule numbers from the total rule space
    sampled_rules = np.random.choice(total_rules, size=num_rules, replace=False)

    # Calculate the number of rules to be used for testing
    num_test_rules = test_samples #int(num_rules * test_prop)

    # Split the sampled rules into test and training/validation rule sets
    np.random.shuffle(sampled_rules)  # Ensure random distribution for partitioning
    test_rules = sampled_rules[:num_test_rules]
    train_val_rules = sampled_rules[num_test_rules:]

    return train_val_rules, test_rules

In [None]:
# Constants
r = 4
size = 20
T = 20
num_samples = 1_111_111
test_samples = 100_000

# Get the partitions for rules
#train_val_rules, test_rules = partition_rules(r, num_samples, test_samples)

# Generate the dataset
df = generate_ca_dataset(r, size, T, num_samples)
#train_val_df = generate_ca_dataset_by_rules(r, size, T, len(train_val_rules), train_val_rules)
#test_df = generate_ca_dataset_by_rules(r, size, T, len(test_rules), test_rules)

Generating dataset:   0%|          | 0/1111111 [00:00<?, ?it/s]

In [None]:
import os
import json
from datasets import load_dataset, DatasetDict

# Directory for dataset files
dataset_dir = '1dCA_r'+str(r)+'s'+str(size)+'T'+str(T)
os.makedirs(dataset_dir, exist_ok=True)

# Splitting the dataset into train, validation, and test sets
# Mixed Train\Val\Test Adjust the proportions as per your requirements
train_df = df.sample(frac=0.9, random_state=123) # 80% for training
test_df = df.drop(train_df.index)
validation_df = test_df.sample(frac=0.2, random_state=123) # 10% for validation
test_df = test_df.drop(validation_df.index) # 10% for testing

# Mixed Train\Val and Separate Test. Split train and validation
#train_df = train_val_df.sample(frac=0.95, random_state=123)  # Adjusted for 90% of train_val
#validation_df = train_val_df.drop(train_df.index)

# Saving the splits in JSON format
train_df.to_json(os.path.join(dataset_dir, 'train.json'), orient='records', lines=True)
validation_df.to_json(os.path.join(dataset_dir, 'validation.json'), orient='records', lines=True)
test_df.to_json(os.path.join(dataset_dir, 'test.json'), orient='records', lines=True)

# Create a README file with dataset description
readme_text = f"""
# 1D Cellular Automata Dataset

## Structure

- `rule`: The rule number in binary format
- `t=0`, `t=1`, ..., `t=T`: The states of the CA at each timestep

## Splits

- Training: 80%
- Validation: 10%
- Test: 10%

## Parameters:

| Parameter         | Description                                |
|-------------------|--------------------------------------------|
| r (int): `{r}`    | The radius of the CA rule.                 |
| size (int): `{size}` | The number of cells in the CA.             |
| T (int): `{T}`    | The number of steps for the CA to evolve.  |
| num_samples (int): `{num_samples}` | The number of samples in the dataset. |

"""

with open(os.path.join(dataset_dir, 'README.md'), 'w') as file:
    file.write(readme_text)


---

In [4]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [6]:
pwd

'/content/drive/MyDrive/Colab Notebooks/TransformerECA'

In [5]:
import os
import json
from datasets import load_dataset

# Constants
r = 4
size = 20
T = 20
num_samples = 1_111_111
test_samples = 100_000

# Directory for dataset files
dataset_dir = '1dCA_r'+str(r)+'s'+str(size)+'T'+str(T)


In [7]:
# Load the dataset using Hugging Face's load_dataset function
data_files = {
    'train': os.path.join(dataset_dir, 'train.json'),
    'validation': os.path.join(dataset_dir, 'validation.json'),
    'test': os.path.join(dataset_dir, 'test.json')
}

dataset = load_dataset('json', data_files=data_files, split='train+validation+test')
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['rule', 't=0', 't=1', 't=2', 't=3', 't=4', 't=5', 't=6', 't=7', 't=8', 't=9', 't=10', 't=11', 't=12', 't=13', 't=14', 't=15', 't=16', 't=17', 't=18', 't=19'],
    num_rows: 1111111
})


In [9]:
CA_data = load_dataset(dataset_dir)['train']

Repo card metadata block was not found. Setting CardData to empty.


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [10]:
print(CA_data[42])

{'rule': '10001101110110101001011101111100101011111011010010000110111000000101100010000100111001100000100101001011111010001111111001010100011001001000000001001000001001100010111010100010010010001100011100010101010111011000111111001111111101111100101010011100010100110001110000100011000111110011111100101000110010001001010111010100110000010101001001011010011001101110101001100000010011011000111011110011110110011100011110001101010010110000101001000111000000100010110010101001111111001100100010100101111011111000011001001110', 't=0': '01011110000110001100', 't=1': '11000110010000010001', 't=2': '11101001010111100001', 't=3': '00111000000101100110', 't=4': '10111101011100011100', 't=5': '10010100101110011110', 't=6': '10011111001101101011', 't=7': '01110010000000101101', 't=8': '10100110000011110001', 't=9': '01010010010000000001', 't=10': '10001101100000100101', 't=11': '10001111010111010010', 't=12': '10101001001101110011', 't=13': '10100110010101001010', 't=14': '10110000010000100011', 't=15

In [None]:
df

In [None]:
! ls

 1dCA_GPU0_RuleTask.ipynb	   1dCA_r2s84T6
 1dCA_GPU0_Transformer.ipynb	   1dCA_r3s24T20
 1dCA_GPU1_Transformer.ipynb	   babilong
 1dCA_GPU2_Transformer.ipynb	   babilong_evals
 1dCA_GPU3_Transformer.ipynb	   docs_docs.json
 1dCA_GPU4_Transformer.ipynb	   eval_Gemma_babilong.ipynb
 1dCA_GPU6_RuleTask.ipynb	  'eval_GPT_4x_BABILong(NeurIPS24).ipynb'
 1dCA_GPU7_RuleTask.ipynb	   eval_Llama3_babilong.ipynb
 1dCA.ipynb			   eval_Llama-3.ipynb
'1dCA[M]_GPU0_Transformer.ipynb'   eval_llm_babilong.ipynb
'1dCA[M]_GPU1_Transformer.ipynb'   eval_RAG_Llama3_babilong-Copy1.ipynb
'1dCA[M]_GPU5_RuleTask.ipynb'	   eval_RAG_Llama3_babilong-Copy2.ipynb
 1dCA_r2s20T20			   gemma-2B-10M
 1dCA_r2s24T20			   lost+found


In [None]:
! zip -r 1dCA_r4s20T20.zip  1dCA_r4s20T20

/bin/bash: line 1: zip: command not found


In [None]:
!