- simulated data
- transformer + dataset construction
- pretrain

- analyses

In [1]:
from string import ascii_lowercase

from itertools import permutations
import numpy as np
import pandas as pd
from tqdm import tqdm

import transformers
from transformers import (
    # CONFIG_MAPPING,
    # MODEL_FOR_CAUSAL_LM_MAPPING,
    # AutoConfig,
    # AutoModelForCausalLM,
    # AutoTokenizer,
    # HfArgumentParser,
    # Trainer,
    # TrainingArguments,
    # default_data_collator,
    set_seed,
)

In [2]:
# Seed the random number generators once, up front, so the sampling in the cells below is repeatable.
# NOTE(review): transformers.set_seed is documented to seed `random`, NumPy and torch; confirm for the installed version.
set_seed(1234)

## concepts

In [3]:
# Mixture of mutually exclusive and independent latent concepts.
# Each concept is a symmetric 4x4 matrix over the latent states.
# gen_obs reads the diagonal as "is this state present in the concept" and the
# off-diagonal entries as unnormalized transition weights between states.
# The row labels are annotated on concept1 only; presumably the same state order
# applies to concept2 and concept3 (TODO confirm).

concept1 = np.array([   # positivity bias
    [1, 0, 1, 1],  # intention: pos
    [0, 0, 0, 0],  # intention: neg
    [1, 0, 1, 0],  # result: pos
    [1, 0, 0, 1]   # result: neg
])

concept2 = np.array([   # swing
    [1, 1, 1, 0],
    [1, 1, 0, 1],
    [1, 0, 1, 1],
    [0, 1, 1, 1]
])

concept3 = np.array([   # negativity bias
    [0, 0, 0, 0],
    [0, 1, 1, 1],
    [0, 1, 1, 0],
    [0, 1, 0, 1]
])

# Ordered list of concepts; samples refer to a concept by its index in this list.
concepts = [concept1, concept2, concept3]

In [4]:
def normalize(v, ord=1):
    """Return a float copy of `v` scaled so that its `ord`-norm is 1.

    Args:
        v: array-like of numbers; it is copied, never modified in place.
        ord: norm order forwarded to `np.linalg.norm` (1 gives a probability
            vector for non-negative input).

    Returns:
        A new float ndarray. If the norm is 0 the (all-zero) copy is returned
        unchanged instead of dividing by zero.
    """
    scaled = np.array(v, dtype=float)  # float copy so the division below is not truncated
    magnitude = np.linalg.norm(scaled, ord)
    if magnitude != 0:
        # only the non-zero entries are rescaled; zeros stay exactly zero
        nonzero = scaled != 0
        scaled[nonzero] = scaled[nonzero] / magnitude
    return scaled


# Ground-truth prior over the concepts for the training data (uniform here).
# TODO: does the model capture this probability distribution and use it as a prior during inference?
# Open question: should concepts not be mixed when generating training data?
probs_train_gt = normalize([1, 1, 1], ord=1)
print(probs_train_gt)

# Ground-truth prior over the concepts for the testing data (uniform here)
probs_test_gt = normalize([1, 1, 1], ord=1)
print(probs_test_gt)

[0.33333333 0.33333333 0.33333333]
[0.33333333 0.33333333 0.33333333]


In [5]:
# Aggregated ground-truth test concept: the concept matrices summed with weights probs_test_gt
concept_test_gt = sum(prob * concept for prob, concept in zip(probs_test_gt, concepts))
concept_test_gt

array([[0.66666667, 0.33333333, 0.66666667, 0.33333333],
       [0.33333333, 0.66666667, 0.33333333, 0.66666667],
       [0.66666667, 0.33333333, 1.        , 0.33333333],
       [0.33333333, 0.66666667, 0.33333333, 1.        ]])

## event vocab

In [6]:
# Number of latent states: the rows of one concept matrix plus one extra end state (state 0)
n_states = concepts[0].shape[0] + 1
# Number of distinct events, including the end event (event 0)
n_events = 50

In [7]:
def gen_state2event(n_states, n_events):
    """Randomly partition event ids among the latent states.

    Args:
        n_states: number of states, including the end state (always state 0).
        n_events: number of events, including the end event (always event 0,
            rendered as "/").

    Returns:
        dict mapping each state id to the list of event ids that can
        instantiate it. State 0 maps to ``[0]`` only; every other event id is
        assigned to one uniformly drawn non-end state, so a state may end up
        with an empty list.
    """
    # end state first, then one empty bucket per regular state
    mapping = {0: [0], **{state: [] for state in range(1, n_states)}}

    regular_states = range(1, n_states)
    for event in range(1, n_events):
        owner = np.random.choice(regular_states)
        mapping[owner].append(event)

    return mapping

In [8]:
# Instantiate events from latent states: draw the random state -> event-id mapping that gen_obs samples from.
# TODO: use different token sequences to denote different states (tokens may be reused across states, but sequences may not), instead of single tokens

instantiation = gen_state2event(n_states, n_events)
instantiation

# instantiation = {
#     0: ["/"],
#     1: ["a", "b", "c", "d", "e", "f", "ab", "ac", "ad", "ae"],
#     2: ["h", "i", "j", "k", "l", "m", "n", "o", "p", "af", "ag", "ah", "ai", "aj", "ak", "al"],
#     3: ["q", "r", "s", "t", "am", "an", "ao", "ap", "aq"],
#     4: ["u", "v", "w", "x", "y", "z", "ar", "as", "at", "au", "av", "aw", "ax"]
# }

{0: [0],
 1: [5, 6, 7, 17, 18, 22, 23, 24, 26, 33, 35, 43, 46],
 2: [4, 8, 10, 12, 25, 27, 36, 44],
 3: [3, 13, 14, 16, 19, 20, 21, 29, 30, 32, 37, 38, 39, 48],
 4: [1, 2, 9, 11, 15, 28, 31, 34, 40, 41, 42, 45, 47, 49]}

In [9]:
# https://github.com/p-lambda/incontext-learning/blob/84fab2141381001e33b5835e01f4fbf37f34a6a5/generate_data.py#L128
def letter_generator(num):
    """Yield the first `num` lowercase strings in length-then-permutation order.

    Strings are produced by taking permutations of the alphabet without
    repeated letters: 'a', 'b', ..., 'z', 'ab', 'ac', ...

    Args:
        num: how many strings to yield. Values <= 0 yield nothing.

    Yields:
        str: the next string in the enumeration.
    """
    # Guard up front: the counter below is only checked after a yield, so without
    # this a request for zero strings would still produce 'a'.
    if num <= 0:
        return

    counter = 0
    # +1 so that full-length (26-letter) permutations are reachable as well
    for length in range(1, len(ascii_lowercase) + 1):
        for perm in permutations(ascii_lowercase, length):
            yield ''.join(perm)
            counter += 1
            if counter >= num:
                return
            

# generate vocab: one short lowercase string per event id
vocab = list(letter_generator(n_events))
# put the delimiter token '/' at index 0 (the end event) and drop the last generated
# string so the vocabulary still has n_events entries
vocab = ['/'] + vocab[:-1]

In [10]:
# https://github.com/p-lambda/incontext-learning/blob/84fab2141381001e33b5835e01f4fbf37f34a6a5/generate_data.py#L138
def apply_vocab(tokens, vocab):
    """Translate a sequence of token indices into vocabulary entries.

    Args:
        tokens: iterable of indices into `vocab`.
        vocab: indexable collection of token strings.

    Returns:
        list with ``vocab[index]`` for every index in `tokens`, in order.
    """
    decoded = []
    for index in tokens:
        decoded.append(vocab[index])
    return decoded

## generate obs

In [11]:
# generate: concept (transition matrix) >> latent state seq >> instantiate event
def gen_obs(concept, end_prob=0.5, sample_length=None):
    """Sample a latent state sequence and its event instantiation from one concept.

    Pipeline: concept (transition matrix) >> latent state sequence >> events.
    Relies on the module-level `normalize` helper and the module-level
    `instantiation` mapping (state id -> list of event ids).

    Args:
        concept: square 2-D array. The diagonal marks which states are present
            (and weights the choice of the starting state); off-diagonal entries
            are unnormalized transition weights. The array is not modified.
        end_prob: total weight given to the end state; it is split evenly over
            the present states and prepended as an extra column before each row
            is normalized.
        sample_length: if None, return after the first chain reaches the end
            state (state 0). Otherwise keep chaining new sequences, end states
            included, and return exactly `sample_length` states/events.

    Returns:
        (state_seq, event_seq): two equally long lists. States are 1-indexed;
        0 denotes the end state.

    Raises:
        ValueError: if the concept has no present state (all-zero diagonal).
    """
    state_seq = []
    event_seq = []

    init_state_probs = concept.diagonal()
    state_present = init_state_probs > 0
    num_states = sum(state_present)
    if num_states == 0:
        raise ValueError("concept has no present state (diagonal is all zeros)")

    # Prepend the end-state column: each present state gets end_prob / num_states.
    # np.hstack builds a new array, so the caller's matrix is left untouched.
    end_col = (end_prob * state_present / num_states).reshape(-1, 1)  # shape n x 1
    transitions = np.hstack((end_col, concept))

    # The diagonal of the concept denotes presence of a state, not a self-transition
    # probability, so self transitions are removed. Row i belongs to state i + 1,
    # whose own column sits at index i + 1 after the prepend.
    rows = np.arange(transitions.shape[0])
    transitions[rows, rows + 1] = 0

    # Loop-invariant: normalize the start distribution and every row once.
    start_probs = normalize(init_state_probs)
    transition_probs = [normalize(row) for row in transitions]

    def _full():
        # `is not None` (rather than truthiness) so sample_length=0 terminates too
        return sample_length is not None and len(state_seq) >= sample_length

    while True:
        # Check before opening a new chain: otherwise a chain that ends exactly at
        # sample_length would be followed by one extra start state.
        if _full():
            return state_seq, event_seq

        # randomly select a state according to the start probabilities
        state_chosen = np.random.choice(len(start_probs), p=start_probs) + 1  # states are 1-indexed; 0 denotes end state
        state_seq.append(state_chosen)
        # instantiate event from state
        event_seq.append(np.random.choice(instantiation[state_chosen]))

        # generate next states/events until the end state is drawn
        while state_chosen != 0:
            if _full():
                return state_seq, event_seq
            probs = transition_probs[state_chosen - 1]
            state_chosen = np.random.choice(len(probs), p=probs)
            state_seq.append(state_chosen)
            event_seq.append(np.random.choice(instantiation[state_chosen]))

        if sample_length is None:
            return state_seq, event_seq

In [12]:
# Sanity check: zero out the mixture entries below 0.5, then sample a 20-step sequence.
# The displayed result is (latent state sequence, event-id sequence).
concept_test_gt_thres = concept_test_gt.copy()
concept_test_gt_thres[concept_test_gt_thres < 0.5] = 0
gen_obs(concept_test_gt_thres, end_prob=0.3, sample_length=20)

([2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 0, 4, 0, 3, 1, 0, 3, 1, 0, 4],
 [12, 42, 12, 42, 25, 49, 25, 2, 44, 41, 0, 40, 0, 21, 17, 0, 14, 23, 0, 31])

In [13]:
# sample from the list of concepts with probs_train_gt to gen_obs
# https://github.com/p-lambda/incontext-learning/blob/84fab2141381001e33b5835e01f4fbf37f34a6a5/generate_data.py#L319

def gen_samples(num_samples, id_concepts, end_prob, random_data, sample_length=10240):
    """Generate `num_samples` documents, each drawn from one randomly chosen concept.

    Uses the module-level `gen_obs`, `apply_vocab`, `vocab` and `n_states`.

    Args:
        num_samples: number of documents to generate.
        id_concepts: list of concept matrices; one is picked uniformly per document.
        end_prob: end-state weight forwarded to `gen_obs`.
        random_data: if True, ignore the concept and draw hidden states and
            token ids uniformly at random instead of calling `gen_obs`.
        sample_length: length requested for each document.

    Returns:
        list of dicts with keys 'text' (space-joined tokens), 'concept_idx',
        'concept_type' (always 'id') and 'hiddens' (the latent state sequence).
    """
    collected = []

    for _ in tqdm(range(num_samples)):
        # the concept index is drawn (and recorded) even for random data
        concept_idx = np.random.choice(len(id_concepts))

        if random_data:
            hiddens = np.random.randint(low=0, high=n_states, size=sample_length)
            token_ids = np.random.randint(low=0, high=len(vocab), size=sample_length)
        else:
            hiddens, token_ids = gen_obs(
                id_concepts[concept_idx],
                end_prob=end_prob,
                sample_length=sample_length,
            )

        words = apply_vocab(token_ids, vocab)
        record = {
            'text': ' '.join(words),
            'concept_idx': concept_idx,
            'concept_type': 'id',
            'hiddens': hiddens,
        }
        collected.append(record)

    return collected

In [14]:
# Generate the training set: 1000 documents with a requested length of 1024 each
samples = gen_samples(num_samples=1000, id_concepts=concepts, end_prob=0.3, random_data=False, sample_length=1024)

# spot-check the length of the last sample's latent state sequence
len(samples[999]['hiddens'])

100%|██████████| 1000/1000 [01:02<00:00, 16.09it/s]


1024

In [15]:
def save_as_json(samples, save_path):
    """Write `samples` (a list of dict records) to `save_path` as JSON Lines.

    Each record becomes one JSON object on its own line.
    """
    frame = pd.DataFrame(samples)
    frame.to_json(
        save_path,
        orient='records',
        lines=True,
    )

def samples_to_raw(samples, out_path):
    """Write all sample texts to `out_path` as one stream, each followed by ' / '."""
    with open(out_path, 'w') as handle:
        # writelines consumes the generator lazily, one sample at a time
        handle.writelines(sample['text'] + ' / ' for sample in samples)

# Export the training set as JSON Lines (with metadata) and as one raw text stream.
# NOTE(review): these are hard-coded absolute paths on a specific machine, and the target
# directory must already exist. Consider a configurable data directory.
save_as_json(samples, '/net/scratch/shangao/latent-concept/data/train_N1000_L1024_E0.3.json')
samples_to_raw(samples, '/net/scratch/shangao/latent-concept/data/train_N1000_L1024_E0.3.txt')