# Create an ultralight dataset for classification and regression testing

The regression target is the number of A's in a length-9 token sequence, divided by 9. For classification, the target is 1 if that fraction is greater than 0.5 and 0 otherwise.

In [1]:
from copy import deepcopy as copy
import numpy as np
import os
import pandas as pd
from scipy.signal import fftconvolve
import sys

In [2]:
# Output directories for the classification ('clf') and regression ('rgr')
# datasets. Creating the 'scrambled' leaf also creates its parent directories;
# exist_ok=True makes re-running the cell a no-op.
for out_dir in ('data_s5/ultralight/clf/scrambled',
                'data_s5/ultralight/rgr/scrambled'):
    os.makedirs(out_dir, exist_ok=True)

LOOK_BACK = 9  # number of tokens per sequence

NSEQ = 1000  # number of sequences to generate


def make_background_seq(length):
    """Return a list of `length` tokens alternating 'B', 'C', starting with 'B'.

    Built per index so the result has exactly `length` tokens for both odd
    and even lengths.
    """
    return ['B' if idx % 2 == 0 else 'C' for idx in range(length)]


seqs = []
targs = []

for cseq in range(NSEQ):
    # Number of 'A' tokens in this sequence, uniform over 0..LOOK_BACK inclusive.
    num_a = np.random.randint(0, LOOK_BACK+1)
    seq = make_background_seq(LOOK_BACK)
    # Overwrite num_a distinct, randomly chosen positions with 'A'.
    for idx_a in np.random.permutation(LOOK_BACK)[:num_a]:
        seq[idx_a] = 'A'

    seqs.append(''.join(seq))

    # Regression target: fraction of tokens that are 'A'.
    targs.append(num_a/LOOK_BACK)

# Print a random sample of 10 (sequence, target) pairs as a sanity check.
for idx_print in np.random.permutation(NSEQ)[:10]:
    print(seqs[idx_print], targs[idx_print])

seqs = np.array(seqs)
targs = np.array(targs)

BCBCBCBCB 0.0
AAACBAACB 0.5555555555555556
AAAABABCA 0.6666666666666666
BABCBCBCB 0.1111111111111111
AAAAAAACB 0.7777777777777778
AAAAAAAAA 1.0
BCAAACBAB 0.4444444444444444
AAACAAACB 0.6666666666666666
AAAAAAAAA 1.0
BABAACBCB 0.3333333333333333


In [3]:
# Dataset variants to generate; each prefix is also the name of its output
# subdirectory: 'clf' (classification) and 'rgr' (regression).
PFXS = ['clf', 'rgr']

In [4]:
# Column order of every row written to the TSV files below.
columns = ['target', 'session', 'frame', 'input_seq']

# Paths of every full.tsv written, in the order they were written.
paths_all = []

for pfx in PFXS:
    sys.stdout.write(f'pfx: {pfx}')

    # Work on copies so the generated arrays are left untouched.
    seq_arr = copy(seqs)
    targ_arr = copy(targs)

    # Classification labels: 1 where the regression target exceeds .5, else 0.
    if pfx == 'clf':
        targ_arr = (targ_arr > .5).astype(int)

    rows = [
        {'target': row_targ, 'session': 0, 'frame': 0, 'input_seq': row_seq}
        for row_seq, row_targ in zip(seq_arr, targ_arr)
    ]

    print('')
    full_path = f'data_s5/ultralight/{pfx}/full.tsv'
    full_frame = pd.DataFrame(columns=columns, data=rows)
    full_frame.to_csv(full_path, sep='\t', index=False, header=False)
    paths_all.append(full_path)

    # Scrambled control: same sequences, but targets randomly permuted
    # across rows.
    shuffled_targs = targ_arr[np.random.permutation(len(targ_arr))]
    rows_scrambled = [
        {'target': row_targ, 'session': 0, 'frame': 0, 'input_seq': row_seq}
        for row_seq, row_targ in zip(seq_arr, shuffled_targs)
    ]

    scrambled_path = f'data_s5/ultralight/{pfx}/scrambled/full.tsv'
    scrambled_frame = pd.DataFrame(columns=columns, data=rows_scrambled)
    scrambled_frame.to_csv(scrambled_path, sep='\t', index=False, header=False)
    paths_all.append(scrambled_path)

pfx: clf
pfx: rgr


In [5]:
# Split every full dataset into train/eval/test files written alongside it.
for path in paths_all:
    sys.stdout.write(f'Loading {path}...\n')
    df = pd.read_csv(path, sep='\t', header=None)

    # Write the splits into the same directory as the source file. Deriving
    # the directory from the path (rather than slicing off a fixed number of
    # characters) keeps this correct if the source filename ever changes.
    out_dir = os.path.dirname(path)

    # split into training, val, and test: the first 80% of rows are training;
    # the remaining rows are used for both val and test (they are identical)
    nrow_train = int(len(df)*.8)
    df_train = df.iloc[:nrow_train, :]
    df_val = df.iloc[nrow_train:, :]
    df_test = df.iloc[nrow_train:, :]

    df_train.to_csv(os.path.join(out_dir, 'train.tsv'), sep='\t', header=False, index=False)
    df_val.to_csv(os.path.join(out_dir, 'eval.tsv'), sep='\t', header=False, index=False)
    df_test.to_csv(os.path.join(out_dir, 'test.tsv'), sep='\t', header=False, index=False)

Loading data_s5/ultralight/clf/full.tsv...
Loading data_s5/ultralight/clf/scrambled/full.tsv...
Loading data_s5/ultralight/rgr/full.tsv...
Loading data_s5/ultralight/rgr/scrambled/full.tsv...
