# Create an ultralight testing dataset

The regression target is the number of S's in a length-9 token sequence, divided by 9. For classification, the regression target is thresholded at 0.5 (label 1 if it exceeds 0.5, otherwise 0).

In [1]:
from copy import deepcopy as copy
import os
import sys

import numpy as np
import pandas as pd
from scipy.signal import fftconvolve

In [2]:
LOOK_BACK = 9  # number of tokens in every sequence

NSEQ = 1000  # number of sequences to generate


def make_background(look_back):
    """Return a list of `look_back` tokens alternating 'Q', 'P', starting with 'Q'.

    Building the template position by position gives exactly `look_back`
    tokens for odd and even lengths alike, so every index in
    range(look_back) can later be overwritten with 'S'.
    """
    return ['QP'[idx % 2] for idx in range(look_back)]


seqs = []
targs = []

for _ in range(NSEQ):
    # Draw how many positions become 'S' (0..LOOK_BACK inclusive) ...
    num_s = np.random.randint(0, LOOK_BACK+1)
    seq = make_background(LOOK_BACK)
    # ... and place them at distinct, randomly chosen positions.
    for idx_s in np.random.permutation(LOOK_BACK)[:num_s]:
        seq[idx_s] = 'S'

    seqs.append(''.join(seq))

    # Target is the fraction of 'S' tokens in the sequence.
    targs.append(num_s/LOOK_BACK)

# Print a random sample of 10 (sequence, target) pairs as a sanity check.
for idx_print in np.random.permutation(NSEQ)[:10]:
    print(seqs[idx_print], targs[idx_print])

seqs = np.array(seqs)
targs = np.array(targs)

QPQPQPSPQ 0.1111111111111111
SSSSQSSSS 0.8888888888888888
SSSSSSSSS 1.0
SSSSQSSSS 0.8888888888888888
SSSSSSSSQ 0.8888888888888888
QPSPQSSSQ 0.4444444444444444
QSQPSSQPQ 0.3333333333333333
QPQPQPQPQ 0.0
QPQSQPQSS 0.3333333333333333
SSQSSSSSS 0.8888888888888888


In [3]:
# File-name prefixes of the dataset variants to generate: 'clf*' variants get
# thresholded targets, '*scrambled' variants get permuted targets.
PFXS = [
    'clf',
    'clf_scrambled',
    'rgr',
    'rgr_scrambled',
]

In [5]:
# Column order of the written TSV files (the files themselves have no header).
columns = ['fmtn', 'session', 'frame', 'song']

# Create the output directory on demand so the cell also runs when the
# directory does not exist yet; to_csv does not create missing directories.
out_dir = 'data_s5/ultralight'
os.makedirs(out_dir, exist_ok=True)

paths_all = []

for pfx in PFXS:
    sys.stdout.write(f'pfx: {pfx}')

    # No copies needed: the operations below build new arrays rather than
    # modifying seqs/targs in place.
    songs = seqs
    fmtns = targs

    if pfx.startswith('clf'):
        # Classification variant: binarize the target at 0.5.
        fmtns = (fmtns > .5).astype(int)
    if pfx.endswith('scrambled'):
        # Scrambled variant: permute the targets relative to the sequences.
        fmtns = fmtns[np.random.permutation(len(fmtns))]

    print('')
    # 'session' and 'frame' are constant 0 for every row (scalars broadcast).
    df = pd.DataFrame(
        {'fmtn': fmtns, 'session': 0, 'frame': 0, 'song': songs},
        columns=columns,
    )
    path = f'{out_dir}/{pfx}_lookback_{LOOK_BACK}.tsv'
    df.to_csv(path, sep='\t', index=False, header=False)

    paths_all.append(path)

pfx: clf
pfx: clf_scrambled
pfx: rgr
pfx: rgr_scrambled


In [6]:
for path in paths_all:
    sys.stdout.write(f'Loading {path}...\n')
    df = pd.read_csv(path, sep='\t', header=None)

    # The first 80% of the rows form the training split; the remaining rows
    # are used for both the validation and the test split.
    nrow_train = int(len(df)*.8)
    df_train = df.iloc[:nrow_train, :]
    df_val = df.iloc[nrow_train:, :]
    df_test = df.iloc[nrow_train:, :]

    # Strip the trailing '.tsv' and write each split next to the source file.
    stem = path[:-4]
    for suffix, df_split in (
        ('.train.tsv', df_train),
        ('.eval.tsv', df_val),
        ('.test.tsv', df_test),
    ):
        df_split.to_csv(stem + suffix, sep='\t', header=False, index=False)

Loading data_s5/ultralight/clf_lookback_9.tsv...
Loading data_s5/ultralight/clf_scrambled_lookback_9.tsv...
Loading data_s5/ultralight/rgr_lookback_9.tsv...
Loading data_s5/ultralight/rgr_scrambled_lookback_9.tsv...
