# Create a light dataset for testing classification and regression

Each input sequence consists of 9 floating-point values sampled independently from Gaussian white noise (zero mean, unit variance).

The regression output is the dot product of the input with an exponentially decaying kernel, $h_k = \frac{1}{\tau} e^{-k/\tau}$ for $k = 0, \dots, 8$, with $\tau = 2$.

The classification output is 1 if that dot product is positive and 0 otherwise.

In [1]:
from copy import deepcopy as copy
import numpy as np
import os
import pandas as pd
from scipy.signal import fftconvolve
import sys

In [2]:
# Make sure the output directories exist. exist_ok=True makes this a no-op
# when they are already there; the nested 'scrambled' path also creates the
# parent 'clf' / 'rgr' directories.
os.makedirs('data_s5/real/clf/scrambled', exist_ok=True)
os.makedirs('data_s5/real/rgr/scrambled', exist_ok=True)

# Number of samples in each input sequence.
LOOK_BACK = 9

# Number of sequences to generate and decay constant of the kernel.
NSEQ = 1000
TAU = 2

# Exponentially decaying kernel. Its length is tied to LOOK_BACK so that the
# dot product with a sequence stays well-defined if LOOK_BACK is changed.
h = (1/TAU)*np.exp(-np.arange(LOOK_BACK)/TAU)

# One row per sequence, each entry drawn from a standard normal distribution.
seqs = np.random.randn(NSEQ, LOOK_BACK)

# Regression target of every sequence: its dot product with the kernel.
targs = seqs @ h

# Print ten randomly chosen (sequence, target) pairs as a sanity check.
for idx_print in np.random.permutation(NSEQ)[:10]:
    print(seqs[idx_print], targs[idx_print])

[-0.76882451  0.3411574   0.10562655 -0.31120474  0.06795651 -0.5928674
  0.77345355  0.50286729  0.08192337] -0.2883791423534312
[ 0.68832503  0.88279488 -0.5330049   1.63388654  1.78347963  0.38800617
 -0.32580586  1.11570752 -1.52811886] 0.8274771375704637
[ 1.20193241  0.9958307  -1.13117265  0.64836884  0.11839382 -0.33767145
  0.70197304 -0.14462542  0.83435894] 0.7843192653972602
[-1.14781134  0.39478843 -1.30026685  0.48107412  1.80825945  1.06122844
  1.58110093 -0.11695383 -0.50463102] -0.44079153477614924
[ 1.157079   -1.73068134 -0.07649314 -0.94747926 -0.33225091 -0.35835407
 -0.51568028 -0.32706524 -0.24559764] -0.12330675898242638
[-1.47060612  0.23797227  0.15024652 -1.14751799  0.84681202 -0.92618798
 -0.65769954 -0.48370757 -0.4939125 ] -0.7724312298938907
[-0.7307854  -0.91396043 -0.72938041 -0.53036072 -0.40239852 -1.46899791
  1.35402085  0.53815518 -0.1995767 ] -0.8834135665150096
[-0.994616   -0.04630716 -0.69149583 -0.75417332 -0.75401317 -0.23454565
  1.0760057

In [3]:
# Task prefixes: 'clf' (classification) and 'rgr' (regression).
PFXS = ['clf', 'rgr']

In [4]:
# Column order of every TSV written below: the label first, then the inputs.
columns = ['target', 'input_seq']

# Path of every file written by this cell, in the order it was written.
paths_all = []


def _dump_tsv(features, labels, out_path):
    """Write one headerless TSV row per (label, input sequence) pair.

    Each input sequence is flattened into a single comma-joined string.
    After the file is written its path is recorded in ``paths_all``.
    """
    rows = [
        {'target': label, 'input_seq': ','.join([f'{val}' for val in feature])}
        for feature, label in zip(features, labels)
    ]
    frame = pd.DataFrame(columns=columns, data=rows)
    frame.to_csv(out_path, sep='\t', index=False, header=False)
    paths_all.append(out_path)


for pfx in PFXS:
    sys.stdout.write(f'pfx: {pfx}')

    input_seqs = copy(seqs)
    # Classification labels are 1 where the regression target is positive
    # and 0 otherwise; regression keeps the raw targets.
    targets = (copy(targs) > 0).astype(int) if pfx == 'clf' else copy(targs)

    print('')
    _dump_tsv(input_seqs, targets, f'data_s5/real/{pfx}/full.tsv')

    # scrambled version: the same inputs paired with randomly permuted targets
    shuffled_targets = targets[np.random.permutation(len(targets))]
    _dump_tsv(input_seqs, shuffled_targets, f'data_s5/real/{pfx}/scrambled/full.tsv')

pfx: clf
pfx: rgr


In [5]:
# Split every "full" dataset into train/val/test files written next to it.
for path in paths_all:
    sys.stdout.write(f'Loading {path}...\n')
    # 'round_trip' parsing reads each float exactly as written, so the targets
    # in the split files are not perturbed by the read/write cycle.
    df = pd.read_csv(path, sep='\t', header=None, float_precision='round_trip')

    # split into training, val, and test (here val and test are same)
    nrow_train = int(len(df)*.8)
    df_train = df.iloc[:nrow_train, :]
    df_val = df.iloc[nrow_train:, :]
    df_test = df.iloc[nrow_train:, :]

    # Derive the output directory from the path itself instead of slicing off
    # a fixed number of characters, which only works for the name 'full.tsv'.
    out_dir = os.path.dirname(path)
    for split_name, df_split in (('train', df_train), ('val', df_val), ('test', df_test)):
        df_split.to_csv(os.path.join(out_dir, f'{split_name}.tsv'), sep='\t', header=False, index=False)

Loading data_s5/real/clf/full.tsv...
Loading data_s5/real/clf/scrambled/full.tsv...
Loading data_s5/real/rgr/full.tsv...
Loading data_s5/real/rgr/scrambled/full.tsv...
