Before moving on to single-cell predictions, let's look at an example that illustrates this situation: we'll generate synthetic autoregressive series of different lengths, train larger networks than we've used so far, and then compare them to the naïve model $\hat{x}(t) = x(t-1)$, which simply predicts that each value equals the previous one.

In [None]:
def make_autoregressive_series(n, phi1=0.75, delta=0., sigma=2.,
                               periods=(100000, 5000, 500), seed=None):
    """Generate a synthetic AR(1) series with additive sinusoidal components.

    The series is built as::

        x[0] = 0
        x[i] = delta + phi1 * x[i-1] + sum_k sin(2*pi*i / periods[k]) + w[i]

    where ``w`` is Gaussian white noise with zero mean and standard
    deviation ``sigma``.

    Parameters
    ----------
    n : int
        Number of samples to generate. ``n = 0`` yields an empty frame.
    phi1 : float, optional
        First-order autoregression coefficient.
    delta : float, optional
        Constant drift added at every step.
    sigma : float, optional
        Standard deviation (not variance) of the noise term.
    periods : sequence of float, optional
        Periods, in samples, of the sinusoidal seasonal components.
    seed : int or None, optional
        If given, the noise is drawn from a private ``RandomState(seed)`` so
        the result is reproducible. If None, the global NumPy random state is
        used, so an earlier ``np.random.seed(...)`` call still applies.

    Returns
    -------
    pandas.DataFrame
        A single column ``'x'`` indexed by the integer time step ``0..n-1``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    w = rng.normal(loc=0, scale=sigma, size=n)  # noise vector
    t = np.arange(n)  # time vector

    # Everything except the autoregressive feedback is known up front, so
    # collect it once instead of re-adding the pieces inside the loop.
    forcing = delta + w
    for period in periods:
        forcing = forcing + np.sin(2. * np.pi / period * t)

    # np.zeros already provides the initial condition x[0] = 0.
    x = np.zeros(n, float)
    for i in range(1, n):
        x[i] = phi1 * x[i - 1] + forcing[i]

    return pd.DataFrame(x, index=t, columns=['x'])

# Generate a long synthetic series (300,000 samples).
x = make_autoregressive_series(n=300000)

# Split and standardize with the helper defined elsewhere in the notebook.
# NOTE(review): f_train=0.6 / f_val=0.2 presumably leaves the remaining 20%
# for testing -- confirm against split_and_standarize_data.
# The returned mapping is read below via the keys 'train_df', 'val_df',
# 'test_df', 'train_mean', 'train_sd', 'test_df_original' and
# 'test_timestamps'.
big_datasets = split_and_standarize_data(x, f_train=0.6, f_val=0.2)

# Window geometry: 50 input steps are used to predict a single label step.
CONV_WIDTH = 50
LABEL_WIDTH = 1
INPUT_WIDTH = LABEL_WIDTH + (CONV_WIDTH - 1)

# Conv1D (30 filters, kernel of CONV_WIDTH // 2 = 25 steps) -> Flatten ->
# Dense(30) -> Dense(1) -> reshape to (1, 1) per sample.
# NOTE(review): no `activation` argument is passed to any layer; with the
# Keras defaults that would make the whole stack linear -- confirm this is
# intended.
conv_bigdata = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=30, kernel_size=(CONV_WIDTH // 2,)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(30),
    tf.keras.layers.Dense(units=1),
    tf.keras.layers.Reshape((1, 1))
])

# Windowed train/val/test datasets over column 'x', with the label placed
# `offset=1` relative to the inputs (exact semantics live in WindowGenerator,
# defined elsewhere).
w_conv_bigdata = WindowGenerator(
    input_width=INPUT_WIDTH, 
    label_width=LABEL_WIDTH, 
    offset=1, 
    train_df=big_datasets['train_df'],
    val_df=big_datasets['val_df'],
    test_df=big_datasets['test_df'],
    batch_size=BATCH_SIZE,
    label_columns=['x'])

# Sanity-check that the model output shape matches the label shape on the
# example batch.
print(f'X shape = {w_conv_bigdata.example[0].shape}')
print(f'y_pred shape = {conv_bigdata(w_conv_bigdata.example[0]).shape}')
print(f'y_obs shape = {w_conv_bigdata.example[1].shape}')

# NOTE(review): the model was already called on this same batch in the
# y_pred print above, so this call looks redundant when both lines run.
conv_bigdata(w_conv_bigdata.example[0]) # build model to get number of parameters

# Compare the model size with the number of training rows.
print(f'Estimating {conv_bigdata.count_params():,} parameters on {w_conv_bigdata.train_df.shape[0]:,} datapoints')

# Train using the notebook's shared helper (defined elsewhere); `patience=20`
# is presumably an early-stopping patience -- confirm in compile_and_fit.
history_conv_bigdata = compile_and_fit(conv_bigdata, w_conv_bigdata, max_epochs=MAX_EPOCHS, patience=20, verbose=1)

# Predictions from the trained model; the training mean and standard
# deviation of 'x' are passed in, presumably so make_ypred can undo the
# standardization -- TODO confirm.
ypred_conv_bigdata = make_ypred(conv_bigdata, w_conv_bigdata, big_datasets['train_mean']['x'], big_datasets['train_sd']['x'])

# Learning curves: per-epoch training and validation loss recorded in the
# history object returned by compile_and_fit.
fig, ax = plt.subplots()
ax.plot(history_conv_bigdata.history['loss'], label='train')
ax.plot(history_conv_bigdata.history['val_loss'], label='val')
ax.legend()
plt.show()


# Store the Conv1D predictions and their signed errors against the
# un-standardized test series: err_rel is in percent of the observed value,
# err_abs is in the units of x.
# NOTE(review): this assumes ypred_conv_bigdata lines up element-for-element
# with big_datasets['test_df_original']['x'] -- confirm against make_ypred.
ypred['conv_bigdata'] = ypred_conv_bigdata
err_rel['conv_bigdata'] = 100. * (ypred['conv_bigdata'] - big_datasets['test_df_original']['x']) / big_datasets['test_df_original']['x']
err_abs['conv_bigdata'] = ypred['conv_bigdata'] - big_datasets['test_df_original']['x']

# add ypred for naive big data baseline model
# The naive prediction for each step is the previous observed value, so the
# predictions are the test series without its last element and the targets
# are the same series without its first element.
ypred_naive_bigdata = big_datasets['test_df_original']['x'][:-1].values
ytest_naive_bigdata = big_datasets['test_df_original']['x'][1:].values
ypred['naive_bigdata'] = ypred_naive_bigdata
err_rel['naive_bigdata'] = 100. * (ypred['naive_bigdata'] - ytest_naive_bigdata)/ytest_naive_bigdata
err_abs['naive_bigdata'] = ypred['naive_bigdata'] - ytest_naive_bigdata

# Histogram bins for the relative error: -300% to +300% in steps of 5.
bins = np.arange(-300, 305, 5)
fig = plt.figure(figsize=(15, 4))
gs = gridspec.GridSpec(2, 4)

# Left panel (both rows, first three columns): observed test series with the
# Conv1D predictions overlaid, zoomed to x-values 280000..280400.
ax1 = fig.add_subplot(gs[:, :3])
ax1.plot(big_datasets['test_timestamps'], big_datasets['test_df_original']['x'], linewidth=1, color='black', label='y_test')
ax1.plot(big_datasets['test_timestamps'], ypred_conv_bigdata, linewidth=1, color='red', alpha=0.5, label=f'conv1d')
ax1.set_xlim(280000, 280400)
ax1.legend(loc=1)
# Top-right panel: density histogram of the naive model's relative errors.
ax2 = fig.add_subplot(gs[0, 3:])
ax2.set_title('Naive big data')
ax2.hist(err_rel['naive_bigdata'], bins=bins, density=True)
# Bottom-right panel: the same histogram for the Conv1D model.
ax3 = fig.add_subplot(gs[1, 3:])
ax3.set_title('Conv1D big data')
ax3.hist(err_rel['conv_bigdata'], bins=bins, density=True)
plt.tight_layout()
plt.show()

# Percentage of predictions whose relative error is within 10% of the
# observed value. The comparison uses the magnitude of the (signed) relative
# error: comparing the signed value directly would count every negative
# error, however large, as "below 10".
perc_rel_errors_below_10['naive_bigdata'] = 100. * np.count_nonzero(np.abs(err_rel['naive_bigdata']) <= 10) / err_rel['naive_bigdata'].size
perc_rel_errors_below_10['conv_bigdata'] = 100. * np.count_nonzero(np.abs(err_rel['conv_bigdata']) <= 10) / err_rel['conv_bigdata'].size

print(perc_rel_errors_below_10['naive_bigdata'])
print(perc_rel_errors_below_10['conv_bigdata'])
