### Artificially doubling training datasets by reflecting / reversing amino acid and structure orders.

In [1]:
import numpy as np
import os

In [2]:
# Path to the dataset file.
# NOTE(review): np.load is handed the path directly, so the file is presumably
# a plain .npy despite the ".gz" suffix -- confirm before relying on it.
FILE = "data/cullpdb+profile_6133_filtered.npy.gz"

# Load, then view the result as (num_proteins, 700 residues, 57 features).
# The leading -1 lets NumPy infer the number of proteins from the total size.
data = np.load(FILE).reshape((-1, 700, 57))

In [3]:
# Build `new_data`: for every protein in `data`, the same residues in reverse
# order, with any 'NoSeq' padding rows kept after the residues.
start = 0
end = len(data)

new_data = np.zeros_like(data)
for i in range(start, end):
    protein = data[i]
    # Feature 21 of a residue row is the 'NoSeq' flag: 0 means a real residue,
    # non-zero means padding.
    if protein[-1][21] == 0:
        # Last row is a real residue, so the protein is treated as unpadded:
        # reverse the whole row order.
        new_data[i] = protein[::-1]
    else:
        # Padded protein: reverse only the real residues and keep the padding
        # rows, in their original order, after them.
        # Boolean masks (rather than index-based comprehensions) leave the
        # outer loop index `i` untouched and still produce correctly shaped
        # (0, n_features) pieces when a protein has no real residues at all.
        is_residue = protein[:, 21] == 0
        new_protein = protein[is_residue][::-1]
        padding = protein[~is_residue]
        new_data[i] = np.concatenate((new_protein, padding))

In [4]:
# Spot-check protein 1 by printing matching rows before and after reversal.
# Row -553 counts from the end of the residue axis (row 147 when there are
# 700 rows per protein).
# NOTE(review): presumably chosen relative to where this protein's padding
# starts -- confirm.
print(data[1, 0])
print(data[1, -553])
print(new_data[1, -553])
print(new_data[1, 0])

[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          1.          0.          0.          0.          0.          0.
  0.          1.          0.          0.          0.          0.          0.
  0.          0.          0.          1.          0.          1.          1.
  0.03455623  0.00364771  0.05215356  0.2404891   0.00343602  0.01551975
  0.08166025  0.00463257  0.99330717  0.00769087  0.03167623  0.67699581
  0.02231344  0.99712843  0.95968956  0.07175755  0.04269665  0.00776757
  0.00630555  0.26894143  0.01567329  0.        ]
[ 0.          0.          0.          0.          1.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          1.          0.          0.          0.          0.          0.
  0.          0.          0.          

In [10]:
# Destination for the doubled dataset.
# NOTE(review): np.save writes uncompressed .npy data, so the ".gz" suffix
# does not mean the output is gzip-compressed.
SAVE = "data/cullpdb+profile_6133_filtered_reversed.npy.gz"

# Reversed proteins first, then the originals, stacked along the protein axis;
# each protein is then flattened to a single row of 700 * 57 values.
save_data = np.concatenate((new_data, data), axis=0).reshape((-1, 700 * 57))

# Write through an already-open binary handle: when given a path string,
# np.save appends ".npy" if it is missing, which would change the filename.
with open(SAVE, "wb") as out_file:
    np.save(out_file, save_data)