In [1]:
import os
import sys

import h5py
import numpy as np
from scipy.io import loadmat

# Inputs are the file paths to train.mat, valid.mat and test.mat; they are passed to load_files() explicitly (sys.argv is not read here).

def load_files(train_file, valid_file, test_file):
    """Load the train/validation/test splits and index-encode the sequences.

    The training file is opened with h5py and read in slices along its last
    axis; the validation and test files are read with ``scipy.io.loadmat``.
    In every split the one-hot axis (axis 1) is collapsed with ``argmax`` so
    each position becomes a single integer-valued base index.

    Parameters
    ----------
    train_file : str
        Path to an HDF5-readable file holding the datasets ``'trainxdata'``
        (indexed here as ``[position, channel, sample]``) and ``'traindata'``
        (2-D, transposed on load).
    valid_file : str
        Path to a ``.mat`` file with ``'validxdata'`` and ``'validdata'``.
    test_file : str
        Path to a ``.mat`` file with ``'testxdata'`` and ``'testdata'``.

    Returns
    -------
    tuple
        ``(train_inputs, train_labels, valid_inputs, valid_labels,
        test_inputs, test_labels)``. ``train_inputs`` is a float32 array of
        shape ``(num_samples, sequence_length)``; the other inputs are the
        integer arrays returned by ``np.argmax``.
    """
    chunk_size = 1000  # samples read from disk per slice

    with h5py.File(train_file, 'r') as train_data:
        train_labels = np.transpose(train_data['traindata'], (1, 0))

        traindata = train_data['trainxdata']
        # Take the sizes from the dataset itself instead of hard-coding
        # 4400000 x 1000, so other dataset sizes load correctly too.
        seq_len, _, num_samples = traindata.shape

        # float32 is kept for backward compatibility with existing callers.
        # NOTE(review): the values are small integer indices, so an integer
        # dtype would use far less memory -- confirm nothing depends on float.
        train_inputs = np.empty((num_samples, seq_len), dtype=np.float32)

        # Read slice by slice so the full one-hot tensor is never in memory;
        # the last slice may be shorter than chunk_size.
        for start in range(0, num_samples, chunk_size):
            stop = min(start + chunk_size, num_samples)
            block = traindata[:, :, start:stop]
            # argmax over the channel axis gives (position, sample);
            # transpose to (sample, position).
            train_inputs[start:stop] = np.transpose(np.argmax(block, axis=1))

    valid_data = loadmat(valid_file)
    valid_labels = valid_data['validdata']
    valid_inputs = np.argmax(valid_data['validxdata'], axis=1)

    test_data = loadmat(test_file)
    test_labels = test_data['testdata']
    test_inputs = np.argmax(test_data['testxdata'], axis=1)

    return train_inputs, train_labels, valid_inputs, valid_labels, test_inputs, test_labels

def get_kmers(inputs, labels, dna_dict, len_kmer):
    """Turn index-encoded sequences into a table of space-separated k-mers.

    Parameters
    ----------
    inputs : array-like, shape (num_sequences, sequence_length)
        Each entry is a key of ``dna_dict`` (integer-valued floats work too,
        since they hash and compare equal to the matching int key).
    labels : array-like, one row per sequence
        Each row is cast to ``int`` and written as space-separated values.
    dna_dict : dict
        Maps a base index to its one-letter string.
    len_kmer : int
        Length of each k-mer; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(num_sequences + 1, 3)``. Row 0 is the header
        ``sequence / fake_label / real_label``. Row ``i + 1`` holds the
        overlapping k-mers of sequence ``i`` joined by single spaces, the
        constant placeholder label ``1``, and the labels of sequence ``i``.

    Raises
    ------
    ValueError
        If ``len_kmer`` is smaller than 1.

    Notes
    -----
    Prints the number of sequences, the output shape, and the fraction
    completed every 10000 sequences.
    """
    if len_kmer < 1:
        raise ValueError("len_kmer must be >= 1, got {}".format(len_kmer))

    inputs = np.asarray(inputs)
    num_input = len(inputs)
    print(num_input)
    kmers = np.empty((num_input + 1, 3), dtype=object)
    print(kmers.shape)
    kmers[0, 0] = "sequence"
    kmers[0, 1] = "fake_label"
    kmers[0, 2] = "real_label"

    for i in range(num_input):
        # Progress indicator for large inputs.
        if i % 10000 == 0:
            print(i / num_input)

        # Decode row i itself (not i - 1) so the sequence lines up with
        # labels[i]. tolist() yields plain Python numbers for the lookup.
        seq = ''.join(dna_dict[base] for base in inputs[i].tolist())

        # A sequence of length L has L - len_kmer + 1 overlapping k-mers;
        # join() also avoids a trailing separator.
        kmers[i + 1, 0] = ' '.join(
            seq[j:j + len_kmer] for j in range(len(seq) - len_kmer + 1)
        )

        kmers[i + 1, 1] = 1  # fake label
        row_labels = np.asarray(labels[i]).astype(int).tolist()
        kmers[i + 1, 2] = ' '.join(str(label) for label in row_labels)  # real label

    return kmers

In [2]:
# Maps a base index (the argmax over the one-hot axis) to its nucleotide letter.
# NOTE(review): this assumes the one-hot channels are ordered A, C, T, G.
# DeepSEA's published data is commonly described as A, G, C, T -- TODO confirm
# against the .mat files before trusting the decoded sequences.
dna_dict = dict(enumerate("ACTG"))

In [3]:
# Candidate k-mer lengths: 3, 4, 5 and 6.
ks = list(range(3, 7))

In [4]:
# Locations of the three .mat splits, relative to the notebook's working directory.
train_file, valid_file, test_file = (
    "./deepsea_train/train.mat",
    "./deepsea_train/valid.mat",
    "./deepsea_train/test.mat",
)

# Load all three splits in one call; inputs come back index-encoded.
(
    train_inputs, train_labels,
    valid_inputs, valid_labels,
    test_inputs, test_labels,
) = load_files(train_file, valid_file, test_file)

In [5]:
# Sanity check: number of rows (sequences) in the training inputs.
len(train_inputs)

4400000

In [6]:
# Sanity check: number of rows (sequences) in the test inputs.
len(test_inputs)

455024

In [5]:
# Export all three splits as k-mer tables for this k.
# Messages and output paths are derived from `k`, so the cell cannot write
# k-mers of one size into another size's directory.
k = 3
out_dir = './DNABert/examples/DeepSea_data/{}'.format(k)
# Create the target directory up front so a long run is not lost at save time.
os.makedirs(out_dir, exist_ok=True)

# np.savetxt gzip-compresses the output because the filename ends in .gz.
print("starting validation {}-mers".format(k))
valid_kmers = get_kmers(valid_inputs, valid_labels, dna_dict, k)
np.savetxt(out_dir + '/valid.gz', valid_kmers, fmt='%s', delimiter='\t')

# The test split is written under the name dev.gz.
print("starting testing {}-mers".format(k))
test_kmers = get_kmers(test_inputs, test_labels, dna_dict, k)
np.savetxt(out_dir + '/dev.gz', test_kmers, fmt='%s', delimiter='\t')

print("starting training {}-mers".format(k))
train_kmers = get_kmers(train_inputs, train_labels, dna_dict, k)
np.savetxt(out_dir + '/train.gz', train_kmers, fmt='%s', delimiter='\t')


starting validation 3-mers
8000
(8001, 3)
0.0
starting testing 3-mers
455024
(455025, 3)
0.0
0.021976862758887442
0.043953725517774885
0.06593058827666233
0.08790745103554977
0.10988431379443722
0.13186117655332466
0.1538380393122121
0.17581490207109954
0.19779176482998698
0.21976862758887444
0.24174549034776188
0.2637223531066493
0.28569921586553676
0.3076760786244242
0.32965294138331164
0.3516298041421991
0.3736066669010865
0.39558352965997395
0.41756039241886145
0.4395372551777489
0.4615141179366363
0.48349098069552376
0.5054678434544112
0.5274447062132986
0.5494215689721861
0.5713984317310735
0.593375294489961
0.6153521572488484
0.6373290200077358
0.6593058827666233
0.6812827455255107
0.7032596082843982
0.7252364710432856
0.747213333802173
0.7691901965610605
0.7911670593199479
0.8131439220788355
0.8351207848377229
0.8570976475966103
0.8790745103554978
0.9010513731143852
0.9230282358732727
0.9450050986321601
0.9669819613910475
0.988958824149935
starting training 3-mers
4400000
(4400

0.9045454545454545
0.9068181818181819
0.9090909090909091
0.9113636363636364
0.9136363636363637
0.9159090909090909
0.9181818181818182
0.9204545454545454
0.9227272727272727
0.925
0.9272727272727272
0.9295454545454546
0.9318181818181818
0.9340909090909091
0.9363636363636364
0.9386363636363636
0.9409090909090909
0.9431818181818182
0.9454545454545454
0.9477272727272728
0.95
0.9522727272727273
0.9545454545454546
0.9568181818181818
0.9590909090909091
0.9613636363636363
0.9636363636363636
0.9659090909090909
0.9681818181818181
0.9704545454545455
0.9727272727272728
0.975
0.9772727272727273
0.9795454545454545
0.9818181818181818
0.9840909090909091
0.9863636363636363
0.9886363636363636
0.990909090909091
0.9931818181818182
0.9954545454545455
0.9977272727272727


In [6]:
# Export all three splits as k-mer tables for this k.
# Messages and output paths are derived from `k`, so the cell cannot write
# k-mers of one size into another size's directory.
k = 4
out_dir = './DNABert/examples/DeepSea_data/{}'.format(k)
# Create the target directory up front so a long run is not lost at save time.
os.makedirs(out_dir, exist_ok=True)

# np.savetxt gzip-compresses the output because the filename ends in .gz.
print("starting validation {}-mers".format(k))
valid_kmers = get_kmers(valid_inputs, valid_labels, dna_dict, k)
np.savetxt(out_dir + '/valid.gz', valid_kmers, fmt='%s', delimiter='\t')

# The test split is written under the name dev.gz.
print("starting testing {}-mers".format(k))
test_kmers = get_kmers(test_inputs, test_labels, dna_dict, k)
np.savetxt(out_dir + '/dev.gz', test_kmers, fmt='%s', delimiter='\t')

print("starting training {}-mers".format(k))
train_kmers = get_kmers(train_inputs, train_labels, dna_dict, k)
np.savetxt(out_dir + '/train.gz', train_kmers, fmt='%s', delimiter='\t')

starting validation 4-mers
8000
(8001, 3)
0.0
starting testing 4-mers
455024
(455025, 3)
0.0
0.021976862758887442
0.043953725517774885
0.06593058827666233
0.08790745103554977
0.10988431379443722
0.13186117655332466
0.1538380393122121
0.17581490207109954
0.19779176482998698
0.21976862758887444
0.24174549034776188
0.2637223531066493
0.28569921586553676
0.3076760786244242
0.32965294138331164
0.3516298041421991
0.3736066669010865
0.39558352965997395
0.41756039241886145
0.4395372551777489
0.4615141179366363
0.48349098069552376
0.5054678434544112
0.5274447062132986
0.5494215689721861
0.5713984317310735
0.593375294489961
0.6153521572488484
0.6373290200077358
0.6593058827666233
0.6812827455255107
0.7032596082843982
0.7252364710432856
0.747213333802173
0.7691901965610605
0.7911670593199479
0.8131439220788355
0.8351207848377229
0.8570976475966103
0.8790745103554978
0.9010513731143852
0.9230282358732727
0.9450050986321601
0.9669819613910475
0.988958824149935
starting training 4-mers
4400000
(4400

0.9045454545454545
0.9068181818181819
0.9090909090909091
0.9113636363636364
0.9136363636363637
0.9159090909090909
0.9181818181818182
0.9204545454545454
0.9227272727272727
0.925
0.9272727272727272
0.9295454545454546
0.9318181818181818
0.9340909090909091
0.9363636363636364
0.9386363636363636
0.9409090909090909
0.9431818181818182
0.9454545454545454
0.9477272727272728
0.95
0.9522727272727273
0.9545454545454546
0.9568181818181818
0.9590909090909091
0.9613636363636363
0.9636363636363636
0.9659090909090909
0.9681818181818181
0.9704545454545455
0.9727272727272728
0.975
0.9772727272727273
0.9795454545454545
0.9818181818181818
0.9840909090909091
0.9863636363636363
0.9886363636363636
0.990909090909091
0.9931818181818182
0.9954545454545455
0.9977272727272727


In [12]:
# Export the training split as a k-mer table for this k.
# Messages and output paths are derived from `k`, so the cell cannot write
# k-mers of one size into another size's directory.
k = 5
out_dir = './DNABert/examples/DeepSea_data/{}'.format(k)
# Create the target directory up front so a long run is not lost at save time.
os.makedirs(out_dir, exist_ok=True)

# The validation and test exports are disabled in this cell.
# NOTE(review): presumably they were produced by an earlier run -- confirm the
# files exist, or re-enable the lines below so a fresh run regenerates them.
# print("starting validation {}-mers".format(k))
# valid_kmers = get_kmers(valid_inputs, valid_labels, dna_dict, k)
# np.savetxt(out_dir + '/valid.gz', valid_kmers, fmt='%s', delimiter='\t')

# print("starting testing {}-mers".format(k))
# test_kmers = get_kmers(test_inputs, test_labels, dna_dict, k)
# np.savetxt(out_dir + '/dev.gz', test_kmers, fmt='%s', delimiter='\t')

# np.savetxt gzip-compresses the output because the filename ends in .gz.
print("starting training {}-mers".format(k))
train_kmers = get_kmers(train_inputs, train_labels, dna_dict, k)
np.savetxt(out_dir + '/train.gz', train_kmers, fmt='%s', delimiter='\t')

starting training 5-mers
4400000
(4400001, 3)
0.0
0.0022727272727272726
0.004545454545454545
0.006818181818181818
0.00909090909090909
0.011363636363636364
0.013636363636363636
0.015909090909090907
0.01818181818181818
0.020454545454545454
0.022727272727272728
0.025
0.02727272727272727
0.029545454545454545
0.031818181818181815
0.03409090909090909
0.03636363636363636
0.038636363636363635
0.04090909090909091
0.04318181818181818
0.045454545454545456
0.04772727272727273
0.05
0.05227272727272727
0.05454545454545454
0.056818181818181816
0.05909090909090909
0.06136363636363636
0.06363636363636363
0.0659090909090909
0.06818181818181818
0.07045454545454545
0.07272727272727272
0.075
0.07727272727272727
0.07954545454545454
0.08181818181818182
0.08409090909090909
0.08636363636363636
0.08863636363636364
0.09090909090909091
0.09318181818181819
0.09545454545454546
0.09772727272727273
0.1
0.10227272727272728
0.10454545454545454
0.10681818181818181
0.10909090909090909
0.11136363636363636
0.11363636363636

In [None]:
# Export the test and training splits as k-mer tables for this k.
# Messages and output paths are derived from `k`, so the cell cannot write
# k-mers of one size into another size's directory.
k = 6
out_dir = './DNABert/examples/DeepSea_data/{}'.format(k)
# Create the target directory up front so a long run is not lost at save time.
os.makedirs(out_dir, exist_ok=True)

# The validation export is disabled in this cell.
# NOTE(review): presumably it was produced by an earlier run -- confirm the
# file exists, or re-enable the lines below so a fresh run regenerates it.
# print("starting validation {}-mers".format(k))
# valid_kmers = get_kmers(valid_inputs, valid_labels, dna_dict, k)
# np.savetxt(out_dir + '/valid.gz', valid_kmers, fmt='%s', delimiter='\t')

# np.savetxt gzip-compresses the output because the filename ends in .gz.
# The test split is written under the name dev.gz.
print("starting testing {}-mers".format(k))
test_kmers = get_kmers(test_inputs, test_labels, dna_dict, k)
np.savetxt(out_dir + '/dev.gz', test_kmers, fmt='%s', delimiter='\t')

print("starting training {}-mers".format(k))
train_kmers = get_kmers(train_inputs, train_labels, dna_dict, k)
np.savetxt(out_dir + '/train.gz', train_kmers, fmt='%s', delimiter='\t')

starting testing 6-mers
455024
(455025, 3)
0.0
0.021976862758887442
0.043953725517774885
0.06593058827666233
0.08790745103554977
0.10988431379443722
0.13186117655332466
0.1538380393122121
0.17581490207109954
0.19779176482998698
0.21976862758887444
0.24174549034776188
0.2637223531066493
0.28569921586553676
0.3076760786244242
0.32965294138331164
0.3516298041421991
0.3736066669010865
0.39558352965997395
0.41756039241886145
0.4395372551777489
0.4615141179366363
0.48349098069552376
0.5054678434544112
0.5274447062132986
0.5494215689721861
0.5713984317310735
0.593375294489961
0.6153521572488484
0.6373290200077358
0.6593058827666233
0.6812827455255107
0.7032596082843982
0.7252364710432856
0.747213333802173
0.7691901965610605
0.7911670593199479
0.8131439220788355
0.8351207848377229
0.8570976475966103
0.8790745103554978
0.9010513731143852
0.9230282358732727
0.9450050986321601
0.9669819613910475
0.988958824149935
starting training 6-mers
4400000
(4400001, 3)
0.0
0.0022727272727272726
0.0045454545