https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [1]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f0b8896ad70>

In [2]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[-0.0187,  0.1713, -0.2944]],

        [[-0.3521,  0.1026, -0.2971]],

        [[-0.3191,  0.0781, -0.1957]],

        [[-0.1634,  0.0941, -0.1637]],

        [[-0.3368,  0.0959, -0.0538]]], grad_fn=<MkldnnRnnLayerBackward0>)
(tensor([[[-0.3368,  0.0959, -0.0538]]], grad_fn=<StackBackward0>), tensor([[[-0.9825,  0.4715, -0.0633]]], grad_fn=<StackBackward0>))


In [4]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = [
    # Tags are: DET - determiner; NN - noun; V - verb
    # For example, the word "The" is a determiner
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}  # Assign each tag with a unique index

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [5]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        if not self.training:
            print(embeds)
            print(f'embed {embeds.dtype}')
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        #print(lstm_out.size())
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        #print(tag_space.size())
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [6]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.train()
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        print(f'tag_scores size {tag_scores.size()}')
        print(f'targets {targets.size()}')
        print(targets)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    model.eval()
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-1.1389, -1.2024, -0.9693],
        [-1.1065, -1.2200, -0.9834],
        [-1.1286, -1.2093, -0.9726],
        [-1.1190, -1.1960, -0.9916],
        [-1.0137, -1.2642, -1.0366]])
tag_scores size torch.Size([5, 3])
targets torch.Size([5])
tensor([0, 1, 2, 0, 1])
tag_scores size torch.Size([4, 3])
targets torch.Size([4])
tensor([1, 2, 0, 1])
tag_scores size torch.Size([5, 3])
targets torch.Size([5])
tensor([0, 1, 2, 0, 1])
tag_scores size torch.Size([4, 3])
targets torch.Size([4])
tensor([1, 2, 0, 1])
tag_scores size torch.Size([5, 3])
targets torch.Size([5])
tensor([0, 1, 2, 0, 1])
tag_scores size torch.Size([4, 3])
targets torch.Size([4])
tensor([1, 2, 0, 1])
tag_scores size torch.Size([5, 3])
targets torch.Size([5])
tensor([0, 1, 2, 0, 1])
tag_scores size torch.Size([4, 3])
targets torch.Size([4])
tensor([1, 2, 0, 1])
tag_scores size torch.Size([5, 3])
targets torch.Size([5])
tensor([0, 1, 2, 0, 1])
tag_scores size torch.Size([4, 3])
targets torch.Size([4])
tensor([1, 2, 0, 1])

# LSTM HGT classifier

In [7]:
from sklearn import preprocessing
import pandas as pd
import numpy as np
import os
import sys
import torch.nn as nn
#from torch.utils import data
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn.utils.rnn as rnn
from torch import autograd

# making my dataset more like the pipeline at works

warning.... thisis dataset cast data and length as float32! is this alright?!

In [14]:
class HGTDBDatasetSequential(torch.utils.data.Dataset):
    def __init__(self, data_type, partition_file, partition, drop_na=False):
        
        if partition not in ['train','test','valid']:
            raise ValueError('not partition or train, test or valid!')
        
        self.partition = partition
        self.data_type = data_type
        self.partition_file = partition_file
        self.min_max_scaler = preprocessing.MinMaxScaler()
        # self.one_hot_encoder = preprocessing.OneHotEncoder()
        self.drop_na = drop_na
        self.null_count = 0
        self.na_count = 0
        
        self.max_sequence = 0
        
        # load all in ram perhaps?
        self.data_x = []
        self.data_y = []
        self.data_seq_length = []
        
        
        self._init_dataset()
        

    
    def _load_single_file(self, species, data_type):
        '''
        replace this shit
        some of this shit is still legacy!!!
        '''

        preprocessed_path = "data/HGTDB/preprocessed_data"
        
        csv_list = os.listdir(preprocessed_path)
        
    
        if species is not None:
            csv_file = None
            for path in csv_list:
                # check if current path is a file
                if os.path.basename(path).replace(".csv", "") == species:
                    csv_file =os.path.join(preprocessed_path, path)
                    df = pd.read_csv(csv_file, index_col='ID')
            if csv_file is None:
                raise ValueError(f'{species} not found')
        else:
            raise ValueError(f'{species} not found')
            
        # set dataset according to data type
        if data_type == 'A':
            df = df.drop(columns=["FunctionCode","Strand","AADev","Length","SD1","SD2","SD3","SDT","Mah"]) # only GC1,GC2,GC3,GCT

        
        # count nulls!
        #df.bfill(inplace=True)
        self.null_count +=df.isnull().sum().sum()
        self.na_count +=df.isna().sum().sum()
        if df.isna().sum().sum() >0:
            print(df[df.isna().any(axis=1)])
            
        df = df.bfill(axis='columns')
        #for column in df.columns:
        #    df[column] = df[column].fillna(0)
        self.null_count +=df.isnull().sum().sum()
        self.na_count +=df.isna().sum().sum()
        if df.isna().sum().sum() >0:
            print(df[df.isna().any(axis=1)])
            

        
        
        #df.dropna(inplace=True)
        
        #after replacing nan
        #null_count = 0
        #na_count = 0
        #null_count +=df.isnull().sum().sum()
        #na_count +=df.isna().sum().sum()
        #print(null_count)
        #print(na_count)
        
        # return as numpy array
        # labels are not affected since there is only two options 0,1
        df=(df-df.min())/(df.max()-df.min())
        array = df.values
        x = array[:,0:-1]
        y = array[:,-1]
        y = np.expand_dims(y, axis=1)
        #OHE = preprocessing.OneHotEncoder()
        #OHE.fit(y)
        #y = OHE.transform(y).toarray()
        return x,y 
        
    
    def _init_dataset(self):
        '''
        '''
        partition_frame = pd.read_csv(self.partition_file)
        partition_frame = partition_frame[partition_frame['partition']==self.partition].reset_index(drop=True)
        
        for i in range(len(partition_frame)):
            x,y = self._load_single_file(partition_frame.loc[i,'file'], self.data_type)
            
            if self.max_sequence<len(x):
                self.max_sequence = len(x)
                
            
            self.data_x.append(torch.from_numpy(np.float32(x)))
            self.data_y.append(torch.from_numpy(np.float32(y)))
            self.data_seq_length.append(torch.tensor(len(x)))
            
        # pad 'sequences'
        # padded stuff are tagged as 0!
        self.data_x = pad_sequence(self.data_x, batch_first=True)
        self.data_y = pad_sequence(self.data_y, batch_first=True)

    def __getitem__(self, ind):
        datum = self.data_x[ind]
        label = self.data_y[ind]
        seq_length = self.data_seq_length[ind]
        
        output = {
            "datum" : datum,
            "seq_length" : seq_length,
            "label" : label
        }
        return output
    
    def __len__(self):
        return len(self.data_x)
        
        
            

In [15]:
#hgtdb_train = HGTDBDatasetSequential('A','partition_file/HGTDB_firmicutes.csv', 'train')
#print(hgtdb_train.max_sequence)
#hgtdb_test = HGTDBDatasetSequential('A','partition_file/HGTDB_firmicutes.csv', 'test')
#print(hgtdb_test.max_sequence)

#hgtdb_train = HGTDBDatasetSequential('A','partition_file/HGTDB_ALL_trisplit.csv', 'train')
#print(hgtdb_train.max_sequence)
#hgtdb_valid = HGTDBDatasetSequential('A','partition_file/HGTDB_ALL_trisplit.csv', 'valid')
#print(hgtdb_valid.max_sequence)
#hgtdb_test = HGTDBDatasetSequential('A','partition_file/HGTDB_ALL_trisplit.csv', 'test')
#print(hgtdb_test.max_sequence)


hgtdb_train = HGTDBDatasetSequential('A','partition_file/HGTDB_firmicutes_trisplit.csv', 'train')
print(hgtdb_train.max_sequence)
hgtdb_valid = HGTDBDatasetSequential('A','partition_file/HGTDB_firmicutes_trisplit.csv', 'valid')
print(hgtdb_valid.max_sequence)
hgtdb_test = HGTDBDatasetSequential('A','partition_file/HGTDB_firmicutes_trisplit.csv', 'test')
print(hgtdb_test.max_sequence)

         GC1  GC2  GC3  GCT  HGT
ID                              
SCO3176  NaN  NaN  NaN  NaN    0
4187
782
2660


In [16]:
train_loader = torch.utils.data.DataLoader(dataset=hgtdb_train,batch_size=2,shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=hgtdb_valid,batch_size=2,shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=hgtdb_test,batch_size=2,shuffle=True)

checking if there is nan inside the dataloaders

In [17]:
for (idx, data) in enumerate(train_loader):
    print(torch.isnan(data['datum']).any())

tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)


In [18]:
for (idx, data) in enumerate(valid_loader):
    print(torch.isnan(data['datum']).any())

tensor(False)
tensor(False)


In [19]:
for (idx, data) in enumerate(test_loader):
    print(torch.isnan(data['datum']).any())

tensor(False)
tensor(False)


none of my dataset has nan!

based on examples -> sequence x embedding size

our case -> minibatch x sequence x embedding size

In [20]:
for (idx, data) in enumerate(train_loader):
    #print(data)
    print(data['datum'].size())
    input = rnn.pack_padded_sequence(data['datum'], lengths=data['seq_length'], batch_first=True, enforce_sorted=False)
    #print(input)
    print(data['label'].size())
    

torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])
torch.Size([2, 4187, 4])
torch.Size([2, 4187, 1])


tag_scores is sigmoided... is this correct?

modified as binary classifier. Inspired by https://machinelearningmastery.com/building-a-binary-classification-model-in-pytorch/

In [21]:
class LSTMHGTTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(LSTMHGTTagger, self).__init__()
        self.last_epoch= 0
        self.hidden_dim = hidden_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        
        self.relu = nn.ReLU()
        self.hidden2hidden=nn.Linear(hidden_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, input):
        
        data, seq_length = input
        
        # pack padded sequence.. exp from work
        input = rnn.pack_padded_sequence(data, lengths=seq_length, batch_first=True, enforce_sorted=False)
        
        #input to model
        lstm_out, _ = self.lstm(input)
        
        # unpack
        # apparently this unpacks them? https://gist.github.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec
        output, input_sizes = rnn.pad_packed_sequence(lstm_out, batch_first=True)
        
        # to hidden, softmax and sigmoid!
        output = self.relu(self.hidden2hidden(output))
        output = torch.sigmoid(self.hidden2tag(output))
        
        # tag_scores = F.log_softmax(tag_space, dim=1)
        # tag_scores = F.softmax(tag_space, dim=1)
        return output, input_sizes
        #return tag_scores, input_sizes

In [22]:
class LSTMHGTTagger_v2(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(LSTMHGTTagger_v2, self).__init__()
        self.last_epoch= 0
        self.hidden_dim = hidden_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.50)
        self.relu = nn.ReLU()
        self.hidden2hidden=nn.Linear(hidden_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, input):
        
        data, seq_length = input
        
        # pack padded sequence.. exp from work
        input = rnn.pack_padded_sequence(data, lengths=seq_length, batch_first=True, enforce_sorted=False)
        
        #input to model
        lstm_out, _ = self.lstm(input)
        
        # unpack
        # apparently this unpacks them? https://gist.github.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec
        output, input_sizes = rnn.pad_packed_sequence(lstm_out, batch_first=True)
        
        output = self.dropout(output)
        output = self.relu(self.hidden2hidden(output))
        output = torch.sigmoid(self.hidden2tag(output))
        
        # tag_scores = F.log_softmax(tag_space, dim=1)
        # tag_scores = F.softmax(tag_space, dim=1)
        return output, input_sizes
        #return tag_scores, input_sizes

In [23]:
class LSTMHGTTagger_v3(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(LSTMHGTTagger_v3, self).__init__()
        
        self.last_epoch= 0
        self.hidden_dim = hidden_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.25)
        # self.relu = nn.ReLU()
        self.act_func = nn.Sigmoid()
        self.hidden2hidden=nn.Linear(hidden_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, input):
        
        data, seq_length = input
        
        # pack padded sequence.. exp from work
        input = rnn.pack_padded_sequence(data, lengths=seq_length, batch_first=True, enforce_sorted=False)
        
        #input to model
        lstm_out, _ = self.lstm(input)
        
        # unpack
        # apparently this unpacks them? https://gist.github.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec
        output, input_sizes = rnn.pad_packed_sequence(lstm_out, batch_first=True)
        
        output = self.dropout(output)
        output = self.act_func(self.hidden2hidden(output))
        output = self.act_func(self.hidden2tag(output))
        
        # tag_scores = F.log_softmax(tag_space, dim=1)
        # tag_scores = F.softmax(tag_space, dim=1)
        return output, input_sizes
        #return tag_scores, input_sizes

is BCE correct?

In [24]:
# model = LSTMHGTTagger(4, 100, 2)
#model = LSTMHGTTagger(4, 100, 1)
#model = LSTMHGTTagger_v2(4,100,1)
model = LSTMHGTTagger_v3(4,100,1)

# loss_function = nn.NLLLoss()
loss_function = nn.BCELoss()
#loss_function = nn.CrossEntropyLoss
# optimizer = optim.SGD(model.parameters(), lr=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.00001)

In [25]:
from sklearn.metrics import classification_report, accuracy_score

why does size of tag scores and data label have different sizes?
datasethas nan issues!
current solution to replace nan is not working as expected!!

In [26]:
def test_model(model,dataloader):
    y_true_list = []
    y_pred_list = []
    
    for (idx, data) in enumerate(dataloader):
        model.eval()
        with torch.no_grad():
            inputs = (data['datum'], data['seq_length'])
            output, input_sizes = model(inputs)
            
            targets = rnn.pack_padded_sequence(data['label'], lengths=data['seq_length'], batch_first=True, enforce_sorted=False)
            targets,_ = rnn.pad_packed_sequence(targets, batch_first=True)
            
            #accuracy = (output.round() == targets).float().mean()
            #print(accuracy)
            

        targets = torch.squeeze(targets,2)
        # the squeeze here is needed and fine
        output = torch.squeeze(output,2)
        
        # this one not sure! it takes away the batch!
        

        for i in range(targets.size(0)):
            y_true_list = [*y_true_list, *targets[i].detach().numpy()]
            y_pred_list = [*y_pred_list, *output[i].detach().round().numpy()]

    #print(len(y_pred_list))
    #print(len(y_true_list))
    acc = accuracy_score(y_true_list, y_pred_list)
    print(f'test acc: {acc}')
    
    #print(np.isnan(y_pred_list))
    #try:
    #    acc = accuracy_score(y_true_list, y_pred_list)
    #except:
    #    print(y_pred_list)
    #    print(y_true_list)
    #print(acc)
    #report = classification_report(y_true_list, y_pred_list)      
    #pred_ones = (preds_list.round() == 1.).float().sum()
    #actual_ones = (targets_list == 1.).float().sum()
    #print(f'pred hgt: {pred_ones}')
    #print(f'actual hgt: {actual_ones}')
    #print(f'percentage {pred_ones/actual_ones}')

autograd.detect_anomaly
with autograd.detect_anomaly():

In [27]:
def train_model(model,loss_function, optimizer, train_loader, valid_loader, epochs=10):
    for epoch in range(epochs):
        model.last_epoch += 1
        print(f'training epoch:{model.last_epoch}')
        acc_loss = 0.
        for (idx, data) in enumerate(train_loader):
            # print(idx)
            model.train()
            model.zero_grad()
            
            
            inputs = (data['datum'], data['seq_length'])
            output,_ = model(inputs)
            
            # bad code but it works for now
            # it basically packs padded and unpad them....
            targets = rnn.pack_padded_sequence(data['label'], lengths=data['seq_length'], batch_first=True, enforce_sorted=False)
            targets,_ = rnn.pad_packed_sequence(targets, batch_first=True)
            
            
            #print(output.size())
            #print(targets.size())
            
            
            #loss = loss_function(output, targets)
            #loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
            
            try: 
                loss = loss_function(output, targets)
                loss.backward()
            except:
                print('something broken')
                #with autograd.detect_anomaly():
                #    inputs = (data['datum'], data['seq_length'])
                #    output,_ = model(inputs)
                #    #targets = rnn.unpad_sequence(data['label'],data['seq_length'], batch_first=True)
                #    loss = loss_function(output, targets)
                #    loss.backward()
                    
                #print(output.max())
                break
            
            optimizer.step()
            acc_loss += loss.item()

        test_model(model, valid_loader)
            
        print(f'acculumative loss: {acc_loss/len(train_loader)}')
    

In [28]:
train_model(model, loss_function, optimizer, train_loader, valid_loader, 100)

training epoch:1
test acc: 0.03506435863293387
acculumative loss: 0.8917982512050204
training epoch:2
test acc: 0.03506435863293387
acculumative loss: 0.8889279166857401
training epoch:3
test acc: 0.03657407407407407
acculumative loss: 0.8852854702207777
training epoch:4
test acc: 0.03627180899908173
acculumative loss: 0.8828771776623197
training epoch:5
test acc: 0.03506435863293387
acculumative loss: 0.878672911061181
training epoch:6
test acc: 0.03506435863293387
acculumative loss: 0.8750120997428894
training epoch:7
test acc: 0.03627180899908173
acculumative loss: 0.871995316611396
training epoch:8


KeyboardInterrupt: 

# Partition File

In [35]:
import pandas as pd

only firmicutes no validations

In [None]:
partition_frame = pd.read_csv('partition_file/HGTDB_firmicutes.csv')

In [125]:
partition_frame

Unnamed: 0,file,partition
0,cperf,test
1,tteng,test
2,mgen,test
3,mpneu,test
4,mpul,test
5,uure,test
6,cglu,train
7,mtub,train
8,mtub2,train
9,mlep,train


In [120]:
hgtdb_train[5]

{'datum': tensor([[0.6340, 0.4471, 0.4516, 0.5343],
         [0.6404, 0.4801, 0.3985, 0.5286],
         [0.7404, 0.4107, 0.5901, 0.6314],
         ...,
         [0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000]], dtype=torch.float64),
 'seq_length': 4111,
 'label': tensor([0., 0., 0.,  ..., 0., 0., 0.], dtype=torch.float64)}

In [30]:
import os

In [31]:
ROOT_FOLDER = "data/HGTDB/preprocessed_data"

In [32]:
downloaded_files = []
for path in os.listdir(ROOT_FOLDER):
    # check if current path is a file
    if os.path.isfile(os.path.join(ROOT_FOLDER, path)):
        downloaded_files.append(path.replace('.csv', ''))

In [33]:
training_eidx = int(6/8*len(downloaded_files))
valid_eidx = int(7/8*len(downloaded_files))
test_eidx = int(8/8*len(downloaded_files))

partition_list_of_dict = []
for i,f in enumerate(downloaded_files):
    
    if i < training_eidx:
        partition = 'train'
    elif i>= training_eidx and i<valid_eidx:
        partition = 'valid'
    else:
        partition = 'test'
    temp = {
        "file" : f,
        "partition": partition
    }
    partition_list_of_dict.append(temp)

In [34]:
partition_list_of_dict

[{'file': 'spneu1', 'partition': 'train'},
 {'file': 'llac', 'partition': 'train'},
 {'file': 'ypestis', 'partition': 'train'},
 {'file': 'fnucl', 'partition': 'train'},
 {'file': 'bbur', 'partition': 'train'},
 {'file': 'sent', 'partition': 'train'},
 {'file': 'sau2', 'partition': 'train'},
 {'file': 'cjen', 'partition': 'train'},
 {'file': 'bsub', 'partition': 'train'},
 {'file': 'atum2c1', 'partition': 'train'},
 {'file': 'ecoli3', 'partition': 'train'},
 {'file': 'vvul2c2', 'partition': 'train'},
 {'file': 'bhal', 'partition': 'train'},
 {'file': 'xcamp', 'partition': 'train'},
 {'file': 'vcolc2', 'partition': 'train'},
 {'file': 'cpneu', 'partition': 'train'},
 {'file': 'xcitri', 'partition': 'train'},
 {'file': 'sau3', 'partition': 'train'},
 {'file': 'vvul1c2', 'partition': 'train'},
 {'file': 'pmul', 'partition': 'train'},
 {'file': 'synecho', 'partition': 'train'},
 {'file': 'rconorii', 'partition': 'train'},
 {'file': 'bmelic1', 'partition': 'train'},
 {'file': 'dra1', 'parti

In [36]:
df = pd.DataFrame.from_dict(partition_list_of_dict)

In [39]:
df.to_csv('HGTDB_ALL_trisplit.csv', index=False)

In [84]:
10%0

ZeroDivisionError: integer modulo by zero