<h3> Importing Libraries </h3>

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

<h3> Hyper Parameters </h3>

In [12]:
batch_size = 1
embedding_size = 5
window = 1
positive_number = 1000
negative_number = 1000
hidden_layer_size = 100
num_layers = 2
epochs = 800
lr = 0.02

<h3> Loading +/- Data </h3>

In [13]:
positive_data = pd.read_csv('../deep_annotator_data/positive_sample.txt', header=None, nrows=positive_number)
positive_data.columns = ["Gene"]
negative_data = pd.read_csv('../deep_annotator_data/negative_sample.txt', header=None, nrows=negative_number)
negative_data.columns = ["Gene"]
data_ = pd.concat([positive_data, negative_data])
positive_test_data = pd.read_fwf('positive_sample_test.txt', header = None)
positive_test_data.columns = ["Gene"]
negative_test_data = pd.read_fwf('negative_sample_test.txt', header = None)
negative_test_data.columns = ["Gene"]
data_test = pd.concat([positive_test_data, negative_test_data])

fc_layer_size = (len(positive_data.Gene[0])-(window-1))*embedding_size

<h3> Generate word IDs <h3>

In [14]:
strings = set()
def allLexicographicRecur (string, data, last, index): 
    length = len(string)
    for i in range(length): 
        data[index] = string[i] 
        if index==last:
            res = ''.join(data)
            strings.add(res)
        else: 
            allLexicographicRecur(string, data, last, index+1) 
def allLexicographic(string, n): 
    length = len(string)
    data = [""] * (length+1)
    string = sorted(string) 
    allLexicographicRecur(string, data, window-1, 0)
string = "01234"
allLexicographic(string, window)
strings = sorted(strings)
vocabulary = {}
for val, i in enumerate(strings):
    vocabulary[i] = val

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}


<h3> Generate Word Embeddings </h3>

In [15]:
negative_labels = torch.zeros(positive_number, 1)
positive_labels = torch.ones(negative_number, 1)
labels_ = torch.cat([positive_labels, negative_labels], dim=0)

<h3> Neural Network Layer Implementation </h3>

In [16]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embeds = nn.Embedding(len(vocabulary), embedding_size)
        self.lstm = nn.LSTM(fc_layer_size, fc_layer_size, num_layers)
        self.fc1 = nn.Linear(fc_layer_size, hidden_layer_size)
        self.dropout = nn.Dropout(p=0.4)
        self.relu1 = F.relu
        self.out = nn.Linear(hidden_layer_size, 1)
        self.out_act = nn.Sigmoid()

    def forward(self, x):
        z = self.embeds(x).view((1,-1))
        h = autograd.Variable(torch.randn(num_layers, 1, fc_layer_size))
        c = autograd.Variable(torch.randn(num_layers, 1, fc_layer_size))
        z.unsqueeze_(0)
        z = z.expand(1, 1, fc_layer_size)
        out_lstm, hn = self.lstm(z, (h, c))
        a1 = self.fc1(out_lstm)
        d1 = self.dropout(a1)
        h1 = self.relu1(a1)
        a3 = self.out(h1)
        y = self.out_act(a3)
        return y
net = Net()

<h4> Optimizer step and loss calculation </h4>

In [17]:
opt = optim.SGD(net.parameters(), lr, momentum=0.0)
criterion = nn.BCELoss()

<h4> Train method </h4>

In [18]:
def train_epoch(model, opt, criterion, batch_size=1):
    model.train()
    losses = []
    c = 0
    wrong, correct = 0, 0
    for data in data_.itertuples():
        
        data_batch = torch.tensor([vocabulary[data.Gene[i:i+window]] for i in range(0, len(data.Gene) - window + 1)], dtype=torch.long)
        labels_batch = labels_[c]
        c+=1
        data_batch = autograd.Variable(data_batch)
        labels_batch = autograd.Variable(labels_batch)
        opt.zero_grad()
        labels_hat = net(data_batch)
        loss = criterion(labels_hat, labels_batch)
        loss.backward()
        opt.step()        
        losses.append(loss.data.numpy())
        correct, wrong = get_train_accuracy(labels_hat, c, len(labels_), correct, wrong)
    loss = sum(losses)/len(losses)
    return loss,correct,wrong

In [19]:
def train():
    losses = []
    accuracies = []
    test_acc = []
    for e in tqdm(range(epochs)):
        loss, c, w = train_epoch(net, opt, criterion, batch_size)
        accuracies.append(100*(c/(c+w)))
        losses.append(loss)
        if e % 20 == 0:
            print('Average Loss at epoch,',e,':',loss)
        torch.save(net.state_dict(), 'fc_with_lstm.pt')
        correct, wrong = load_test_model()
        test_acc.append(100*(correct/(correct+wrong)))
    return accuracies, test_acc

<h4> Train Accuracy </h4>

In [20]:
def get_train_accuracy(label, index, data_size, correct, wrong):    
    if index < data_size/2+1:
        if label > 0.5:
            correct += 1
        else:
            wrong += 1
    else:
        if label > 0.5:
            wrong += 1
        else:
            correct += 1
    return correct, wrong

<h4> Test Accuracy </h4>

In [21]:
def load_test_model():
    #load model
    model = Net()
    model.load_state_dict(torch.load('fc_with_lstm.pt'))
    model.eval()
    # load data
    labels_predicted = []
    return test_prediction_model(data_test, labels_predicted)

In [22]:
def test_prediction_model(data_test, labels_predicted):
    correct, wrong = 0, 0
    for data in data_test.itertuples():
        data_testing = torch.tensor([vocabulary[data.Gene[i:i+window]] for i in range(0, len(data.Gene) - window + 1)], dtype=torch.long)
        labels_hat = net(data_testing)
        labels_predicted.append(labels_hat[0])
    for i in range(len(labels_predicted)//2 + 1):
        if labels_predicted[i] > 0.5:
            correct += 1
        else:
            wrong += 1
    for i in range(101, len(labels_predicted)):
        if labels_predicted[i] > 0.5:
            wrong += 1
        else:
            correct += 1
    return correct,wrong
    

In [23]:
def test_prediction_accuracy(labels):
    correct, wrong = 0, 0
    labels_hat = labels
    for i in range(len(labels_hat)//2 + 1):
        if labels_hat[i] > 0.5:
            correct += 1
        else:
            wrong += 1
    for i in range(101, len(labels_hat)):
        if labels_hat[i] > 0.5:
            wrong += 1
        else:
            correct += 1
    return (correct,wrong)

In [None]:
acc,test = train()
print('Test Accuracy:', test[-1])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.plot(test)
plt.plot(acc)
plt.gca().legend('Train', 'Test')
plt.title('Train and Test Plot')

HBox(children=(IntProgress(value=0, max=800), HTML(value='')))

  "Please ensure they have the same size.".format(target.size(), input.size()))


Average Loss at epoch, 0 : 0.20894982039090246
Average Loss at epoch, 20 : 0.027336097672459935
Average Loss at epoch, 40 : 0.017930659443009284
Average Loss at epoch, 60 : 0.0012133881951050683
Average Loss at epoch, 80 : 0.0018187688321592078
Average Loss at epoch, 100 : 0.0016468489405140972
Average Loss at epoch, 120 : 0.00030879225454438596
Average Loss at epoch, 140 : 9.483403942549273e-05
Average Loss at epoch, 160 : 1.534185289601453e-05
Average Loss at epoch, 180 : 8.779977748815782e-06
Average Loss at epoch, 200 : 1.2972289334733488e-05
Average Loss at epoch, 220 : 2.1675792483648593e-06
Average Loss at epoch, 240 : 5.653464590608337e-06
Average Loss at epoch, 260 : 2.263272140730521e-06
Average Loss at epoch, 280 : 1.8478656053630972e-06
Average Loss at epoch, 300 : 1.145303193347047e-06
Average Loss at epoch, 320 : 7.505099087808275e-06
Average Loss at epoch, 340 : 1.933587360078093e-06
Average Loss at epoch, 360 : 3.257642030973784e-06
Average Loss at epoch, 380 : 1.550097