In [48]:
import os

import numpy as np
import pandas as pd
import torch
import torchtext

from torch.utils.data import Dataset
from torch.nn.utils.rnn import pack_padded_sequence

In [2]:
# Location of the training shards: <current working directory>/data/train.
_TRAIN_SUBDIRS = ("data", "train")
TRAINDIR = os.path.join(os.getcwd(), *_TRAIN_SUBDIRS)

In [4]:
# Read a single training shard (number 0 of 80, going by its file name) and
# preview the first rows.
first_shard_path = os.path.join(TRAINDIR, "data-00000-of-00080")
pfam_train_df = pd.read_csv(first_shard_path)
pfam_train_df.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,GMC_oxred_C,A4WZS5_RHOS5/416-539,PF05199.13,PHPE.SRIRLST.RRDAHGMP.....IP.RIESRLGP............,PHPESRIRLSTRRDAHGMPIPRIESRLGPDAFARLRFMARTCRAIL...
1,DUF2887,K9QI92_9NOSO/3-203,PF11103.8,RDSIYYQIFKRFPALIFEL..VD.NRPPQAQNYRFESVEVKETAFR...,RDSIYYQIFKRFPALIFELVDNRPPQAQNYRFESVEVKETAFRIDG...
2,zf-IS66,Q92LC9_RHIME/32-75,PF13005.7,.TCCPDCGG.E..LRLVGED.AS....EILDMI.AAQMKVIEVARL...,TCCPDCGGELRLVGEDASEILDMIAAQMKVIEVARLKKSCRCCE
3,Asp_decarbox,X2GQZ4_9BACI/1-115,PF02261.16,MLRMMMNSKIHRATVTEADLNYVGSITIDEDILDAVGMLPNEKVHI...,MLRMMMNSKIHRATVTEADLNYVGSITIDEDILDAVGMLPNEKVHI...
4,Filamin,A7SQM3_NEMVE/342-439,PF00630.19,TACPKQ.CTA....RGLG.............LK.AAPVT.QPT..R...,TACPKQCTARGLGLKAAPVTQPTRFVVILNDCHGQPLGRSEGELEV...


In [7]:
# One-letter residue codes; a residue's position in this string is its integer token.
ALPHABET = "ARNDCQEGHILKMFPSTWYV"
# Lookup table: residue letter -> token index (0 .. len(ALPHABET) - 1).
AMINOACIDS = dict(zip(ALPHABET, range(len(ALPHABET))))

In [10]:
def encode_sequence(sequence, aminoacid_lookup, N=70, unknown_index=21, pad_index=None):
    """Encode a residue sequence as integer tokens, keeping at most ``N`` residues.

    Parameters
    ----------
    sequence : str
        Residue symbols to encode (any sliceable sequence of symbols works).
    aminoacid_lookup : dict
        Maps a residue symbol to its integer token.
    N : int, default 70
        Maximum number of residues to keep; anything beyond is dropped.
    unknown_index : int, default 21
        Token used for symbols missing from ``aminoacid_lookup``.
    pad_index : int or None, default None
        When ``None`` the result has ``min(N, len(sequence))`` entries.
        When given, the result always has exactly ``N`` entries and positions
        past the sequence are filled with ``pad_index``.

    Returns
    -------
    encoded_sequence : numpy.ndarray of int
        The token array described above.
    sequence_length : int
        Number of real (non-padding) tokens, ``min(N, len(sequence))``.

    Raises
    ------
    ValueError
        If ``N`` is negative.
    """
    if N < 0:
        raise ValueError("N must be non-negative, got %r" % (N,))

    sequence_length = min(N, len(sequence))
    # Only the residues that are kept get looked up. The result is cut at N:
    # sizing the buffer by len(sequence) would leave every position past N as
    # a -1 placeholder, which is not a valid token index.
    tokens = [aminoacid_lookup.get(aa, unknown_index) for aa in sequence[:sequence_length]]

    if pad_index is None:
        encoded_sequence = np.array(tokens, dtype=int)
    else:
        encoded_sequence = np.zeros(N, dtype=int) + pad_index
        encoded_sequence[:sequence_length] = tokens

    return encoded_sequence, sequence_length

In [21]:
# Upper bound on the number of residues encoded per sequence.
MAX_SEQUENCE_LENGTH = 600

# Encode each sequence once and keep BOTH values encode_sequence returns.
# The returned length is min(MAX_SEQUENCE_LENGTH, len(sequence)); a plain
# len(sequence) is not capped and would disagree with the encoding for
# sequences longer than the limit.
_encoded_pairs = pfam_train_df["sequence"].apply(
    lambda s: encode_sequence(s, AMINOACIDS, MAX_SEQUENCE_LENGTH)
)
pfam_train_df["encoded_sequence"] = _encoded_pairs.apply(lambda pair: pair[0])
pfam_train_df["sequence_length"] = _encoded_pairs.apply(lambda pair: pair[1])

In [45]:
class PfamDataset(Dataset):
    """Map-style dataset yielding ``(tokens, family_id, length)`` for one example.

    Parameters
    ----------
    sequences : sequence of numpy.ndarray
        One integer-encoded sequence per example; each item must support
        ``.astype`` and ``len``. May be a pandas Series, list or array.
    family_id : sequence
        One label per example, returned unchanged.

    Raises
    ------
    ValueError
        If ``sequences`` and ``family_id`` differ in length.
    """

    def __init__(self, sequences, family_id):
        if len(sequences) != len(family_id):
            raise ValueError(
                "sequences and family_id must have the same length, got %d and %d"
                % (len(sequences), len(family_id))
            )
        self.sequences = sequences
        self.family_id = family_id

    def __len__(self):
        return len(self.family_id)

    @staticmethod
    def _item_at(values, position):
        """Return the element at integer ``position``, ignoring pandas index labels."""
        # On a pandas Series with an integer index, values[position] is a
        # *label* lookup; once rows have been filtered, shuffled or split the
        # labels no longer run 0..n-1, so positional access must use .iloc.
        if hasattr(values, "iloc"):
            return values.iloc[position]
        return values[position]

    def __getitem__(self, idx):
        """Return ``(int32 token tensor, label, number of tokens)`` for example ``idx``."""
        sequence = self._item_at(self.sequences, idx)
        label = self._item_at(self.family_id, idx)
        tokens = torch.from_numpy(sequence.astype(np.int32))
        return tokens, label, len(sequence)

In [47]:
# Feed the integer-encoded arrays, not the raw strings: PfamDataset.__getitem__
# calls .astype(np.int32) on every item, which a str does not provide.
# NOTE(review): family_id holds string labels, while the training loop calls
# y.long() on the batch labels - confirm where labels are mapped to integer
# class indices.
train_df = PfamDataset(pfam_train_df["encoded_sequence"], pfam_train_df["family_id"])

In [None]:
class LSTM(torch.nn.Module):
    """LSTM classifier over batches of integer-encoded, variable-length sequences.

    Parameters
    ----------
    embedding_dim : int, default 8
        Size of each token embedding.
    hidden_dim : int, default 5
        Size of the LSTM hidden state.
    dropout_ratio : float, default 0.3
        Dropout probability applied to the embedded tokens.
    vocab_size : int, default 22
        Number of rows in the embedding table, i.e. valid token indices are
        ``0 .. vocab_size - 1``.
    padding_idx : int, default 21
        Token index used for padding; its embedding row is initialised to zero
        and receives no gradient. ``torch.nn.Embedding`` requires
        ``padding_idx < vocab_size``, so a 21-row table cannot hold index 21.
    num_classes : int, default 5
        Number of output logits per sequence.
    """

    def __init__(self, embedding_dim=8, hidden_dim=5, dropout_ratio=0.3,
                 vocab_size=22, padding_idx=21, num_classes=5):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Fully-qualified torch.nn names: the notebook imports `torch` but
        # never binds an `nn` alias.
        self.dropout = torch.nn.Dropout(dropout_ratio)
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x, s):
        """Return class logits for a batch.

        Parameters
        ----------
        x : torch.Tensor
            Integer token indices, batch dimension first.
        s : torch.Tensor or list of int
            Number of real (non-padding) tokens of every batch element.

        Returns
        -------
        torch.Tensor
            One row of ``num_classes`` logits per batch element.
        """
        embedded = self.dropout(self.embeddings(x))
        # pack_padded_sequence needs the lengths on the CPU when given a tensor.
        lengths = s.cpu() if isinstance(s, torch.Tensor) else s
        packed = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (ht, _) = self.lstm(packed)
        # ht[-1] is the final hidden state of the last LSTM layer for each sequence.
        return self.linear(ht[-1])

In [32]:
# NOTE(review): model, lr, epochs, train_dl, val_dl and validation_metrics are
# not defined in the visible part of this notebook; on a fresh kernel this cell
# only runs if earlier cells define them.

# Hand the optimizer only the parameters that require gradients.
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=lr)
for epoch in range(epochs):
    model.train()  # (re-)enter training mode at the start of every epoch
    sum_loss = 0.0
    total = 0
    for x, y, lengths in train_dl:
        x = x.long()
        y = y.long()
        optimizer.zero_grad()
        y_pred = model(x, lengths)
        # Spelled out in full because the import cell binds no `F` alias for
        # torch.nn.functional.
        loss = torch.nn.functional.cross_entropy(y_pred, y)
        loss.backward()
        optimizer.step()
        # cross_entropy averages over the batch; weight by batch size so that
        # sum_loss / total is the mean loss per example.
        batch_size = y.shape[0]
        sum_loss += loss.item()*batch_size
        total += batch_size
    val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
    # Report on epochs 1, 6, 11, ...
    if epoch % 5 == 1:
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

array([0.0085885])