In [2]:
import torch 
from torch import nn
from torch.nn import functional as F
import numpy as np
import pandas as pd
from vocab import Vocab
from dataprocessor import DataProcessor

In [9]:
def get_data(filename):
    """Read a CSV file and return its sequences and `tm` targets.

    Args:
        filename: path (or any source accepted by ``pandas.read_csv``) of a
            CSV file that contains the columns ``protein_sequence`` and ``tm``.

    Returns:
        A ``(sequences, targets)`` tuple holding the ``.values`` of the
        ``protein_sequence`` column and of the ``tm`` column, in file order.

    Raises:
        KeyError: if either of the two columns is missing from the file.
    """
    frame = pd.read_csv(filename)
    sequences = frame['protein_sequence'].values
    targets = frame['tm'].values
    return sequences, targets

# Load the training set: the `protein_sequence` column becomes the inputs and
# the `tm` column becomes the regression targets.
trainx,trainy = get_data('./data/train_fixed.csv')

# DataProcessor is a project-local helper (imported from `dataprocessor`); its
# implementation is not visible in this notebook.
# NOTE(review): presumably `sample(10)` draws 10 (sequence, tm) pairs - confirm
# against the DataProcessor source.
dp = DataProcessor(trainx,trainy)
x,y = dp.sample(10)
# Eyeball the sampled sequences and their targets.
print(x)
print(y)

['MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSNSTNIFFSPVSIATAFAMLSLGTKADTHDEILEGLNFNLTEIPEAQIHEGFQELLRTLNQPDSQLQLTTGNGLFLSEGLKLVDKFLEDVKKLYHSEAFTVNFGDTEEAKKQINDYVEKGTQGKIVDLVKELDRDTVFALVNYIFFKGKWERPFEVKDTEEEDFHVDQVTTVKVPMMKRLGMFNIQHCKKLSSWVLLMKYLGNATAIFFLPDEGKLQHLENELTHDIITKFLENEDRRSASLHLPKLSITGTYDLKSVLGQLGITKVFSNGADLSGVTEEAPLKLSFAVHKAVLTIDEKGTEAAGAMFLEAIPMSIPPEVKFNKPFVFLMIEQNTKSPLFMGKVVNPTQK', 'MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNSKSFADINLYREQIKRVKDSDDVPMVLVGNKCDLPTRTVDTKQAHELAKSYGIPFIETSAKTRQGVEDAFYTLVREIRQYRMKKLNSSDDGTQGCM', 'MSDEEHTFETADAGSSATYPMQCSALRKNGFVVIKSRPCKIVDMSTSKTGKHGHAKVHLVAIDIFTGKKLEDLSPSTHNMEVPVVKRNEYQLLDIDDGFLSLMNMDGDTKDDVKAPEGELGDSLQTAFDEGKDLMVTIISAMGEEAAISFKEAART', 'MFRCRNMVRDNSRNICFGKLAETTTTQQQQQQQQFVVDSSTIINNNNNNNNNNNNQKLKRSTEEPPTNSFERNYYDRTTSRLVTQYQANNSTSLANSNSSPSSVSASASVFATAAGGSSERSRNRDRPYRNGSASVQGGGINSSNTTTTTAACTAGGSGSGAIGTGTGGLVGSGPGGVPQALGDRSSTQNIHQNHQSARVAPPQSWYEAATAATTAQLKSSGGSGNAGAS

In [41]:
class ConvProtein(nn.Module):
    """Convolutional regressor over a 2-D grid of token ids.

    Pipeline: token embedding -> stack of conv blocks -> global max pooling
    -> two-layer MLP ending in ``tanh``, so every prediction lies in
    ``[-1, 1]``.

    Each conv block is ``Conv2d(3x3, stride 1) -> ReLU -> Conv2d(3x3, stride 2)``;
    the stride-2 convolution downsamples both spatial axes.

    Args:
        embeddings: one of

            * a list/tuple whose first two items are ``(n_vocab, emb_dim)``;
              a randomly initialised embedding table of that size is created;
            * a 2-D ``numpy.ndarray`` of pretrained weights with shape
              ``(n_vocab, emb_dim)`` (converted to a float32 tensor);
            * a 2-D ``torch.Tensor`` of pretrained weights, used as given.

            Pretrained weights stay trainable. Index 0 is the padding index
            in every case.
        hidden_sizes: output channel count of each conv block, in order.
            May be empty, in which case the embedded grid is pooled directly.
    """

    def __init__(self, embeddings, hidden_sizes=(32, 64, 128, 256, 512)):
        super().__init__()

        if isinstance(embeddings, (list, tuple)):
            # Only the sizes were given: build a fresh, randomly initialised table.
            n_vocab, emb_dim, *_ = embeddings
            self._embedding = nn.Embedding(n_vocab, emb_dim, padding_idx=0)
        else:
            if isinstance(embeddings, np.ndarray):
                embeddings = torch.from_numpy(embeddings).float()

            _, emb_dim = embeddings.size()
            # Public API for wrapping existing weights; freeze=False keeps the
            # table trainable, exactly as a plain nn.Embedding would be.
            self._embedding = nn.Embedding.from_pretrained(
                embeddings, freeze=False, padding_idx=0
            )

        blocks = []
        in_channels = emb_dim
        for hs in hidden_sizes:
            blocks.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, hs * 2, kernel_size=3, stride=1, padding=1),
                    nn.ReLU(),
                    nn.Conv2d(hs * 2, hs, kernel_size=3, stride=2, padding=1),
                )
            )
            in_channels = hs
        self.conv_layers = nn.ModuleList(blocks)

        # `in_channels` is the channel count leaving the last block (or the
        # embedding size when there are no blocks), so an empty `hidden_sizes`
        # no longer fails on `hidden_sizes[-1]`.
        self._mlp = nn.Sequential(
            nn.Linear(in_channels, 100), nn.ReLU(), nn.Linear(100, 1), nn.Tanh()
        )

    def forward(self, inputs):
        """Predict one value per sample.

        Args:
            inputs: integer token ids of shape ``[batch, d1, d2]``. A trailing
                singleton axis (``[batch, d1, d2, 1]``) is also accepted and
                dropped.

        Returns:
            Tensor of shape ``[batch]`` with values in ``[-1, 1]``.
        """
        if inputs.dim() == 4 and inputs.size(-1) == 1:
            inputs = inputs.squeeze(-1)

        # Embedding yields [batch, d1, d2, emb_dim]; swapping axes 1 and 3 gives
        # the channels-first layout Conv2d needs: [batch, emb_dim, d2, d1].
        # Note that this also swaps the two spatial axes.
        out = self._embedding(inputs).transpose(3, 1)
        for conv in self.conv_layers:
            out = conv(out)
        # Global max pool collapses the spatial axes: [batch, channels, 1, 1].
        out = F.adaptive_max_pool2d(out, (1, 1))
        out = out.flatten(1)          # -> [batch, channels]
        out = self._mlp(out)          # -> [batch, 1]
        return out.squeeze(-1)        # -> [batch]

In [42]:
# Smoke test: build an untrained model and push one random batch through it to
# check that the layers fit together and that one value per sample comes out.
N_VOCAB, EMB_DIM = 21, 32        # embedding table size; id 0 is the padding index
BATCH_SIZE, GRID_SIZE = 13, 60   # 13 samples, each a 60x60 grid of token ids

model = ConvProtein([N_VOCAB, EMB_DIM])

# Ids are drawn from [1, N_VOCAB), so the padding index 0 never appears.
inputs = torch.randint(1, N_VOCAB, size=[BATCH_SIZE, GRID_SIZE, GRID_SIZE])

out = model(inputs)
assert out.shape == (BATCH_SIZE,), out.shape

print(out)

tensor([0.0886, 0.0887, 0.0886, 0.0887, 0.0886, 0.0887, 0.0888, 0.0887, 0.0887,
        0.0888, 0.0886, 0.0887, 0.0887], grad_fn=<SqueezeBackward1>)
