# Looking at adding_model.py and setup.py

In `setup.py`, one entry point is `tape-embed = tape.main:run_embed`, which leads to `training.run_embed`:

```
dataset = task_spec.dataset(data_file, tokenizer=tokenizer) > 
valid_loader = utils.setup_loader(dataset, batch_size, local_rank, n_gpu, 1, num_workers)
```

Following these calls, I ended up at `tape.datasets`.



In [1]:
from tape import datasets
datasets.dataset_factory

<function tape.datasets.dataset_factory(data_file: Union[str, pathlib.Path], *args, **kwargs) -> torch.utils.data.dataset.Dataset>

In [2]:
sprot = datasets.dataset_factory(data_file='../data/uniprot_sprot.fasta')

In [3]:
sprot[1]

{'id': 'sp|Q6GZX3|002L_FRG3G',
 'primary': 'MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW',
 'protein_length': 320}

In [4]:
import torch
from tape import ProteinOneHotModel, TAPETokenizer, ProteinLSTMModel, ProteinLSTMConfig
tokenizer = TAPETokenizer(vocab='iupac')  # iupac is the vocab for TAPE models, use unirep for the UniRep model
config=ProteinLSTMConfig()

In [5]:
config

{
  "finetuning_task": null,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "input_size": 128,
  "num_hidden_layers": 3,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "vocab_size": 30
}

In [6]:
model=ProteinLSTMModel(config= config)

In [7]:
model

ProteinLSTMModel(
  (embed_matrix): Embedding(30, 128)
  (encoder): ProteinLSTMEncoder(
    (forward_lstm): ModuleList(
      (0): ProteinLSTMLayer(
        (dropout): Dropout(p=0.0, inplace=False)
        (lstm): LSTM(128, 1024, batch_first=True)
      )
      (1): ProteinLSTMLayer(
        (dropout): Dropout(p=0.1, inplace=False)
        (lstm): LSTM(1024, 1024, batch_first=True)
      )
      (2): ProteinLSTMLayer(
        (dropout): Dropout(p=0.1, inplace=False)
        (lstm): LSTM(1024, 1024, batch_first=True)
      )
    )
    (reverse_lstm): ModuleList(
      (0): ProteinLSTMLayer(
        (dropout): Dropout(p=0.0, inplace=False)
        (lstm): LSTM(128, 1024, batch_first=True)
      )
      (1): ProteinLSTMLayer(
        (dropout): Dropout(p=0.1, inplace=False)
        (lstm): LSTM(1024, 1024, batch_first=True)
      )
      (2): ProteinLSTMLayer(
        (dropout): Dropout(p=0.1, inplace=False)
        (lstm): LSTM(1024, 1024, batch_first=True)
      )
    )
  )
  (pooler): 

In [8]:
# Pfam Family: Hexapep, Clan: CL0536
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
token_ids = torch.tensor([tokenizer.encode(sequence)])
token_ids

tensor([[ 2, 11,  7, 23, 25,  9,  8, 21,  7, 15, 13, 11, 16, 11,  5, 13, 15, 15,
         17, 11,  7, 25, 13, 11, 22, 11, 22, 15, 25,  5,  5, 11,  5, 15, 13, 23,
         20,  3]])

In [9]:
output = model(token_ids)
sequence_output = output[0]
pooled_output = output[1]

In [10]:
sequence_output

tensor([[[ 0.0110, -0.0019, -0.0101,  ..., -0.0172, -0.0082, -0.0149],
         [ 0.0172, -0.0028, -0.0153,  ..., -0.0170, -0.0087, -0.0157],
         [ 0.0209, -0.0033, -0.0185,  ..., -0.0183, -0.0081, -0.0160],
         ...,
         [ 0.0246, -0.0063, -0.0225,  ..., -0.0184, -0.0034, -0.0124],
         [ 0.0248, -0.0057, -0.0221,  ..., -0.0166, -0.0016, -0.0125],
         [ 0.0241, -0.0060, -0.0226,  ..., -0.0103,  0.0001, -0.0086]]],
       grad_fn=<CatBackward>)

In [11]:
pooled_output

tensor([[ 1.0810e-03,  9.1288e-06,  3.6752e-04,  ..., -3.5943e-04,
          5.1847e-04, -9.5619e-04]], grad_fn=<TanhBackward>)

## One-hot model

In [12]:
from tape import ProteinOneHotModel, ProteinOneHotConfig
tokenizer = TAPETokenizer(vocab='iupac')  # iupac is the vocab for TAPE models, use unirep for the UniRep model
config=ProteinOneHotConfig(vocab_size= 30) # note: vocab_size has to be specified explicitly here
config

{
  "finetuning_task": null,
  "initializer_range": 0.02,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "use_evolutionary": false,
  "vocab_size": 30
}

In [13]:
model=ProteinOneHotModel(config = config)
model

ProteinOneHotModel()

In [14]:
output = model(token_ids)
sequence_output = output[0]
pooled_output = output[1] # probably not meaningful here: judging by the output below, it looks like just the mean one-hot vector (token frequencies)

In [15]:
sequence_output

tensor([[[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [16]:
pooled_output

tensor([[0.0000, 0.0000, 0.0263, 0.0263, 0.0000, 0.1053, 0.0000, 0.0789, 0.0263,
         0.0263, 0.0000, 0.1842, 0.0000, 0.1053, 0.0000, 0.1316, 0.0263, 0.0263,
         0.0000, 0.0000, 0.0263, 0.0263, 0.0526, 0.0526, 0.0000, 0.0789, 0.0000,
         0.0000, 0.0000, 0.0000]])

In [17]:
output

(tensor([[[0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]),
 tensor([[0.0000, 0.0000, 0.0263, 0.0263, 0.0000, 0.1053, 0.0000, 0.0789, 0.0263,
          0.0263, 0.0000, 0.1842, 0.0000, 0.1053, 0.0000, 0.1316, 0.0263, 0.0263,
          0.0000, 0.0000, 0.0263, 0.0263, 0.0526, 0.0526, 0.0000, 0.0789, 0.0000,
          0.0000, 0.0000, 0.0000]]))