In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd # For csv
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import collections
import requests

## Protein Disorder Prediction

### Data set-up

Mount Google Drive so that the DisProt TSV can be read (this assumes the file has already been downloaded to Drive). A future adaptation could request the TSV through the DisProt API instead.

In [2]:
# Colab-only: mount Google Drive under /content/drive so that files stored
# there can be read by path in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Amino Acids Map

In [3]:
# Vectorised amino acids: one-hot encoding over 21 symbols.
# The 20 residue letters occupy positions 0-19 in the order written below;
# the buffer amino acid '*' takes the last position (20).
amino_map = {
    residue: tuple(int(column == position) for column in range(21))
    for position, residue in enumerate('ACDEFGHIKLMNPQRSTVWY*')
}

In [4]:
# Data from DisProt TSV - https://disprot.org/download
# Read as a tab-separated table from the mounted Drive; the cells below use
# its 'acc', 'start' and 'end' columns.
data_disprot = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DL-DISS/DisProt_v1.tsv', sep='\t')

In [5]:
# Dictionary for important data from DisProt:
# accession (as str) -> list of unique (start, end) regions, in file order.
disorder_start_and_end = {}

# Walk the three columns in lockstep by position. Looking values up with
# data_disprot['start'][i] is label-based and only matches the enumerate
# counter when the frame still has its default 0..n-1 index.
for acc, s, e in zip(data_disprot['acc'], data_disprot['start'], data_disprot['end']):
  # setdefault hands back the stored list, so new regions are appended in
  # place instead of copying the whole list for every row.
  arr = disorder_start_and_end.setdefault(str(acc), [])

  if (s, e) not in arr:
    arr.append((s, e))


In [6]:
# Create new table for important DisProt data.
# Column-oriented dict: 'acc' holds the accession keys and
# 'disordered_regions' the matching lists of (start, end) tuples.
data = {'acc': disorder_start_and_end.keys(), 'disordered_regions': disorder_start_and_end.values()}

In [7]:
# Preview the accession / disordered-regions table built from `data`.
pd.DataFrame.from_dict(data)

Unnamed: 0,acc,disordered_regions
0,P03265,"[(294, 334), (454, 464)]"
1,P49913,"[(134, 170)]"
2,P03045,"[(1, 107), (1, 22), (34, 47), (1, 36)]"
3,P00004,"[(1, 104), (2, 105)]"
4,P27695,"[(1, 42), (1, 36), (32, 43), (2, 40)]"
...,...,...
2414,A0A5P2U9X4,"[(350, 525), (460, 521), (417, 426), (450, 525)]"
2415,P40939,"[(637, 647)]"
2416,Q6CSX2,"[(562, 831)]"
2417,Q8IYT8,"[(168, 177)]"


In [8]:
# Keep the table as a DataFrame; it is passed to the sequence download and to
# the dataset class below.
pandas_data = pd.DataFrame.from_dict(data)

The proteins in this dataframe are preprocessed to get their full sequences from UniProt.

In [9]:
# Preprocessing - download all sequences.
def preprocess_sequences(pandas_data, timeout=30):
  """Download the UniProt FASTA sequence for every accession in the table.

  Args:
    pandas_data: DataFrame with an 'acc' column of UniProt accessions.
    timeout: Seconds to wait for each HTTP request before giving up.

  Returns:
    dict mapping each accession (as found in the 'acc' column) to its
    sequence as a single string of amino-acid letters.

  Raises:
    requests.HTTPError: if UniProt answers with an error status, so that an
      error page is never stored as if it were a sequence.
    requests.RequestException: on connection problems or timeouts.
  """
  protSeqDict = {}
  # One session reuses the HTTPS connection across the per-accession requests.
  with requests.Session() as session:
    # Iterate the column values directly: this does not depend on the frame
    # having a default 0..n-1 index (unlike .loc[row] with range(len(...))).
    for acc in pandas_data['acc']:
      if acc in protSeqDict:
        # Already downloaded (duplicate accession row).
        continue

      url = f'https://www.uniprot.org/uniprotkb/{str(acc)}.fasta'
      response = session.get(url, timeout=timeout)
      response.raise_for_status()

      # Drop the FASTA header line; the sequence itself is wrapped over the
      # remaining lines. splitlines() also copes with '\r\n' line endings.
      sequence_lines = response.text.splitlines()[1:]
      protSeqDict[acc] = ''.join(line.strip() for line in sequence_lines)
  return protSeqDict

# Accession -> sequence dictionary; this issues one HTTP request per row of
# pandas_data.
protein_sequences_n_ids = preprocess_sequences(pandas_data)

### Dataset class for our data
The class takes:
- A pandas table (usually built from the full TSV) with `acc` and `disordered_regions` columns.
- The amino acid vectorising map.
- A dictionary mapping protein accession numbers to their sequences (generated during preprocessing).

In [10]:
class DisProtDataset(Dataset):
    """Per-protein samples: one-hot encoded sequence plus a disorder label.

    Args:
        pandas_table: DataFrame with an 'acc' column and a
            'disordered_regions' column holding lists of (start, end) tuples.
        amino_map: dict mapping each amino-acid letter to its vector.
        protein_sequences: dict mapping accession -> sequence string.
        transform: optional callable applied to the sample dict before it is
            returned.
    """

    def __init__(self, pandas_table, amino_map, protein_sequences, transform=None):
        self.disorder_prot = pandas_table
        self.sequence_map = amino_map
        self.sequences = protein_sequences
        self.transform = transform
        # Alias kept for code that still reads the original misspelled name.
        self.tranform = transform

    def __len__(self):
        return len(self.disorder_prot)

    def __getitem__(self, idx):
        """Return {'acc', 'image', 'label'} for the row at position ``idx``.

        'image' is the encoded sequence transposed to
        (vector length, sequence length); 'label' has one entry per residue,
        1 inside a disordered region and 0 elsewhere.

        Raises:
            KeyError: if no sequence is stored for the row's accession.
            ValueError: if the sequence contains a letter missing from the
                amino-acid map.
        """
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with .loc labels while the table has a default index.
        acc = self.disorder_prot['acc'].iloc[idx]
        idrs = self.disorder_prot['disordered_regions'].iloc[idx]

        # Get sequence
        protein_sequence = self.sequences.get(acc)
        if protein_sequence is None:
            raise KeyError(f'No sequence available for accession {acc!r}')

        # Vectorise amino acids, failing on letters the map does not know
        # instead of silently placing None into the array.
        vectors = []
        for position, amino_acid in enumerate(protein_sequence, start=1):
            vector = self.sequence_map.get(amino_acid)
            if vector is None:
                raise ValueError(
                    f'Unknown amino acid {amino_acid!r} at position {position} '
                    f'of {acc!r}')
            vectors.append(vector)
        protein_sequence_image = np.array(vectors)

        # Create order/disorder label. Regions are used as 1-based inclusive
        # (start, end) pairs, hence the start-1 when slicing.
        disorder_label = np.zeros(len(protein_sequence))
        for (start, end) in idrs:
            disorder_label[start-1:end] = 1

        get_dict = {'acc': acc, 'image': protein_sequence_image.T, 'label': disorder_label}
        if self.transform is not None:
            get_dict = self.transform(get_dict)
        return get_dict

In [11]:
# Build the dataset instance from the region table, the one-hot map and the
# downloaded accession -> sequence dictionary.
formed_dataset_class = DisProtDataset(pandas_data, amino_map, protein_sequences_n_ids)

In [12]:
# Sanity check: fetch the first sample (a dict with 'acc', 'image' and 'label').
formed_dataset_class[0]

{'acc': 'P03265', 'image': array([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]), 'label': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

Create a dataloader from the dataset instance.

In [13]:
# One protein per batch, drawn in shuffled order, loaded in the main process
# (num_workers=0).
# NOTE(review): batch_size=1 is presumably required because sequences differ
# in length and so cannot be stacked into one tensor - confirm.
dataloader = DataLoader(formed_dataset_class, batch_size=1,
                        shuffle=True, num_workers=0)

# Check data loaded: print the first four batches (indices 0-3) only.
for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch)
    print(sample_batched)
    # observe 4th batch and stop.
    if i_batch == 3:
        break

0
{'acc': ['P45561'], 'image': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 1],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]]), 'label': tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
    

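### Creates a data iterator
- `iter(dataloader)` returns an iterator over the dataloader's batches, so it behaves much like looping over the dataloader directly.
- Each batch is yielded once; after it has been consumed, the iterator does not return it again.
- With shuffling enabled the order is random, but no sequence is repeated within one pass. Once the iterator is exhausted, a new one must be created.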

In [16]:
# Testing how to get samples from dataloader using dataiter
dataiter = iter(dataloader)
# Print the first three batches (indices 0-2), then stop.
for i, sam in enumerate(dataiter):
  print(sam)
  if i == 2:
    break

# The iterator continues where the loop stopped, so this is the next batch.
# Unpacking .values() relies on the key order 'acc', 'image', 'label' set in
# DisProtDataset.__getitem__.
acc, image, label = next(dataiter).values()

{'acc': ['Q96SD1'], 'image': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]]), 'label': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
      

### Working with a PyTorch NN. FCN model

FCN model - presented on 24 November (see meeting notes).

In [17]:
# Fully convolutional network, registered layer by layer under fixed names.
# The first three convolutions use 21x21 kernels with padding 10 on both axes,
# which keeps the input's height and width unchanged.
model = nn.Sequential()
model.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=3, kernel_size=21, padding=(10, 10)))
model.add_module('relu1', nn.ReLU())
model.add_module('conv2', nn.Conv2d(in_channels=3, out_channels=10, kernel_size=21, padding=(10, 10)))
model.add_module('relu2', nn.ReLU())
model.add_module('conv3', nn.Conv2d(in_channels=10, out_channels=1, kernel_size=21, padding=(10, 10)))
model.add_module('sig1', nn.Sigmoid())
# No padding on the height axis here: a 21-row input collapses to a single
# row, giving a [1,1,LEN] output that can be compared with the label.
model.add_module('conv4', nn.Conv2d(in_channels=1, out_channels=1, kernel_size=21, padding=(0, 10)))
model.add_module('sig2', nn.Sigmoid())

In [18]:
# Fresh iterator over the dataloader for the experiment below.
dataiter = iter(dataloader)

Experimenting with the FCN model and data from the data iterator.

In [19]:
# Where i is a counter (for early stoppage) and sam is a dictionary
for i, sam in enumerate(dataiter):
  # Unpacking relies on the key order 'acc', 'image', 'label' of the sample dict.
  acc, image, label = sam.values() #sam['acc'], sam['image'], sam['label']

  # Cast both tensors to torch.FloatTensor before they reach the model / comparison.
  NN_input = image.type(torch.FloatTensor)
  expected_output = label.type(torch.FloatTensor)

  print("Model start")
  output = model(NN_input)
  # squeeze() drops every size-1 dimension so prediction and label can be
  # compared shape-for-shape (both print as 1-D in the output below).
  squeezed_o = torch.squeeze(output)
  squeezed_e_o = torch.squeeze(expected_output)

  print(squeezed_o.shape)
  print(squeezed_e_o.shape)
  print("Model end")

  # Stop after four samples (indices 0-3).
  if i == 3:
    break

Model start
torch.Size([125])
torch.Size([125])
Model end
Model start
torch.Size([238])
torch.Size([238])
Model end
Model start
torch.Size([1733])
torch.Size([1733])
Model end
Model start
torch.Size([571])
torch.Size([571])
Model end


In [20]:
# Reload the dataiter: the experiment above consumed some batches, so create a
# new iterator to start again from a full pass over the dataloader.
dataiter = iter(dataloader)

### Training model, given DisProt dataset

In [48]:
import datetime
# Print the running loss every `epoch_print_gap` epochs (and always on epoch 1).
epoch_print_gap = 1

def training_loop(n_epochs, optimizer, model, loss_fn, train_loader, max_batches=21):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Args:
        n_epochs: Number of epochs to run.
        optimizer: Optimizer already bound to ``model``'s parameters.
        model: Network called on each batch's 'image' tensor.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar loss.
        train_loader: Iterable yielding dicts with 'image' and 'label' tensors.
            It is iterated afresh every epoch.
        max_batches: Positive cap on the batches processed per epoch, or None
            to use every batch. The default of 21 reproduces the original
            smoke-test limit and does NOT fully train the model.

    Returns:
        None. Progress is reported through print().
    """
    #model = model.to(device)
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0

        # Where i is a counter and sam is a dictionary
        for i, sam in enumerate(train_loader):
            # Read fields by key so the loop does not depend on the order in
            # which the sample dict happens to store them.
            NN_input = sam['image'].type(torch.FloatTensor)
            expected_output = sam['label'].type(torch.FloatTensor)

            output = model(NN_input)
            # Drop size-1 dimensions so prediction and target shapes line up.
            squeezed_o = torch.squeeze(output)
            squeezed_e_o = torch.squeeze(expected_output)

            loss = loss_fn(squeezed_o, squeezed_e_o)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train += loss.item()

            # Optional early stop, checked after the batch has been processed.
            if max_batches is not None and i + 1 >= max_batches:
                break

        if epoch == 1 or epoch % epoch_print_gap == 0:
            print("Epoch", epoch)
            # loss_train is the sum of the per-batch losses for this epoch.
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch, float(loss_train)))
            

#main
# Hyper-parameters for a short trial run of the model defined above.
lamb=0.0000    # L2 weight decay term (0 disables the penalty)
lr = 0.01
epochs = 3 # Can upgrade this to 1000s after confident with model
# Binary cross-entropy on probabilities; the model ends in a Sigmoid layer.
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=lamb) # Adds L2 regularisation with factor lamb to every parameter
training_loop(epochs, optimizer, model, criterion, dataloader)

# The dataloader itself (not a dataiter) is passed to the training loop: a single iterator would be exhausted after the first epoch, whereas the loop starts a new pass over the dataloader each epoch, which also allows reshuffling.


Epoch 1
2022-12-12 18:50:45.234124 Epoch 1, Training loss 13.483666241168976
Epoch 2
2022-12-12 18:51:22.150148 Epoch 2, Training loss 13.210659682750702
Epoch 3
2022-12-12 18:51:40.630553 Epoch 3, Training loss 10.536206632852554
