In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd # For csv
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import collections
import requests
import pickle

## Protein Disorder Prediction

### Data set-up

Mount Google Drive so that the DisProt TSV can be read (this assumes the file has already been downloaded to Drive). A future adaptation could request the TSV through the DisProt API instead.

In [2]:
# Colab-only step: mount Google Drive at '/content/drive' so the files read
# from that path later in the notebook are available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Amino Acids Channeled Image

In [3]:
def make_empty_image(seq):
  """Build one all-zero channel per amino-acid letter.

  Returns a dict keyed by the 20 one-letter codes A, C, D, ..., Y (in that
  order). Each value is its own NumPy array of zeros with one entry per
  position of `seq`.
  """
  sequence_length = len(seq)
  return {letter: np.zeros(sequence_length) for letter in 'ACDEFGHIKLMNPQRSTVWY'}

def make_image(seq):
  """One-hot encode `seq` into the channel dictionary from make_empty_image.

  For every position, the channel of the letter found there is set to 1 at
  that position; all other entries stay 0. A letter without a channel makes
  the lookup return None, so the assignment raises TypeError.
  """
  # Start from the all-zero channels
  channeled_img = make_empty_image(seq)
  for position, residue in enumerate(seq):
    # The arrays are modified in place, so the dictionary sees the change
    channel = channeled_img.get(residue)
    channel[position] = 1

  return channeled_img

In [4]:
# Data from DisProt TSV - https://disprot.org/download
# read_table uses a tab separator by default, which is what the TSV needs.
data_disprot = pd.read_table('/content/drive/My Drive/Colab Notebooks/DL-DISS/DisProt_v1.tsv')

In [5]:
# Dictionary for important data from DisProt: maps each accession (as a
# string) to the list of its distinct (start, end) regions, in the order
# they first appear in the table.
disorder_start_and_end = {}

# Walk the three columns in lockstep. This is positional, so it does not
# depend on the DataFrame's index labels, and it avoids one label lookup per
# column per row.
for acc, s, e in zip(data_disprot['acc'], data_disprot['start'], data_disprot['end']):
  arr = disorder_start_and_end.setdefault(str(acc), [])

  # The same region can be listed on several rows; keep a single copy
  if (s, e) not in arr:
    arr.append((s, e))


In [6]:
# Columns of the new table holding the important DisProt data:
# one row per accession, with its list of (start, end) regions.
data = {
    'acc': list(disorder_start_and_end.keys()),
    'disordered_regions': list(disorder_start_and_end.values()),
}

In [7]:
# Preview the table built from `data` (displayed only, not stored)
pd.DataFrame(data)

Unnamed: 0,acc,disordered_regions
0,P03265,"[(294, 334), (454, 464)]"
1,P49913,"[(134, 170)]"
2,P03045,"[(1, 107), (1, 22), (34, 47), (1, 36)]"
3,P00004,"[(1, 104), (2, 105)]"
4,P27695,"[(1, 42), (1, 36), (32, 43), (2, 40)]"
...,...,...
2414,A0A5P2U9X4,"[(350, 525), (460, 521), (417, 426), (450, 525)]"
2415,P40939,"[(637, 647)]"
2416,Q6CSX2,"[(562, 831)]"
2417,Q8IYT8,"[(168, 177)]"


In [8]:
# Keep the table built from `data` for the following steps
pandas_data = pd.DataFrame(data)

The proteins in this dataframe are preprocessed to get their full sequences from UniProt.

In [None]:
# Preprocessing - download all sequences.
def preprocess_sequences(pandas_data, timeout=30):
  """Download the UniProt FASTA sequence for every accession in `pandas_data`.

  Args:
    pandas_data: table with an 'acc' column of accession identifiers.
    timeout: seconds to wait for each HTTP request before giving up, so a
      stalled connection cannot hang the whole download.

  Returns:
    dict mapping accession -> amino-acid sequence string. Accessions whose
    request fails with an HTTP error status, whose response is not FASTA
    (no '>' header line), or whose sequence is empty are left out.

  Raises:
    requests.RequestException: on connection errors or timeouts.
  """
  protSeqDict = {}
  # A single session re-uses the connection across the many requests
  with requests.Session() as session:
    for acc in pandas_data['acc']:
      url = f'https://www.uniprot.org/uniprotkb/{str(acc)}.fasta'
      response = session.get(url, timeout=timeout)
      # An error page must not be mistaken for a sequence
      if not response.ok:
        continue

      # splitlines copes with both '\n' and '\r\n' line endings
      fasta_lines = response.text.splitlines()
      if not fasta_lines or not fasta_lines[0].startswith('>'):
        continue

      # Everything after the header line is the sequence, wrapped over lines
      protein_sequence = ''.join(line.strip() for line in fasta_lines[1:])

      if protein_sequence == '':
        continue

      protSeqDict[acc] = protein_sequence
  return protSeqDict

# Run the download for every accession in the table (one HTTP request per
# row); the result maps accession -> sequence.
protein_sequences_n_ids = preprocess_sequences(pandas_data)

In [None]:
# Cache the downloaded sequences on Drive. The file is a binary pickle
# despite its .txt name.
with open('/content/drive/My Drive/Colab Notebooks/DL-DISS/uniSeqData.txt', mode='wb') as cache_file:
    pickle.dump(protein_sequences_n_ids, cache_file)

Quick access to preprocessed data, instead of downloading it each time Notebook is opened.

In [9]:
# Restore the cached sequences.
# NOTE: pickle.load can execute arbitrary code, so only load a cache file
# that this notebook wrote itself.
with open('/content/drive/My Drive/Colab Notebooks/DL-DISS/uniSeqData.txt', mode='rb') as cache_file:
    protein_sequences_n_ids = pickle.load(cache_file)

Removing protein data that is incompatible with my solution.

In [11]:
# Keep only the proteins for which a sequence is available.
# One vectorised membership test replaces dropping rows one at a time, which
# rescanned the whole table for every missing accession. The surviving rows
# keep their original index labels.
x = protein_sequences_n_ids.keys()
clean_pandas_data = pandas_data[pandas_data['acc'].isin(list(x))]

In [12]:
# The channel encoding only has the 20 letters below, so keep a protein only
# when every residue of its sequence is one of them. This rejects X, U and Z
# as before, and also any other letter that has no channel. Proteins with no
# (or an empty) stored sequence are rejected as well.
supported_residues = set('ACDEFGHIKLMNPQRSTVWY')
keep_row = [
    bool(protein_sequences_n_ids.get(acc))
    and set(protein_sequences_n_ids[acc]) <= supported_residues
    for acc in clean_pandas_data['acc']
]
# A boolean array with .loc selects rows and keeps the columns, even when
# the table is empty.
clean_pandas_data = clean_pandas_data.loc[np.array(keep_row, dtype=bool)]

print(len(clean_pandas_data))

2409


In [13]:
# Rebuild the table from plain lists so the rows are renumbered from 0
# after the filtering above.
clean_data = {
    column: clean_pandas_data[column].tolist()
    for column in ('acc', 'disordered_regions')
}
fully_clean_pandas_data = pd.DataFrame(clean_data)

### Dataset class for our data. 
- Takes in pandas data (usually full TSV).
- The amino acid vectorising map.
- A dictionary mapping protein accession numbers to their sequence (generated from preprocessing).

In [15]:
class DisProtDataset(Dataset):
    """Dataset of one-hot encoded protein sequences with per-residue labels.

    Args:
        pandas_table: table with an 'acc' column (accession) and a
            'disordered_regions' column (list of (start, end) pairs).
        amino_map: callable turning a sequence string into a dict of
            per-letter channel arrays. If it is not callable, make_image is
            used instead.
        protein_sequences: dict mapping accession -> sequence string.
        transform: optional callable applied to each sample dict before it
            is returned.
    """

    def __init__(self, pandas_table, amino_map, protein_sequences, transform=None):
        self.disorder_prot = pandas_table
        # Use the encoder that was passed in; fall back to make_image so
        # callers that passed a placeholder keep working.
        self.sequence_map = amino_map if callable(amino_map) else make_image
        self.sequences = protein_sequences
        self.transform = transform
        # Misspelt name kept as an alias for any code that still reads it
        self.tranform = transform

    def __len__(self):
        return len(self.disorder_prot)

    def __getitem__(self, idx):
        """Return {'acc', 'image', 'label'} for the row at position `idx`.

        Raises:
            KeyError: if no sequence is stored for the row's accession.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels for a default RangeIndex.
        # Protein accession number - key identifier
        acc = self.disorder_prot['acc'].iloc[idx]
        idrs = self.disorder_prot['disordered_regions'].iloc[idx]

        # Get sequence
        protein_sequence = self.sequences.get(acc)
        if protein_sequence is None:
            raise KeyError(f'No sequence available for accession {acc!r}')
        # Vectorise amino acids
        protein_sequence_image = self.sequence_map(protein_sequence)
        # Converts channel dictionary to 2D array (channels x positions)
        protein_sequence_image = np.array(list(protein_sequence_image.values()))

        # Create order/disorder label: 1 inside a region, 0 elsewhere.
        # Regions are 1-based and inclusive, hence start-1:end.
        disorder_label = np.zeros(len(protein_sequence))
        for (start, end) in idrs:
            disorder_label[start-1:end] = 1

        get_dict = {'acc': acc, 'image': protein_sequence_image, 'label': disorder_label}
        if self.transform is not None:
            get_dict = self.transform(get_dict)
        return get_dict

In [16]:
# Create the DisProtDataset instance from the cleaned table, the make_image
# encoder and the accession -> sequence dictionary
formed_dataset_class = DisProtDataset(fully_clean_pandas_data, make_image, protein_sequences_n_ids)

In [17]:
# Can test output of class: fetch the first sample, a dict with the keys
# 'acc', 'image' and 'label'
formed_dataset_class[0]

{'acc': 'P03265', 'image': array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), 'label': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0

In [18]:
# Shape of the first sample's encoded sequence
formed_dataset_class[0]['image'].shape

(20, 529)

Create a dataloader from the dataset instance.

In [19]:
# One sample per batch, drawn in shuffled order, loaded in the main process
dataloader = DataLoader(
    formed_dataset_class,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)

# Check data loaded: print the first four batches only.
# range(4) runs out first, so no fifth batch is fetched.
for i_batch, sample_batched in zip(range(4), dataloader):
    print(i_batch)
    print(sample_batched)

0
{'acc': ['P40517'], 'image': tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], dtype=torch.float64), 'label': tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1.,

### Creates a data iterator.
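- This is similar to the dataloader: `iter(dataloader)` returns an iterator over its batches.
- Each call to `next` consumes one item, so that item is not returned again during the same pass.
- Items can be shuffled randomly, but because each one is consumed once used, sequences are not accidentally repeated within a pass.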

In [20]:
# Testing how to get samples from dataloader using dataiter
dataiter = iter(dataloader)
# Print the first three samples; range(3) runs out first, so the fourth
# sample stays in the iterator.
for i, sam in zip(range(3), dataiter):
  print(sam)

# The next sample is unpacked into its three fields
acc, image, label = next(dataiter).values()

{'acc': ['A0A3S5Y0Q5'], 'image': tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], dtype=torch.float64), 'label': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0

### Working with a PyTorch NN. FCN model

In [28]:
# Uses 1D Convolution kernels
# Three convolutions over the sequence axis (20 -> 10 -> 10 -> 1 channels).
# A kernel of 21 with padding 10 keeps the sequence length unchanged, and the
# final sigmoid gives one value in (0, 1) per position.

model = nn.Sequential()
model.add_module('conv1', nn.Conv1d(in_channels=20, out_channels=10, kernel_size=21, padding=10))
model.add_module('relu1', nn.ReLU())
model.add_module('conv2', nn.Conv1d(in_channels=10, out_channels=10, kernel_size=21, padding=10))
model.add_module('relu2', nn.ReLU())
model.add_module('conv3', nn.Conv1d(in_channels=10, out_channels=1, kernel_size=21, padding=10))
model.add_module('sig1', nn.Sigmoid())

In [29]:
# Reload the dataiter - the previous iterator has already had samples taken
# from it, so a fresh one is created to ensure all rows are in the iterator
# when it is looped through
dataiter = iter(dataloader)

### Training model, given DisProt dataset

In [30]:
import datetime
epoch_print_gap = 1

def training_loop(n_epochs, optimizer, model, loss_fn, train_loader,
                  accumulation_steps=16, log_every=500):
    """Train `model` with gradient accumulation over single-sample batches.

    Args:
        n_epochs: number of passes over `train_loader`.
        optimizer: optimiser holding the model's parameters.
        model: network called on the 'image' tensor of each sample.
        loss_fn: loss called as loss_fn(prediction, target).
        train_loader: iterable of dicts with 'image' and 'label' tensors.
        accumulation_steps: number of samples whose gradients are summed
            before each optimiser step (the effective batch size).
        log_every: print the mean loss after this many samples.

    Gradients are summed (not averaged) over each group of samples. A
    partial group left at the end of an epoch is applied before the next
    epoch starts, so no gradient is discarded or carried over.
    """
    model.train()
    for epoch in range(1, n_epochs + 1):

        running_loss_train = 0.0
        # Start every epoch from clean gradients
        optimizer.zero_grad()
        pending_samples = 0

        # Where i is a counter and sam is a dictionary
        for i, sam in enumerate(train_loader):
            # Read fields by key rather than relying on dictionary order
            NN_input = sam['image'].float()
            expected_output = sam['label'].float()

            output = model(NN_input)
            # Drop the singleton dimensions so prediction and target line up
            squeezed_o = torch.squeeze(output)
            squeezed_e_o = torch.squeeze(expected_output)

            loss = loss_fn(squeezed_o, squeezed_e_o)
            # backward() adds to the gradients already stored
            loss.backward()
            running_loss_train += loss.item()
            pending_samples += 1

            # This has effect of batches of size accumulation_steps.
            if pending_samples == accumulation_steps:
                optimizer.step()
                # Zero the parameter gradients
                # Note - this zeros anything accumulated by loss.backward()
                optimizer.zero_grad()
                pending_samples = 0

            # Print loss throughout epoch
            if (i+1) % log_every == 0:
                print("Epoch: "+str(epoch), end=" ")
                print("Current loss: "+str(running_loss_train / log_every))
                running_loss_train = 0.0

        # Apply the gradients of the last, incomplete group of samples
        if pending_samples:
            optimizer.step()
            optimizer.zero_grad()

        if epoch == 1 or epoch % epoch_print_gap == 0:
            print("Epoch", epoch, "Done \n\n")


# Main
# Hyperparameters for the run, followed by the training call itself.
lamb=0.001    # L2 weight decay term
lr = 0.001    # learning rate passed to SGD
epochs = 3    # number of passes over the dataloader
# Binary cross-entropy between predicted values and the 0/1 labels
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=lamb) # This is adding L2 regularisation with factor lamb to every parameter
training_loop(epochs, optimizer, model, criterion, dataloader)


Epoch: 1 Current loss: 0.6383503780364991
Epoch: 1 Current loss: 0.5868662053346634
Epoch: 1 Current loss: 0.5633245394527913
Epoch: 1 Current loss: 0.5637900549471379
Epoch 1 Done 


Epoch: 2 Current loss: 0.531167193904519
Epoch: 2 Current loss: 0.516006025493145
Epoch: 2 Current loss: 0.5040229924917221
Epoch: 2 Current loss: 0.5189556149691343
Epoch 2 Done 


Epoch: 3 Current loss: 0.5107801219820977
Epoch: 3 Current loss: 0.5010962297767401
Epoch: 3 Current loss: 0.5087641105353832
Epoch: 3 Current loss: 0.510286563128233
Epoch 3 Done 


