# a deep-learning classifier for subcellular localization of proteins

This example is inspired by: https://github.com/vanessajurtz/lasagne4bio/blob/master/subcellular_localization/notebook%20tutorial/FFN.ipynb

Based on the dataset from : https://academic.oup.com/bioinformatics/article/33/21/3387/3931857



## meet the data

In [None]:
## on google colab, you will have to run the following line:
#!pip install pytorch-model-summary

In [1]:
from collections import defaultdict , Counter

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
import pytorch_model_summary as pms 

from torch.utils.data import TensorDataset, DataLoader

# Choose the compute device used for training: CUDA first, then Apple MPS, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")


Using cpu device


In [None]:
## on google colab you will have to download 
# !wget https://github.com/sib-swiss/pytorch-practical-training/raw/refs/heads/master/data/subcellular_localization/reduced_train.npz
## and
# !wget https://github.com/sib-swiss/pytorch-practical-training/raw/refs/heads/master/data/subcellular_localization/reduced_val.npz

## and adapt the following cell to open "reduced_train.npz" instead of 'data/subcellular_localization/reduced_train.npz'
## as well as "reduced_val.npz"

In [4]:
# Load the encoded protein sequences, and labels
# The archive is opened in a `with` block so the underlying file handle is closed
# as soon as the arrays have been read. It is bound to `train_npz` rather than
# `train`, because `train` is also the name of the training function defined
# further down in this notebook.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code from a tampered file. The validation archive is loaded
# without it, so it may be unnecessary here -- TODO confirm and drop if so.
with np.load('data/subcellular_localization/reduced_train.npz' , allow_pickle=True) as train_npz:
    X_train = train_npz['X_train']
    y_train = train_npz['y_train']

print(X_train.shape)

(2423, 400, 20)


In [None]:
# Load the validation sequences and labels; the `with` block closes the archive's
# file handle once both arrays have been read into memory.
with np.load('data/subcellular_localization/reduced_val.npz') as validation:
    X_valid = validation['X_val']
    y_valid = validation['y_val']

print(X_valid.shape)

In [None]:
# Names of the target categories; the position in the list is the integer label.
classes = [
    'Nucleus',
    'Cytoplasm',
    'Extracellular',
    'Mitochondrion',
    'Cell membrane',
    'ER',
    'Chloroplast',
    'Golgi apparatus',
    'Lysosome',
    'Vacuole',
]

# integer label -> category name
dico_classes_subcell = dict(enumerate(classes))

for label, name in dico_classes_subcell.items():
    print('Target', label, name)

Let's look at the target categories

In [None]:
# One category name per sample (training samples first, then validation samples),
# with a parallel list saying which split each sample comes from.
train_names = [dico_classes_subcell[label] for label in y_train]
valid_names = [dico_classes_subcell[label] for label in y_valid]
split_names = ['train'] * len(y_train) + ['val'] * len(y_valid)

sns.countplot(y=train_names + valid_names, hue=split_names, order=classes)

Each sequence is encoded as a matrix where each position is a row of size 20, for each possible amino-acid.

The values within the matrix represent the amino acid frequency at the given position.

Naturally, proteins have different lengths, but our neural network requires a fixed input size; thus, every position after the end of a sequence is padded with zeroes.

In [None]:
X_train[0,]

In [None]:
X_train[0,3]

In [None]:
sns.heatmap( X_train[0,] )

In [None]:
X_train[0,0:16]

## Thinking about the problem, the loss, and the constraints it poses

In the previous model we only had 2 classes in our target, thus our model output could merely be the probability  of belonging to class 1.
Furthermore, our loss function was `nn.BCELoss`, which stands for **Binary** Cross Entropy.

However now our output target has 10 classes.

Thus our output will have to be different, and our loss function will have to be as well.


**exercise**: With the help of the [CEloss doc](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss), try to work out what the model output should be? how the target should be encoded? what parameters of the loss function may be of interest to us?



In [None]:
# help : https://pytorch.org/docs/stable/tensors.html
## this is how one can create a tensor of floats:

example = torch.DoubleTensor([1,2,3])
example

In [None]:
CEloss = nn.CrossEntropyLoss()

pred = ...

target = ...

CEloss(pred , target)

In [None]:
target

---

correction:

making the loss work:

In [None]:
# %load -r 1-12 solutions/classifier_CEloss.py

additional considerations:

In [None]:
# %load -r 13-27 solutions/classifier_CEloss.py

In [None]:
# %load -r 28- solutions/classifier_CEloss.py

## build the data loaders

In [None]:
batch_size = 128

In [None]:
# transform to torch tensor (features as floats, labels as integer class indices)
X_train_tensor = torch.Tensor(X_train)
y_train_tensor = torch.LongTensor(y_train)

# create your dataset
train_dataset = TensorDataset(X_train_tensor,y_train_tensor)

## creating a dataloader
# shuffle=True reshuffles the training samples at every epoch, so the batches
# (and therefore the gradient steps) are not identical from one epoch to the next.
train_dataloader = DataLoader(train_dataset , batch_size = batch_size , shuffle = True )

In [None]:
# Validation features as a float tensor, validation labels as integer class indices
X_valid_tensor = torch.Tensor(X_valid)
y_valid_tensor = torch.LongTensor(y_valid)

# Pair every sequence with its label
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

# Batches are served in dataset order (no shuffling)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

## building a model

Architecture:
 - flatten
 - hidden layer
     - linear 
     - [dropout](https://ml-cheatsheet.readthedocs.io/en/latest/regularization.html?highlight=dropout#dropout)
     - ReLu activation
 - output layer 
     - linear 
     
sizes:
 - input size = 8000
 - hidden size = [80]
 - output size = number of classes = 10


In [None]:

class ProteinLoc_neuralNet(torch.nn.Module):
    """Feed-forward classifier: flatten, then Linear/ReLU/Dropout blocks, then a Linear output.

    Args:
        input_dim: size of the flattened input (number of features per sample).
        hidden_dim: iterable with the width of each hidden layer, in order.
            An empty iterable gives a single linear layer from input to output.
        output_dim: number of output units (one logit per class).
        dropout_fraction: dropout probability applied after every hidden activation.
    """

    def __init__(self , input_dim = 8000 ,
                         hidden_dim = (80,) ,
                         output_dim = 10 ,
                         dropout_fraction = 0.25):
        super().__init__()

        ## we transform the input from 2D to 1D
        self.flatten = nn.Flatten()

        elements = []
        # each hidden layer is made of a linear layer, a ReLU activation and a Dropout layer
        for width in hidden_dim:
            elements.append( nn.Linear(input_dim, width) )
            elements.append( nn.ReLU() )
            elements.append( nn.Dropout(dropout_fraction) )

            input_dim = width ## the next layer takes this layer's output as input

        # output layer: no activation, the network returns raw scores
        elements.append( nn.Linear(input_dim, output_dim) )

        self.layers = nn.Sequential( *elements )

    def forward(self, x):
        """Return the logits (un-normalised class scores) for a batch `x`."""
        x = self.flatten(x)
        ## NB: here, the output of the last layer are logits
        logits = self.layers(x)
        return logits


model = ProteinLoc_neuralNet( hidden_dim=[60,40,20]).to(device)
print(model)

In [None]:
print(pms.summary(model, torch.zeros(1,400,20).to(device), show_input=True))

In [None]:
400*20

As we did before, playing a bit with the model cannot hurt : 

In [None]:

model.eval()
x, y = valid_dataset[:5] ## let's go with a batch of 5 samples

with torch.no_grad(): ## disables tracking of gradient: prevent accidental training + speeds up computation
    x = x.to(device)
    y = y.to(device)
    pred = model(x)
    ## show the first sample only: its raw scores (logits, not probabilities) next to its own target
    predicted, actual = pred[0], y[0]
    print(f'Predicted logits: "{predicted}", Actual: "{actual}"')

In [None]:
pred

In [None]:
## from "logits" to probabilities
nn.Softmax(dim=1)( pred )

In [None]:
## getting the predicted category as the one with the highest score
np.argmax(pred.cpu().detach(), axis = 1)

In [None]:
pred

In [None]:
CEloss = nn.CrossEntropyLoss()
CEloss(pred,y)

Now we can be reassured that our model does take the data as input, and outputs something we can compute a score on.

### defining training/validation functions

Our cross entropy loss is all good and well, but it would be nice to be able to compute additional metrics while we train.

Let's adapt our training function to reflect this

 * [accuracy](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score)
 * [balanced_accuracy](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html#sklearn.metrics.balanced_accuracy_score)
 * [F1_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score)

In [None]:
from sklearn import metrics
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight='balanced' , 
                                     classes = np.array(list( range(10) )), 
                                     y= y_train
                                    )


def get_additional_scores( predicted , target ):
    """Compute classification metrics for predicted vs. true integer labels.

    Each sample is weighted by the entry of the module-level `class_weights`
    array that corresponds to its true label; these weights are passed to the
    balanced accuracy only.

    Returns a dict with the keys 'balanced_accuracy', 'accuracy' and 'f1'
    (macro-averaged F1).
    """
    per_sample_weight = class_weights[target]

    scores = {}
    scores['balanced_accuracy'] = metrics.balanced_accuracy_score(
        target, predicted, sample_weight=per_sample_weight
    )
    scores['accuracy'] = metrics.accuracy_score(target, predicted)
    scores['f1'] = metrics.f1_score(target, predicted, average='macro')
    return scores



get_additional_scores( np.argmax(pred.detach().cpu().numpy(),axis=1) , y.cpu().numpy() )

In [None]:
def train(dataloader, model, loss_fn, optimizer , additional_score_function , echo = True , echo_batch = False):
    """Train `model` for one pass over `dataloader` and return the epoch scores.

    Every batch is moved to the module-level `device`, run through the model,
    and used for one optimizer step. The predicted class of a sample is the
    argmax of the model output along axis 1.

    Args:
        dataloader: iterable of (X, y) batches; `dataloader.dataset` must support len().
        model: the module to train (switched to training mode here).
        loss_fn: callable `loss_fn(pred, y)` returning a scalar loss tensor.
        optimizer: optimizer bound to the model parameters.
        additional_score_function: callable `(all_predictions, all_targets) -> dict`
            evaluated once on the predictions/targets of the whole epoch.
        echo: print the loss of the last batch at the end of the epoch.
        echo_batch: print the loss and the number of samples seen after each batch.

    Returns:
        The dict returned by `additional_score_function`, with an extra 'loss'
        entry holding the loss of the LAST batch only (not an epoch average).
    """
    size = len(dataloader.dataset) # total number of samples (not batches)
    model.train() #     Sets the module in training mode.

    ## we will keep prediction and target on the whole dataset
    all_predictions = []
    all_targets = []

    seen = 0 # number of samples processed so far, for progress reporting

    for X, y in dataloader: # for each batch
        X, y = X.to(device), y.to(device) # send the data to the GPU or whatever device you use for training

        # Compute prediction error
        pred = model(X)              # prediction for the model -> forward pass
        loss = loss_fn(pred, y)      # loss function from these prediction

        ## accumulate prediction and target on the whole dataset
        all_predictions.extend( np.argmax(pred.detach().cpu().numpy() , axis=1) )
        all_targets.extend( y.cpu().numpy() )

        # Backpropagation
        loss.backward()              # backward propagation
        #                            https://ml-cheatsheet.readthedocs.io/en/latest/backpropagation.html
        #                            https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

        optimizer.step()
        optimizer.zero_grad()        # reset the gradients
                                     # https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch

        # count actual samples: the last batch may be smaller than the others,
        # so (batch index + 1) * len(X) would under-report the progress
        seen += len(X)
        if echo_batch:
            print(f"Train loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

    if echo:
        print(f"Train loss: {loss.item():>7f}")

    # return the last batch loss, as well as the metrics computed on all batches
    scores = additional_score_function( all_predictions , all_targets )
    scores['loss'] = loss.item()
    return scores



In [None]:
def valid(dataloader, model, loss_fn, additional_score_function, echo = True):
    """Evaluate `model` on `dataloader` without updating it, and return the scores.

    Every batch is moved to the module-level `device`. The predicted class of a
    sample is the argmax of the model output along axis 1.

    Args:
        dataloader: iterable of (X, y) batches that supports len() (number of batches).
        model: the module to evaluate (switched to evaluation mode here).
        loss_fn: callable `loss_fn(pred, y)` returning a scalar loss tensor.
        additional_score_function: callable `(all_predictions, all_targets) -> dict`
            evaluated once on the predictions/targets of the whole dataset.
        echo: print the average validation loss.

    Returns:
        The dict returned by `additional_score_function`, with an extra 'loss'
        entry holding the per-batch losses averaged over the number of batches.
    """
    num_batches = len(dataloader)
    model.eval() #     Sets the module in evaluation mode

    ## we will keep prediction and target on the whole dataset
    all_predictions = []
    all_targets = []

    valid_loss = 0
    with torch.no_grad(): ## disables tracking of gradient: prevent accidental training + speeds up computation
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            valid_loss += loss_fn(pred, y).item()  ## accumulating the loss function over the batches

            ## accumulate prediction and target on the whole dataset
            all_predictions.extend( np.argmax(pred.cpu().numpy() , axis=1) )
            all_targets.extend( y.cpu().numpy() )

    valid_loss /= num_batches

    if echo:
        print(f"\tValid loss: {valid_loss:>8f}")
    ## return the average loss / batch
    scores = additional_score_function( all_predictions , all_targets )
    scores['loss'] = valid_loss
    return scores


## training the model

Our optimizer will be [ADAM](https://ml-cheatsheet.readthedocs.io/en/latest/optimizers.html#adam)

In [None]:
## preamble: the model, the class-weighted loss function, and the optimizer
model = ProteinLoc_neuralNet(input_dim=8000, hidden_dim=[80], output_dim=10, dropout_fraction=0.1)
model = model.to(device)

# 'balanced' weights computed from the training labels, one weight per class 0..9
balanced_weights = compute_class_weight(class_weight='balanced', classes=np.arange(10), y=y_train)
W = torch.Tensor(balanced_weights).to(device)

CEloss = nn.CrossEntropyLoss(weight=W)
print('weights_classes',W.cpu().numpy())

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-2)

## per-metric history, one value appended per epoch
train_scores = defaultdict(list)
valid_scores = defaultdict(list)


In [None]:
%%time
## one training pass followed by one validation pass, to learn how long an epoch takes
tmp_scores = train(train_dataloader, model, CEloss, optimizer, get_additional_scores,
                   echo=True, echo_batch=True)
for name, value in tmp_scores.items():
    train_scores[name].append(value)

tmp_scores = valid(valid_dataloader, model, CEloss, get_additional_scores, echo=True)
for name, value in tmp_scores.items():
    valid_scores[name].append(value)


From there, we can deduce approximately how much time training for 50, 100, or 500 epoch will take.

Here: 192ms x 100 = 19.2s

In [None]:
%%time
epochs = 100

for t in range(1, epochs + 1):

    # only report every 10th epoch
    echo = (t % 10 == 0)
    if echo:
        print('Epoch',t )

    # one training pass; append every returned score to its history
    epoch_scores = train(train_dataloader, model, CEloss, optimizer, get_additional_scores,
                         echo=echo, echo_batch=False)
    for name, value in epoch_scores.items():
        train_scores[name].append(value)

    # one validation pass, recorded the same way
    epoch_scores = valid(valid_dataloader, model, CEloss, get_additional_scores, echo=echo)
    for name, value in epoch_scores.items():
        valid_scores[name].append(value)
print("Done!")

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# one panel per metric, filled row by row
metric_names = ['loss', 'balanced_accuracy', 'accuracy', 'f1']
for ax, k in zip(axes.flat, metric_names):
    ax.plot(train_scores[k], label='train')
    ax.plot(valid_scores[k], label='validation')
    ax.legend()
    ax.set_xlabel('epoch')
    ax.set_ylabel(k)

Should we stop here ? Go for another 100 epochs? 500 epochs?

We could use an [early stopping](https://ml-cheatsheet.readthedocs.io/en/latest/regularization.html?highlight=dropout#early-stopping) mechanism, which stops iteration when the validation loss has not decreased for `X` epochs (or when it has not improved by at least a given amount).

This can be particularly important in cases where your model starts to grossly overfit the training data.

For an implementation example, you can look up this [early stopping demo on MNIST](https://github.com/Bjarten/early-stopping-pytorch/blob/master/MNIST_Early_Stopping_example.ipynb), which we will use in the next notebooks

In [None]:
import pandas as pd
import seaborn as sns

# Inference only: make sure dropout is disabled and no gradient graph is built,
# whatever mode the model was left in by the previous cells.
model.eval()
with torch.no_grad():
    y_pred = model(X_valid_tensor.to(device))
y_pred = np.argmax(y_pred.cpu().numpy(), axis=1)

df = pd.crosstab( y_valid  , y_pred , rownames=['truth'] , colnames=['prediction'])

# crosstab only creates rows/columns for labels that actually occur; add the
# missing ones (e.g. a class that is never predicted) as zero counts so that the
# table always has one row and one column per class before it is relabelled.
all_labels = range(len(classes))
df = df.reindex(index=all_labels, columns=all_labels, fill_value=0)
df.columns = classes
df.index = classes

# simple heatmap
#sns.heatmap(df , annot = True , fmt='.0f', cmap = 'viridis')

# trick to make the 0s disappear: annotate with strings, empty for zero counts
sns.heatmap(df , annot = df.astype(str).replace('0','') , fmt ='s' , cmap = 'viridis')
plt.ylabel('True label')
plt.xlabel('Predicted label')


---

## playground

what follows is just us playing around with some code

### effect of drop-out fraction

In [None]:

def train_ProteinLoc_neuralNet( DO , epochs , model = None , CEloss=None, optimizer=None ):
    """Train a `ProteinLoc_neuralNet` for `epochs` epochs and record the scores.

    Uses the module-level `train_dataloader` / `valid_dataloader`, `device`,
    `y_train` and `get_additional_scores`.

    Args:
        DO: dropout fraction, used only when a new model is created.
        epochs: number of train + validation passes to run.
        model: model to keep training; when None a new single-hidden-layer
            (80 units) model is created on `device`.
        CEloss: loss function; when None a cross-entropy loss weighted by the
            'balanced' class weights of `y_train` is created.
        optimizer: optimizer bound to `model`; when None (or when a new model
            is created) an Adam optimizer with lr=1e-4 and weight_decay=1e-2
            is created for the model.

    Returns:
        (train_scores, valid_scores, model, CEloss, optimizer), where the two
        score containers map a metric name to a list with one value per epoch.
    """
    if model is None:
        model = ProteinLoc_neuralNet(input_dim = 8000 ,
                                     hidden_dim=[80] ,
                                     output_dim = 10 ,
                                     dropout_fraction = DO).to(device)
        # an optimizer handed in by the caller cannot be bound to the parameters
        # of this brand new model, so a fresh one is always built below
        optimizer = None

    # each missing piece is created on its own, so that passing only a model
    # (or only a model and a loss) works instead of failing on a None
    if CEloss is None:
        W = torch.Tensor( compute_class_weight(class_weight='balanced' ,
                             classes = np.arange(10) ,
                             y= y_train) ).to(device)
        CEloss = nn.CrossEntropyLoss(weight = W)

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(),
                               lr = 10**-4,
                               weight_decay = 10**-2)

    ## container to keep the scores across all epochs
    train_scores = defaultdict(list)
    valid_scores = defaultdict(list)

    for _ in range(epochs):

        tmp_scores = train(train_dataloader,
                           model,
                           CEloss,
                           optimizer,
                           get_additional_scores ,
                           echo= False , echo_batch=False )
        for k, v in tmp_scores.items():
            train_scores[k].append( v )

        tmp_scores = valid(valid_dataloader,
                           model,
                           CEloss ,
                           get_additional_scores ,
                           echo = False)
        for k, v in tmp_scores.items():
            valid_scores[k].append( v )

    return train_scores,valid_scores , model, CEloss,optimizer

In [None]:
%%time
## long-format table: one row per (drop-out fraction, epoch, train/valid) loss value
results_dict = {'loss' :[], 'epoch':[],'drop_out':[], 'train_valid':[]}

E=300
for DO in np.arange(0,0.5,0.1):
    # np.arange accumulates floating-point error (e.g. 0.30000000000000004);
    # round so the drop-out labels shown in the plots stay readable
    DO = round(float(DO), 2)

    # sweep-specific names: the notebook-level `train_scores` / `valid_scores`
    # of the first model are read again later and must not be overwritten here
    sweep_train_scores, sweep_valid_scores, _, _, _ = train_ProteinLoc_neuralNet( DO=DO , epochs=E)

    for split, scores in ( ('train', sweep_train_scores) , ('valid', sweep_valid_scores) ):
        results_dict['loss'].extend( scores['loss'] )
        results_dict['epoch'].extend( range(1,E+1) )
        results_dict['drop_out'].extend( [DO]*E )
        results_dict['train_valid'].extend( [split]*E )


In [None]:
import pandas as pd
import seaborn as sns

# Long-format table of the sweep; the drop-out fraction is turned into a string
# so that it is treated as a category when used as the line colour.
df = pd.DataFrame(results_dict)
df.drop_out = df.drop_out.astype(str)

sns.lineplot(
    data=df,
    x='epoch',
    y='loss',
    hue='drop_out',
    style='train_valid',
    palette='viridis',
)

In [None]:
sns.lineplot( df.loc[df.train_valid=='valid',:] , x='epoch' , y='loss' , hue = 'drop_out' , style = 'train_valid' , palette='viridis' )

### one more layer

In [None]:
## preamble: a deeper model (4 hidden layers), its class-weighted loss, and its optimizer
model2 = ProteinLoc_neuralNet(input_dim=8000, hidden_dim=[160, 80, 40, 20], output_dim=10,
                              dropout_fraction=0.1)
model2 = model2.to(device)

# 'balanced' weights computed from the training labels, one weight per class 0..9
balanced_weights = compute_class_weight(class_weight='balanced', classes=np.arange(10), y=y_train)
W = torch.Tensor(balanced_weights).to(device)

CEloss = nn.CrossEntropyLoss(weight=W)
print('weights_classes',W.cpu().numpy())

optimizer = torch.optim.Adam(model2.parameters(), lr=1e-5, weight_decay=1e-2)

## per-metric history for model2, one value appended per epoch
train_scores2 = defaultdict(list)
valid_scores2 = defaultdict(list)


In [None]:
%%time
epochs = 1000

for t in range(1, epochs + 1):

    # only report every 10th epoch
    echo = (t % 10 == 0)
    if echo:
        print('Epoch',t )

    # one training pass; append every returned score to its history
    epoch_scores = train(train_dataloader, model2, CEloss, optimizer, get_additional_scores,
                         echo=echo, echo_batch=False)
    for name, value in epoch_scores.items():
        train_scores2[name].append(value)

    # one validation pass, recorded the same way
    epoch_scores = valid(valid_dataloader, model2, CEloss, get_additional_scores, echo=echo)
    for name, value in epoch_scores.items():
        valid_scores2[name].append(value)
print("Done!")

# one panel per metric, filled row by row
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for ax, k in zip(axes.flat, ['loss', 'balanced_accuracy', 'accuracy', 'f1']):
    ax.plot(train_scores2[k], label='train')
    ax.plot(valid_scores2[k], label='validation')
    ax.legend()
    ax.set_xlabel('epoch')
    ax.set_ylabel(k)

In [None]:
valid_scores2['accuracy'][-1] , valid_scores['accuracy'][-1]

In [None]:

class ProteinLoc_neuralNet_ELU(torch.nn.Module):
    """Feed-forward classifier: flatten, then Linear/ELU/Dropout blocks, then a Linear output.

    Args:
        input_dim: size of the flattened input (number of features per sample).
        hidden_dim: iterable with the width of each hidden layer, in order.
            An empty iterable gives a single linear layer from input to output.
        output_dim: number of output units (one logit per class).
        dropout_fraction: dropout probability applied after every hidden activation.
    """

    def __init__(self , input_dim = 8000 ,
                         hidden_dim = (80,) ,
                         output_dim = 10 ,
                         dropout_fraction = 0.25):
        super().__init__()

        ## we transform the input from 2D to 1D
        self.flatten = nn.Flatten()

        elements = []
        # each hidden layer is made of a linear layer, an ELU activation (alpha = 1) and a Dropout layer
        for width in hidden_dim:
            elements.append( nn.Linear(input_dim, width) )
            elements.append( nn.ELU(1.0) )
            elements.append( nn.Dropout(dropout_fraction) )

            input_dim = width ## the next layer takes this layer's output as input

        # output layer: no activation, the network returns raw scores
        elements.append( nn.Linear(input_dim, output_dim) )

        self.layers = nn.Sequential( *elements )

    def forward(self, x):
        """Return the logits (un-normalised class scores) for a batch `x`."""
        x = self.flatten(x)
        ## NB: here, the output of the last layer are logits
        logits = self.layers(x)
        return logits




In [None]:
## preamble -> define the model, the loss function, and the optimizer
model3 = ProteinLoc_neuralNet_ELU(input_dim = 8000 ,
                             hidden_dim=[80] ,
                             output_dim = 10 ,
                             dropout_fraction = 0.1).to(device)


# The class weights must live on the same device as the model outputs and the
# targets, otherwise the weighted loss fails as soon as `device` is not the CPU.
W = torch.Tensor( compute_class_weight(class_weight='balanced' ,
                     classes = np.arange(10) ,
                     y= y_train) ).to(device)

CEloss = nn.CrossEntropyLoss(weight = W)
# bring the weights back to the CPU before converting them to numpy for display
print('weights_classes',W.cpu().numpy())


optimizer = torch.optim.Adam(model3.parameters(),
                       lr = 10**-4,
                       weight_decay = 10**-2)


## container to keep the scores across all epochs
train_scores3 = defaultdict(list)
valid_scores3 = defaultdict(list)


In [None]:
%%time
epochs = 100

for t in range(1, epochs + 1):

    # only report every 10th epoch
    echo = (t % 10 == 0)
    if echo:
        print('Epoch',t )

    # one training pass; append every returned score to its history
    epoch_scores = train(train_dataloader, model3, CEloss, optimizer, get_additional_scores,
                         echo=echo, echo_batch=False)
    for name, value in epoch_scores.items():
        train_scores3[name].append(value)

    # one validation pass, recorded the same way
    epoch_scores = valid(valid_dataloader, model3, CEloss, get_additional_scores, echo=echo)
    for name, value in epoch_scores.items():
        valid_scores3[name].append(value)
print("Done!")

# one panel per metric, filled row by row
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for ax, k in zip(axes.flat, ['loss', 'balanced_accuracy', 'accuracy', 'f1']):
    ax.plot(train_scores3[k], label='train')
    ax.plot(valid_scores3[k], label='validation')
    ax.legend()
    ax.set_xlabel('epoch')
    ax.set_ylabel(k)

In [None]:
valid_scores3['loss'][-1] , valid_scores2['loss'][-1] , valid_scores['loss'][-1]