# Part 1: Prepare the development environment.

In [1]:
# import the required packages

import math
import numpy as np
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.neural_network import MLPClassifier
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from transformers import AutoTokenizer, EsmForSequenceClassification
import warnings

### The following data file is used in this notebook: `test_celegans.csv`


# Part 2: Tokenize the dataset.

In [None]:
def token(data, model_name):
  """Build one combined protein + compound model input per row of ``data``.

  Each row is unpacked into exactly three values: a compound SMILES string,
  a protein sequence and an interaction label. The protein is tokenized with
  the Hugging Face tokenizer named by ``model_name``; the compound is turned
  into a 4-bit Morgan fingerprint (radius 2, chirality-aware) whose bits are
  appended after the protein token ids. The attention mask is extended with
  ones so the fingerprint positions are attended to.

  Arguments:
      data: DataFrame whose rows unpack into (compound, protein, interaction);
          the last column is returned as the labels.
      model_name: name or path passed to ``AutoTokenizer.from_pretrained``.

  Returns:
      ``(X, y)`` where ``X`` is a list of dicts with int32 tensors
      ``'input_ids'`` and ``'attention_mask'`` (each with a leading dimension
      of 1), and ``y`` is the last column of ``data``.

  Raises:
      ValueError: if a compound SMILES string cannot be parsed by RDKit.
  """
  y = data.iloc[:, -1]
  print("labels:", y.shape)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  X = []
  for row in range(len(data)):
    # The interaction label is already captured in ``y``; only the first two
    # fields are needed here.
    compound, protein, _ = data.iloc[row, :]

    # MolFromSmiles returns None for an unparseable SMILES; fail with a
    # message that identifies the offending row instead of an opaque error.
    parsed = Chem.MolFromSmiles(compound)
    if parsed is None:
      raise ValueError(f"row {row}: could not parse SMILES {compound!r}")
    mol = Chem.AddHs(parsed)

    bit_string = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4, useChirality=True).ToBitString()
    fp = np.array([int(bit) for bit in bit_string])

    pn = tokenizer(protein)
    # Wrapping the concatenated sequence in a list adds the leading
    # dimension of size 1.
    input_item = torch.from_numpy(np.array([np.concatenate((pn['input_ids'], fp), axis=0)], dtype='int32'))
    mask_item = torch.from_numpy(np.array([np.concatenate((pn['attention_mask'], np.ones(len(fp))), axis=0)], dtype='int32'))

    X.append({'input_ids': input_item, 'attention_mask': mask_item})

  return X, y

### Future Work: Making compound tokens distinct from protein tokens

In [None]:
# Idea: map the compound fingerprint bits to larger token ids (for example, remapping the bit set {0,1} to {40,50}) so that compound data is distinct from protein data.
# The maximum index in the ESM tokenization currently cannot support this idea, so this project keeps the fingerprint bits as the set {0,1}.

# NOTE(review): `mol` is not defined at module level in the visible code; this
# sketch presumably expects an RDKit molecule prepared as in `token` — confirm.
# Fingerprint bits of `mol` as an integer array of 0/1 values.
fp_original = np.array(list(map(int, AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4, useChirality=True).ToBitString())))
# Remap the bits: positions equal to 1 become 40, all others become 50.
fp_update = np.where(fp_original == 1, 40, 50)

In [None]:
# (1) the order of compound & protein in the combined sequence may impact the final result
# (2) the position of compound inserted in the protein sequence may impact the final result

#input_item= torch.from_numpy(np.array([np.concatenate((fp,pn['input_ids']), axis=0)], dtype='int32'))

# Part 3: Pretrain Protein Sequence Model

In [None]:
def feature_extraction(input, label=None):
  """Run the global ``model`` on one sample without tracking gradients.

  Arguments:
      input: dict of keyword arguments forwarded to ``model`` (it is unpacked
          with ``**input``).
      label: optional scalar label for this sample. When omitted, the label
          is read from the module-level ``y`` at the module-level index ``i``
          (``y.iloc[i]``), which is how the function behaved before this
          parameter existed.

  Returns:
      ``(logits, loss)`` taken from the model output.
  """
  if label is None:
    # Backward-compatible fallback: relies on the caller's loop having set
    # the globals ``y`` and ``i``. Prefer passing ``label`` explicitly.
    label = y.iloc[i]

  with torch.no_grad():
    item = model(**input, labels=torch.tensor([label]))
    #print("The original feature extraction is :","features as",logits,"loss compared with labels as", loss)

  return item.logits, item.loss

In [None]:
# Load the dataset (no header row) and shuffle the rows; no random_state is
# passed to sample(), so the row order is not fixed by this code.
data = pd.read_csv('test_celegans.csv', header=None)
data = data.sample(frac=1)
model_name = "facebook/esm2_t6_8M_UR50D"
inputs,y = token(data,model_name)

# NOTE(review): num_labels is set to the number of rows, so each logits vector
# has len(y) entries and is used below as the feature vector — confirm this is
# intended rather than the number of classes.
model = EsmForSequenceClassification.from_pretrained(model_name, num_labels=len(y))
features = []
# `i` is a module-level name here; feature_extraction reads the globals `i`
# and `y` to look up the label when none is passed, so the loop variable must
# keep this name.
for i in range(len(inputs)):
  input = inputs[i]
  logits, loss = feature_extraction(input)
  # Keep the first row of the logits as this sample's feature vector.
  feature = logits.numpy()[0]
  features.append(feature)

features = pd.DataFrame(features)
# Fit a small MLP classifier on the extracted features and evaluate it on the
# same data it was trained on.
log = MLPClassifier(random_state=42, hidden_layer_sizes=(5, ), alpha=0.01, max_iter=200,  activation='logistic')
log.fit(features,y)
y_pred = log.predict(features)
# Root of the mean squared difference between predicted and true labels.
print("RMSE:", np.sqrt(np.mean((y_pred - y) ** 2)))


### Future Work: LANGUAGE MODELING WITH NN.TRANSFORMER

In [None]:
class TransformerModel(nn.Module):
    """Token embedding, positional encoding, encoder stack and a linear
    projection back to ``ntoken`` scores per position."""

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # Sub-modules are created in this fixed order; it determines the
        # parameter registration order and the random initialisation draws.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        layer = TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_hid,
            dropout=dropout,
        )
        self.transformer_encoder = TransformerEncoder(layer, num_layers=nlayers)
        self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(in_features=d_model, out_features=ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise the embedding and output projection.

        Weights are drawn uniformly from ``[-0.1, 0.1]``; the projection bias
        is set to zero.
        """
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # Scale the embeddings by sqrt(d_model) before adding positions.
        embedded = self.encoder(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Return an ``sz`` x ``sz`` mask: ``-inf`` strictly above the diagonal, zero elsewhere."""
    mask = torch.ones(sz, sz)
    mask = mask * float('-inf')
    # triu zeroes everything on and below the main diagonal.
    return mask.triu(diagonal=1)

In [None]:
# PositionalEncoding module injects some information about the relative or absolute position of the tokens in the sequence. 
# The positional encodings have the same dimension as the embeddings so that the two can be summed. 
# Here, we use sine and cosine functions of different frequencies.

class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine positional encodings to embeddings, then apply dropout.

    Even feature indices hold sines and odd feature indices hold cosines of
    the position scaled by geometrically spaced frequencies. The table is
    precomputed for ``max_len`` positions and stored as the buffer ``pe`` of
    shape ``[max_len, 1, d_model]``.

    Arguments:
        d_model: embedding dimension (even or odd).
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim div_term to match; for even d_model the slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(position * div_term[:d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x`` with the encodings of the first
            ``seq_len`` positions added, followed by dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# reference : https://pytorch.org/tutorials/beginner/transformer_tutorial.html