In [1]:
# Switch the working directory to the project's data folder so that the
# relative file names used later ("pro_drug_with_smiles.csv",
# "drug_features.pkl") resolve against it.
# NOTE(review): the path looks like a Google Drive mount in Colab — confirm
# the drive is mounted before running this cell.
%cd /content/drive/MyDrive/MLCB/Project/Data

/content/drive/MyDrive/MLCB/Project/Data


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import json
from sklearn.preprocessing import normalize

In [None]:
# Load the drug table and discard the stale ID columns plus the lowercase
# "smile" column in one chained expression; the ID columns are rebuilt later
# by process_drug_features.
drug_csv = (
    pd.read_csv("pro_drug_with_smiles.csv")
    .drop(columns=["target_ids", "enzyme_ids", "smile"])
)

In [None]:
# Display the columns that remain after the drop above.
drug_csv.columns

Index(['index', 'id', 'target', 'enzyme', 'pathway', 'name', 'SMILES'], dtype='object')

In [None]:
def process_drug_features(drug_df):
    """Attach integer-ID lists for each drug's targets and enzymes.

    Every distinct '|'-separated identifier found in the ``target`` column
    (and, independently, in the ``enzyme`` column) receives an ID equal to
    its position in the sorted list of distinct identifiers, so the mapping
    is deterministic for a given input.

    Args:
        drug_df: DataFrame with ``target`` and ``enzyme`` columns holding
            '|'-separated identifier strings; missing values are allowed.

    Returns:
        A tuple ``(df, target_to_id, enzyme_to_id)``:
          * ``df`` is a copy of ``drug_df`` with two added columns,
            ``target_ids`` and ``enzyme_ids``, each a list of ints (empty
            list for a missing value). The input frame is not modified.
          * ``target_to_id`` / ``enzyme_to_id`` map identifier -> int ID.
    """
    def split_tokens(value):
        # Missing cells contribute no identifiers. Empty fragments (e.g. from
        # a trailing or doubled '|') are dropped so that '' never becomes an
        # identifier with its own ID.
        if pd.isna(value):
            return []
        return [token for token in value.split('|') if token]

    def build_vocab(column):
        # Sorted order makes the identifier -> ID assignment reproducible.
        distinct = set()
        for value in column.dropna():
            distinct.update(split_tokens(value))
        return {token: idx for idx, token in enumerate(sorted(distinct))}

    def to_ids(value, mapping):
        # Every token was seen while building the vocabulary, so a direct
        # lookup is safe.
        return [mapping[token] for token in split_tokens(value)]

    target_to_id = build_vocab(drug_df['target'])
    enzyme_to_id = build_vocab(drug_df['enzyme'])

    # Work on a copy so the caller's frame is not mutated as a side effect.
    result = drug_df.copy()
    result['target_ids'] = result['target'].apply(lambda v: to_ids(v, target_to_id))
    result['enzyme_ids'] = result['enzyme'].apply(lambda v: to_ids(v, enzyme_to_id))

    return result, target_to_id, enzyme_to_id


In [None]:
# Build the target/enzyme vocabularies and add the target_ids / enzyme_ids
# list columns; drug_csv is rebound to the frame returned by the function.
drug_csv,target_to_id,enzyme_to_id = process_drug_features(drug_csv)

In [None]:
# Preview the first rows to check the newly added target_ids / enzyme_ids columns.
drug_csv.head()

Unnamed: 0,index,id,target,enzyme,pathway,name,SMILES,target_ids,enzyme_ids
0,0,DB01296,P14780|Q00653|P01375|P01579|P33673,P33261|P05181,hsa:4318|hsa:4791|hsa:7124|hsa:3458,Glucosamine,C(C1C(C(C(C(O1)O)N)O)O)O,"[333, 727, 161, 162, 500]","[122, 46]"
1,1,DB09230,Q02641,P08684,hsa:782,Azelnidipine,CC1=C(C(C(=C(N1)N)C(=O)OC2CN(C2)C(C3=CC=CC=C3)...,[735],[56]
2,2,DB05812,P05093,P08684|Q06520|P10635|P10632|P05177|P33261|P11712,hsa:1586,Abiraterone,CC12CCC(CC1=CCC3C2CCC4(C3CC=C4C5=CN=CC=C5)C)O,[192],"[56, 160, 64, 63, 45, 122, 69]"
3,3,DB01195,Q14524|P35499|Q12809,P10635|P11712,hsa:6331|hsa:6329|hsa:3757,Flecainide,C1CCNC(C1)CNC(=O)C2=C(C=CC(=C2)OCC(F)(F)F)OCC(...,"[819, 517, 776]","[64, 69]"
4,4,DB00201,P30542|P29274|Q07343|P21817|BE0004922|P78527|O...,P20815|P05177|P24462|P08684|P05181|P10632|P117...,hsa:134|hsa:135|hsa:5142|hsa:6261|hsa:5591|hsa...,Caffeine,CN1C=NC2=C1C(=O)N(C(=O)N2C)C,"[483, 462, 760, 398, 20, 707, 54, 559, 560, 19...","[91, 45, 104, 56, 46, 63, 69, 39, 169, 64]"


In [None]:
class FeatureExtractor:
    """Build per-drug feature vectors from target/enzyme IDs and SMILES strings.

    For each row of ``drug_df`` the extractor produces:
      * a binary indicator (multi-hot) vector over all targets,
      * a binary indicator (multi-hot) vector over all enzymes,
      * the first-token ([CLS]) embedding of the row's SMILES string, taken
        from a pretrained Hugging Face model.
    """

    def __init__(self, drug_df, target_to_id, enzyme_to_id, smiles_model_name="seyonec/ChemBERTa-zinc-base-v1"):
        """Store the inputs and load the SMILES tokenizer and model.

        Args:
            drug_df: DataFrame read by ``extract_features``; it must provide
                the columns ``id``, ``target_ids``, ``enzyme_ids`` and
                ``SMILES``.
            target_to_id: Mapping target identifier -> integer ID; its length
                sets the size of the target indicator vector.
            enzyme_to_id: Mapping enzyme identifier -> integer ID; its length
                sets the size of the enzyme indicator vector.
            smiles_model_name: Name passed to ``AutoTokenizer.from_pretrained``
                and ``AutoModel.from_pretrained``.
        """
        self.drug_df = drug_df
        self.target_to_id = target_to_id
        self.enzyme_to_id = enzyme_to_id
        self.num_targets = len(target_to_id)
        self.num_enzymes = len(enzyme_to_id)
        self.tokenizer = AutoTokenizer.from_pretrained(smiles_model_name)
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModel.from_pretrained(smiles_model_name).to(self.device)
        # The model is only used for inference here.
        self.model.eval()

    @staticmethod
    def _multi_hot(ids, size):
        """Return a float vector of length ``size`` with 1 at every index in ``ids``."""
        vec = np.zeros(size)
        positions = list(ids)
        # Skip the assignment for an empty list; there is nothing to mark.
        if positions:
            vec[positions] = 1
        return vec

    def _cls_embedding(self, smiles):
        """Tokenize ``smiles``, run the model, and return the first-token embedding as a 1-D NumPy array."""
        # Inputs longer than 512 tokens are truncated by the tokenizer.
        encoded = self.tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: tensor.to(self.device) for key, tensor in encoded.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Position 0 of the sequence is the [CLS] token; the batch holds a
        # single sequence, so squeeze the batch axis away.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return cls_embedding.squeeze(0).cpu().numpy()

    def encode_targets(self, target_ids):
        """Multi-hot encode a list of target IDs into a vector of length ``num_targets``."""
        return self._multi_hot(target_ids, self.num_targets)

    def encode_enzymes(self, enzyme_ids):
        """Multi-hot encode a list of enzyme IDs into a vector of length ``num_enzymes``."""
        return self._multi_hot(enzyme_ids, self.num_enzymes)

    def encode_smiles(self, smiles):
        """Encode a SMILES string as the model's [CLS] token embedding.

        A missing value (``pd.isna``) yields an all-zero vector of length
        ``model.config.hidden_size`` without running the model.
        """
        if pd.isna(smiles):
            return np.zeros(self.model.config.hidden_size)
        return self._cls_embedding(smiles)

    def encode_smiles_n(self, smiles):
        """Encode a SMILES string as the L2-normalized [CLS] token embedding.

        A missing value (``pd.isna``) yields an all-zero vector of length
        ``model.config.hidden_size`` without running the model.
        """
        if pd.isna(smiles):
            return np.zeros(self.model.config.hidden_size)

        embedding = self._cls_embedding(smiles)
        # sklearn's normalize works on 2-D input, hence the reshape round trip.
        return normalize(embedding.reshape(1, -1), norm='l2').squeeze(0)

    def extract_features(self):
        """Compute the feature dict for every row of ``drug_df``.

        Returns:
            Dict keyed by the row's ``id`` value. Each entry is a dict with
            keys ``'target'``, ``'enzyme'`` and ``'smiles'`` holding the two
            indicator vectors and the (un-normalized) SMILES embedding. If
            an ``id`` occurs more than once, the last row wins.
        """
        features = {}

        # The SMILES model is run once per row.
        for _, row in self.drug_df.iterrows():
            features[row['id']] = {
                'target': self.encode_targets(row['target_ids']),
                'enzyme': self.encode_enzymes(row['enzyme_ids']),
                'smiles': self.encode_smiles(row['SMILES']),
            }

        return features


In [None]:
# Construct the extractor with the default SMILES model; the constructor
# loads the tokenizer and model and moves the model to the GPU if available.
extractor = FeatureExtractor(
    drug_df=drug_csv,
    target_to_id=target_to_id,
    enzyme_to_id=enzyme_to_id
)


In [None]:
# Compute target / enzyme / SMILES features for every drug, keyed by drug id.
# The SMILES model is invoked once per row of drug_csv.
drug_features = extractor.extract_features()

In [None]:
# Spot-check the features of one drug (DB01296 is row 0 in the preview above).
drug_features["DB01296"]

{'target': array([0., 0., 0., ..., 0., 0., 0.]),
 'enzyme': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'smiles': array([ 3.52851376e-02, 

DRUG FEATURES RIGHT NOW HOLD A ONE-HOT-STYLE (MULTI-HOT) ENCODING OF THE PRESENCE OR ABSENCE OF EACH TARGET AND EACH ENZYME


In [None]:
import pickle

def save_drug_features(drug_features, filename="drug_features.pkl"):
    """Pickle ``drug_features`` to ``filename``.

    The object is serialized in memory before the file is opened, so a
    pickling failure raises without truncating a file that already exists
    at ``filename``.

    Args:
        drug_features: Object to serialize (here, the per-drug feature dict).
        filename: Destination path; an existing file is overwritten once
            serialization has succeeded.

    Raises:
        Whatever ``pickle.dumps`` raises for an unpicklable object, and
        ``OSError`` if the file cannot be opened or written.
    """
    payload = pickle.dumps(drug_features)
    with open(filename, 'wb') as f:
        f.write(payload)

In [None]:
# Persist the extracted features to "drug_features.pkl" (relative to the
# current working directory).
save_drug_features(drug_features, "drug_features.pkl")