In [3]:
# Switch the working directory to the project data folder so that the relative
# file names used later in the notebook ("drug.csv", the output pickle) resolve there.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the drive
# is mounted before this cell runs.
%cd /content/drive/MyDrive/MLCB/Project/Data

/content/drive/MyDrive/MLCB/Project/Data


In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import json


In [4]:
# Load the raw drug table; the path is relative to the current working directory.
drug_csv = pd.read_csv("drug.csv")

In [5]:
# Preview the first rows of the loaded table.
drug_csv.head()

Unnamed: 0,index,id,target,enzyme,pathway,smile,name
0,0,DB01296,P14780|Q00653|P01375|P01579|P33673,P33261|P05181,hsa:4318|hsa:4791|hsa:7124|hsa:3458,9|10|14|18|19|20|178|181|283|284|285|286|299|3...,Glucosamine
1,1,DB09230,Q02641,P08684,hsa:782,9|10|11|12|13|14|15|16|18|19|20|129|131|132|17...,Azelnidipine
2,2,DB05812,P05093,P08684|Q06520|P10635|P10632|P05177|P33261|P11712,hsa:1586,9|10|11|12|14|18|143|147|178|179|182|183|184|1...,Abiraterone
3,3,DB01195,Q14524|P35499|Q12809,P10635|P11712,hsa:6331|hsa:6329|hsa:3757,9|10|11|12|14|15|18|19|23|24|25|178|180|181|18...,Flecainide
4,4,DB00201,P30542|P29274|Q07343|P21817|BE0004922|P78527|O...,P20815|P05177|P24462|P08684|P05181|P10632|P117...,hsa:134|hsa:135|hsa:5142|hsa:6261|hsa:5591|hsa...,9|10|11|14|15|16|18|19|143|148|149|178|183|184...,Caffeine


In [6]:
import pandas as pd
import numpy as np

def process_drug_features(drug_df):
    """Translate the pipe-separated ``target``, ``enzyme`` and ``smile`` columns to integer IDs.

    For each of the three columns, every distinct ``|``-separated token found
    in the non-null cells is collected and sorted, and its position in that
    sorted order becomes its ID. Three list-valued columns (``target_ids``,
    ``enzyme_ids``, ``smile_ids``) are then added, holding each row's tokens
    translated to IDs in their original order. Missing cells become ``[]``.

    The input frame is left untouched: the new columns are added to a copy,
    so calling this function repeatedly on the same raw frame is safe.

    Parameters
    ----------
    drug_df : pandas.DataFrame
        Table with ``target``, ``enzyme`` and ``smile`` columns whose non-null
        cells are strings of ``|``-separated tokens.
        NOTE(review): in the notebook preview the ``smile`` cells contain
        numeric codes rather than raw SMILES strings — confirm what they encode.

    Returns
    -------
    tuple
        ``(frame, target_to_id, enzyme_to_id, smiles_to_id)`` where ``frame``
        is a copy of ``drug_df`` with the three ``*_ids`` columns added and
        each mapping is a ``dict`` from token string to integer ID.
    """

    def build_vocab(column):
        """Return a ``{token: id}`` dict covering every token in ``column``."""
        tokens = set()
        for cell in column.dropna():
            tokens.update(cell.split('|'))
        # Sorting makes the ID assignment deterministic. Tokens are strings,
        # so the order is lexicographic (e.g. "10" sorts before "9").
        return {token: idx for idx, token in enumerate(sorted(tokens))}

    def replace_with_ids(value, mapping):
        """Translate one cell to a list of IDs; a missing cell gives ``[]``."""
        if pd.isna(value):
            return []
        return [mapping[v] for v in value.split('|') if v in mapping]

    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    result = drug_df.copy()

    mappings = {}
    for col in ('target', 'enzyme', 'smile'):
        mapping = build_vocab(result[col])
        # Bind ``mapping`` as a default argument so each lambda keeps its own dict.
        result[f'{col}_ids'] = result[col].apply(
            lambda cell, m=mapping: replace_with_ids(cell, m)
        )
        mappings[col] = mapping

    return result, mappings['target'], mappings['enzyme'], mappings['smile']


In [7]:
# Add the *_ids columns and build the token -> ID vocabularies.
# Note that `drug_csv` is rebound to the returned frame.
drug_csv, target_to_id, enzyme_to_id, smiles_to_id = process_drug_features(drug_csv)

In [8]:
# Preview the table again to check the target_ids / enzyme_ids / smile_ids columns.
drug_csv.head()

Unnamed: 0,index,id,target,enzyme,pathway,smile,name,target_ids,enzyme_ids,smile_ids
0,0,DB01296,P14780|Q00653|P01375|P01579|P33673,P33261|P05181,hsa:4318|hsa:4791|hsa:7124|hsa:3458,9|10|14|18|19|20|178|181|283|284|285|286|299|3...,Glucosamine,"[333, 727, 161, 162, 500]","[122, 46]","[580, 2, 15, 43, 54, 65, 41, 45, 94, 95, 96, 9..."
1,1,DB09230,Q02641,P08684,hsa:782,9|10|11|12|13|14|15|16|18|19|20|129|131|132|17...,Azelnidipine,[735],[56],"[580, 2, 3, 7, 11, 15, 23, 32, 43, 54, 65, 10,..."
2,2,DB05812,P05093,P08684|Q06520|P10635|P10632|P05177|P33261|P11712,hsa:1586,9|10|11|12|14|18|143|147|178|179|182|183|184|1...,Abiraterone,[192],"[56, 160, 64, 63, 45, 122, 69]","[580, 2, 3, 7, 15, 43, 16, 20, 41, 42, 46, 47,..."
3,3,DB01195,Q14524|P35499|Q12809,P10635|P11712,hsa:6331|hsa:6329|hsa:3757,9|10|11|12|14|15|18|19|23|24|25|178|180|181|18...,Flecainide,"[819, 517, 776]","[64, 69]","[580, 2, 3, 7, 15, 23, 43, 54, 84, 87, 88, 41,..."
4,4,DB00201,P30542|P29274|Q07343|P21817|BE0004922|P78527|O...,P20815|P05177|P24462|P08684|P05181|P10632|P117...,hsa:134|hsa:135|hsa:5142|hsa:6261|hsa:5591|hsa...,9|10|11|14|15|16|18|19|143|148|149|178|183|184...,Caffeine,"[483, 462, 760, 398, 20, 707, 54, 559, 560, 19...","[91, 45, 104, 56, 46, 63, 69, 39, 169, 64]","[580, 2, 3, 15, 23, 32, 43, 54, 16, 21, 22, 41..."


In [9]:
import numpy as np
import pandas as pd
import torch

class FeatureExtractor:
    """Turn per-drug ID lists into fixed-length binary presence vectors.

    Each drug gets three ``float32`` vectors (targets, enzymes, smile tokens).
    Position ``i`` of a vector is ``1.0`` when ID ``i`` appears in the drug's
    ID list and ``0.0`` otherwise. Several positions can be set at once, so
    these are multi-hot rather than strictly one-hot encodings.

    Parameters
    ----------
    drug_df : pandas.DataFrame
        Must provide the columns ``id``, ``target_ids``, ``enzyme_ids`` and
        ``smile_ids``; the three ``*_ids`` columns hold lists of integer IDs.
    target_to_id, enzyme_to_id, smiles_to_id : dict
        Vocabulary mappings. Only their sizes are used here, to fix the
        length of the corresponding vectors.
    """

    def __init__(self, drug_df, target_to_id, enzyme_to_id, smiles_to_id):
        self.drug_df = drug_df
        self.target_to_id = target_to_id
        self.enzyme_to_id = enzyme_to_id
        self.smiles_to_id = smiles_to_id

        # Vector lengths are fixed by the vocabulary sizes.
        self.num_targets = len(target_to_id)
        self.num_enzymes = len(enzyme_to_id)
        self.num_smiles = len(smiles_to_id)

    @staticmethod
    def _multi_hot(ids, size):
        """Return a ``float32`` vector of length ``size`` with ``1.0`` at every index in ``ids``.

        Raises
        ------
        IndexError
            If any ID is negative or ``>= size``. Negative IDs are rejected
            explicitly because NumPy would otherwise wrap them around and
            silently set a position counted from the end of the vector.
        """
        vec = np.zeros(size, dtype=np.float32)
        idx = np.asarray(list(ids), dtype=np.intp)
        if idx.size:
            if idx.min() < 0 or idx.max() >= size:
                raise IndexError(
                    f"feature ID out of range for vector of length {size}: "
                    f"min={idx.min()}, max={idx.max()}"
                )
            vec[idx] = 1.0
        return vec

    def encode_targets(self, target_ids):
        """Multi-hot encode a list of target IDs (length ``num_targets``)."""
        return self._multi_hot(target_ids, self.num_targets)

    def encode_enzymes(self, enzyme_ids):
        """Multi-hot encode a list of enzyme IDs (length ``num_enzymes``)."""
        return self._multi_hot(enzyme_ids, self.num_enzymes)

    def encode_smiles(self, smile_ids):
        """Multi-hot encode a list of smile token IDs (length ``num_smiles``)."""
        return self._multi_hot(smile_ids, self.num_smiles)

    def extract_features(self):
        """Encode every row of ``drug_df``.

        Returns
        -------
        dict
            Maps each value of the ``id`` column to a dict with keys
            ``'target'``, ``'enzyme'`` and ``'smiles'`` holding the
            corresponding vectors. If an ``id`` occurs more than once, the
            last row wins.
        """
        frame = self.drug_df
        features = {}
        # Iterate the needed columns directly instead of building a Series per row.
        for drug_id, target_ids, enzyme_ids, smile_ids in zip(
            frame['id'], frame['target_ids'], frame['enzyme_ids'], frame['smile_ids']
        ):
            features[drug_id] = {
                'target': self.encode_targets(target_ids),
                'enzyme': self.encode_enzymes(enzyme_ids),
                'smiles': self.encode_smiles(smile_ids),
            }
        return features


In [11]:
# Build the extractor from the processed table and the three vocabularies;
# the vocabulary sizes determine the length of each encoded vector.
extractor = FeatureExtractor(
    drug_df=drug_csv,
    target_to_id=target_to_id,
    enzyme_to_id=enzyme_to_id,
    smiles_to_id=smiles_to_id
)


In [12]:
# Encode every drug: maps drug id -> {'target', 'enzyme', 'smiles'} vectors.
drug_features = extractor.extract_features()

In [13]:
# Spot-check the encoded vectors for a single drug.
drug_features["DB01296"]

{'target': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'enzyme': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32)

`drug_features` now holds, for each drug, binary vectors marking the presence (1) or absence (0) of every target, enzyme, and smile token.


In [16]:
import pickle

def save_drug_features(drug_features, filename="drug_features_onehot.pkl"):
    """Pickle ``drug_features`` to disk.

    Parameters
    ----------
    drug_features : object
        The object to serialize (in this notebook, the per-drug feature dict).
    filename : str
        Destination path; an existing file is overwritten because it is
        opened in ``'wb'`` mode.

    Returns
    -------
    None
    """
    # The file must be opened in binary mode for pickle output.
    with open(filename, mode='wb') as sink:
        pickle.dump(drug_features, sink)

In [17]:
# Persist the encoded features; the relative path resolves against the current working directory.
save_drug_features(drug_features, "drug_features_onehot.pkl")