In [None]:
pip install pubchempy

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13818 sha256=d9fa6b495552cb5b349755a179a2c01f606691beac98cede0672ca16070afd8c
  Stored in directory: /root/.cache/pip/wheels/8b/e3/6c/3385b2db08b0985a87f5b117f98d0cb61a3ae3ca3bcbbd8307
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [None]:
import pandas as pd
import pubchempy as pcp

# Step 1: Load the drug table. Adjust the path to your own file; the lookup
# step later in this cell reads the drug names from its 'name' column.
df = pd.read_csv('Processed_Drug.csv')  # Replace with your actual filename

# Step 2: Define a function to get SMILES from molecule name
def name_to_smiles(name):
    """Return the canonical SMILES of the first PubChem hit for `name`.

    Args:
        name: Molecule name to search for. Anything that is not a non-blank
            string (e.g. NaN from a missing CSV cell) is treated as "no name".

    Returns:
        The `canonical_smiles` attribute of the first compound returned by
        `pcp.get_compounds(name, 'name')`, or None when the name is missing,
        the search returns nothing, or the lookup raises an ordinary error.
    """
    # Missing or blank names cannot match anything; skip the lookup entirely.
    if not isinstance(name, str) or not name.strip():
        return None

    try:
        compounds = pcp.get_compounds(name, 'name')
        return compounds[0].canonical_smiles if compounds else None
    except Exception:
        # Best-effort lookup: one failing name must not abort the whole column.
        # Catching Exception (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply() can still be stopped.
        return None

# Step 3: Look up a SMILES string for every entry of the 'name' column
df['SMILES'] = df['name'].apply(name_to_smiles)  # Replace 'name' with your column name

# Step 4: Save the new CSV with SMILES.
# The file name has to match both the message printed below and the
# "pro_drug_with_smiles.csv" that the feature-extraction cells read back.
df.to_csv('pro_drug_with_smiles.csv', index=False)

print("Done! Check 'pro_drug_with_smiles.csv'.")


Done! Check 'pro_drug_with_smiles.csv'.


In [None]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [None]:
# --- Feature Extraction Notebook (Improved) ---

# Install dependencies if needed
# pip install transformers rdkit torch pandas numpy

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import json
import pickle

from rdkit import Chem

# --- Step 1: Load Data ---


In [None]:
# Read CSV into `drug_csv`, the name every later cell uses for the drug table.
# (Previously the frame was only bound to `df`, so the following cells failed
# with a NameError on a fresh kernel.)
drug_csv = pd.read_csv("pro_drug_with_smiles.csv")

# Fix column
drug_csv['smile'] = drug_csv['SMILES']  # Copy correct SMILES into the expected column

# Keep the old name bound to the same frame for any code that still refers to it.
df = drug_csv

# Now proceed as before

In [None]:
# --- Step 2: Preprocessing: Build ID mappings ---

def process_drug_features(drug_df):
    """Attach integer-ID list columns for the 'target' and 'enzyme' columns.

    Each non-null cell of those columns is read as a '|'-separated list of
    tokens. Every distinct token gets an integer ID equal to its position in
    the sorted token list of its column.

    Args:
        drug_df: DataFrame with 'target' and 'enzyme' columns. It is modified
            in place: 'target_ids' and 'enzyme_ids' columns are added.

    Returns:
        Tuple of (drug_df, target_to_id, enzyme_to_id), where the two dicts
        map token -> integer ID.
    """
    def build_lookup(column):
        # Union of all tokens found in the non-null cells, numbered in sorted order.
        tokens = {token for cell in column.dropna() for token in cell.split('|')}
        return {token: position for position, token in enumerate(sorted(tokens))}

    def to_ids(cell, lookup):
        # Missing cells become an empty list; known tokens keep their cell order.
        if pd.isna(cell):
            return []
        return [lookup[token] for token in cell.split('|') if token in lookup]

    target_to_id = build_lookup(drug_df['target'])
    enzyme_to_id = build_lookup(drug_df['enzyme'])

    for source, destination, lookup in (
        ('target', 'target_ids', target_to_id),
        ('enzyme', 'enzyme_ids', enzyme_to_id),
    ):
        # Bind `lookup` as a default argument so each lambda uses its own mapping.
        drug_df[destination] = drug_df[source].apply(lambda cell, lookup=lookup: to_ids(cell, lookup))

    return drug_df, target_to_id, enzyme_to_id

# Process the data: adds 'target_ids' / 'enzyme_ids' columns to drug_csv and
# returns the token -> integer ID lookups for targets and enzymes.
drug_csv, target_to_id, enzyme_to_id = process_drug_features(drug_csv)

In [None]:
# --- Step 3: Feature Extraction Class ---

class FeatureExtractor:
    """Builds per-drug feature vectors from ID lists and SMILES strings.

    For every row of `drug_df` the extractor produces a multi-hot vector for
    the 'target_ids' column, a multi-hot vector for the 'enzyme_ids' column,
    and the first-token hidden state of a pretrained transformer applied to
    the 'smile' column.
    """

    def __init__(self, drug_df, target_to_id, enzyme_to_id, smiles_model_name="seyonec/ChemBERTa-zinc-base-v1", batch_size=32):
        """Store the inputs and load the tokenizer/model used for SMILES.

        Args:
            drug_df: DataFrame with 'id', 'smile', 'target_ids' and
                'enzyme_ids' columns (the columns read by extract_features).
            target_to_id: Mapping of target token -> integer ID; its length
                fixes the size of the target vector.
            enzyme_to_id: Mapping of enzyme token -> integer ID; its length
                fixes the size of the enzyme vector.
            smiles_model_name: Model identifier passed to
                AutoTokenizer/AutoModel.from_pretrained.
            batch_size: Number of SMILES strings encoded per model call.
        """
        self.drug_df = drug_df
        self.target_to_id = target_to_id
        self.enzyme_to_id = enzyme_to_id
        self.num_targets = len(target_to_id)
        self.num_enzymes = len(enzyme_to_id)

        self.tokenizer = AutoTokenizer.from_pretrained(smiles_model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModel.from_pretrained(smiles_model_name).to(self.device)
        self.model.eval()

        self.batch_size = batch_size

    @staticmethod
    def _multi_hot(indices, size):
        """Return a float32 vector of length `size` with 1.0 at each index."""
        vec = np.zeros(size, dtype=np.float32)
        for idx in indices:
            vec[idx] = 1.0
        return vec

    def encode_targets(self, target_ids):
        """Multi-hot encode a list of target IDs (length = number of targets)."""
        return self._multi_hot(target_ids, self.num_targets)

    def encode_enzymes(self, enzyme_ids):
        """Multi-hot encode a list of enzyme IDs (length = number of enzymes)."""
        return self._multi_hot(enzyme_ids, self.num_enzymes)

    def encode_smiles_batch(self, smiles_list):
        """Embed a list of SMILES strings with the loaded model.

        Returns:
            NumPy array with one row per input string: the hidden state at
            sequence position 0 of the model's last layer.
        """
        inputs = self.tokenizer(smiles_list, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # (batch_size, hidden_dim)
        return cls_embeddings.cpu().numpy()

    def extract_features(self):
        """Compute the feature dict for every drug in `self.drug_df`.

        Drugs whose 'smile' value is missing (NaN, blank, or not a string)
        are not sent to the model and receive an all-zero SMILES vector.
        Previously such values were replaced by "" and encoded anyway, which
        made the zero-vector fallback unreachable and gave those drugs the
        embedding of an empty sequence.

        Returns:
            Dict mapping drug id -> {'target': ..., 'enzyme': ..., 'smiles': ...}
            with one NumPy vector per key. If an id occurs more than once,
            the last row wins.
        """
        # Keep only the (id, SMILES) pairs that can actually be encoded.
        encodable = [
            (drug_id, smile)
            for drug_id, smile in zip(self.drug_df['id'], self.drug_df['smile'])
            if isinstance(smile, str) and smile.strip()
        ]

        # Batch SMILES encoding
        smiles_embeddings = {}
        for start in range(0, len(encodable), self.batch_size):
            batch = encodable[start:start + self.batch_size]
            batch_embeddings = self.encode_smiles_batch([smile for _, smile in batch])
            for (drug_id, _), emb in zip(batch, batch_embeddings):
                smiles_embeddings[drug_id] = emb

        # Build feature dict
        features = {}
        for _, row in self.drug_df.iterrows():
            drug_id = row['id']
            smiles_vec = smiles_embeddings.get(drug_id)
            if smiles_vec is None:
                # No usable SMILES for this drug: fall back to a zero vector
                # of the model's hidden size.
                smiles_vec = np.zeros(self.model.config.hidden_size, dtype=np.float32)

            features[drug_id] = {
                'target': self.encode_targets(row['target_ids']),
                'enzyme': self.encode_enzymes(row['enzyme_ids']),
                'smiles': smiles_vec
            }

        return features


In [None]:
# --- Step 4: Run Feature Extraction ---

# Build the extractor from the processed drug table and the two ID lookups;
# the SMILES model name and batch size are left at their defaults.
extractor = FeatureExtractor(drug_csv, target_to_id, enzyme_to_id)

# Maps each drug id to its 'target', 'enzyme' and 'smiles' vectors.
drug_features = extractor.extract_features()

# --- Step 5: Save Features ---

def save_drug_features(features, filename="drug_features_embedded.pkl"):
    """Pickle `features` into the file at `filename`, overwriting it if present."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(features)

# Write the features using save_drug_features' default file name, which the message below repeats.
save_drug_features(drug_features)

print("Feature extraction complete. Features saved to 'drug_features_embedded.pkl'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/179M [00:00<?, ?B/s]

Feature extraction complete. Features saved to 'drug_features_embedded.pkl'
