In [12]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MolFromSmiles

def load_dataset(file_path):
    """
    Read the dataset CSV at ``file_path`` and split it into parallel arrays.

    The file must provide the columns 'compound_iso_smiles', 'target_sequence'
    and 'Ki , Kd and IC50  (KIBA Score)'.

    Returns:
        tuple: ``(smiles, sequences, kiba_scores)`` -- the values of those
        three columns, in row order.
    """
    frame = pd.read_csv(file_path)
    wanted_columns = (
        'compound_iso_smiles',
        'target_sequence',
        'Ki , Kd and IC50  (KIBA Score)',
    )
    smiles, sequences, kiba_scores = (frame[column].values for column in wanted_columns)
    return smiles, sequences, kiba_scores

def atom_features(atom):
    """Describe one atom as a 5-element NumPy vector.

    Entries, in order: atomic number, degree, total hydrogen count,
    implicit valence, aromaticity flag.
    """
    atomic_number = atom.GetAtomicNum()
    degree = atom.GetDegree()
    hydrogen_count = atom.GetTotalNumHs()
    implicit_valence = atom.GetImplicitValence()
    is_aromatic = atom.GetIsAromatic()
    return np.array([atomic_number, degree, hydrogen_count, implicit_valence, is_aromatic])

def smile_to_graph(smile):
    """
    Convert a SMILES string into a molecular graph.

    Args:
        smile (str): SMILES representation of the molecule.

    Returns:
        tuple: ``(c_size, features, adjacency_matrix)`` where ``c_size`` is the
        number of atoms, ``features`` stacks ``atom_features`` for every atom
        (one row per atom), and ``adjacency_matrix`` is a symmetric
        ``c_size x c_size`` array with 1 for every bonded atom pair.

    Raises:
        ValueError: if the SMILES string cannot be parsed into a molecule.
    """
    mol = MolFromSmiles(smile)
    # MolFromSmiles signals a parse failure by returning None; fail with a
    # message that names the offending string instead of an AttributeError.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smile!r}")

    c_size = mol.GetNumAtoms()
    features = np.array([atom_features(atom) for atom in mol.GetAtoms()])

    adjacency_matrix = np.zeros((c_size, c_size))
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # Bonds are undirected, so mark both directions.
        adjacency_matrix[begin, end] = 1
        adjacency_matrix[end, begin] = 1

    return c_size, features, adjacency_matrix

def process_smiles(smiles_list):
    """
    Turn every SMILES string in ``smiles_list`` into a molecular graph.

    Returns a list holding the result of ``smile_to_graph`` for each entry,
    in input order.
    """
    return [smile_to_graph(smile) for smile in smiles_list]

# Location of the dataset CSV; point this at your own copy if it lives elsewhere
file_path = 'KIBA.csv'
smiles, sequences, kiba_scores = load_dataset(file_path)

# Build one (atom count, atom features, adjacency matrix) tuple per SMILES string
molecular_graphs = process_smiles(smiles)

# Show the three parts of the first molecule's graph
first_atom_count, first_atom_features, first_adjacency = molecular_graphs[0]
print("Number of atoms in first molecule:", first_atom_count)
print("Atom features of first molecule:\n", first_atom_features)
print("Adjacency matrix of first molecule:\n", first_adjacency)


ModuleNotFoundError: No module named 'pandas'

In [6]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolFromSmiles
import pconsc4

# 1. Load the dataset
def load_dataset(file_path):
    """
    Read the dataset CSV at ``file_path`` and return its relevant columns.

    Required columns: 'compound_iso_smiles', 'target_sequence',
    'Ki , Kd and IC50  (KIBA Score)' and 'ProteinID'.

    Returns:
        tuple: ``(smiles, sequences, kiba_scores, protein_ids)`` -- the values
        of those four columns, in row order.
    """
    table = pd.read_csv(file_path)
    column_order = [
        'compound_iso_smiles',
        'target_sequence',
        'Ki , Kd and IC50  (KIBA Score)',
        'ProteinID',  # Assuming ProteinID is available
    ]
    return tuple(table[name].values for name in column_order)

# 2. Process SMILES strings into molecular graphs
def atom_features(atom):
    """Build the per-atom feature vector used for molecular graphs.

    The returned NumPy array holds, in order: atomic number, degree,
    total hydrogen count, implicit valence, aromaticity flag.
    """
    getters = (
        atom.GetAtomicNum,
        atom.GetDegree,
        atom.GetTotalNumHs,
        atom.GetImplicitValence,
        atom.GetIsAromatic,
    )
    return np.array([getter() for getter in getters])

def smile_to_graph(smile):
    """
    Convert a SMILES string into a molecular graph.

    Args:
        smile (str): SMILES representation of the molecule.

    Returns:
        tuple: ``(c_size, features, adjacency_matrix)`` where ``c_size`` is the
        number of atoms, ``features`` stacks ``atom_features`` for every atom
        (one row per atom), and ``adjacency_matrix`` is a symmetric
        ``c_size x c_size`` array with 1 for every bonded atom pair.

    Raises:
        ValueError: if the SMILES string cannot be parsed into a molecule.
    """
    mol = MolFromSmiles(smile)
    # MolFromSmiles signals a parse failure by returning None; fail with a
    # message that names the offending string instead of an AttributeError.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smile!r}")

    c_size = mol.GetNumAtoms()
    features = np.array([atom_features(atom) for atom in mol.GetAtoms()])

    adjacency_matrix = np.zeros((c_size, c_size))
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # Bonds are undirected, so mark both directions.
        adjacency_matrix[begin, end] = 1
        adjacency_matrix[end, begin] = 1

    return c_size, features, adjacency_matrix

def process_smiles(smiles_list):
    """
    Map ``smile_to_graph`` over ``smiles_list``.

    Returns a list with one graph per SMILES string, preserving input order.
    """
    return [smile_to_graph(entry) for entry in smiles_list]

# 3. Generate and load contact maps
def pconsc4Prediction(aln_dir, output_dir):
    """
    Generate contact maps using pconsc4 and save them as .npy files.

    Only files in ``aln_dir`` whose name ends with ``.a3m`` are treated as
    alignments. This matters when ``aln_dir`` and ``output_dir`` are the same
    directory: previously written ``.npy`` maps (and any other stray file)
    are not fed back to the predictor. Alignments whose output file already
    exists are skipped, and the model is only loaded when at least one
    alignment still needs a prediction.

    A failure on one alignment is reported with ``print`` and does not stop
    the remaining alignments from being processed.

    Args:
        aln_dir (str): Directory where the alignment files (.a3m) are stored.
        output_dir (str): Directory where the contact maps will be saved.
    """
    suffix = '.a3m'
    os.makedirs(output_dir, exist_ok=True)

    # Collect (alignment, output) pairs that still need a prediction.
    pending = []
    for file in sorted(os.listdir(aln_dir)):
        if not file.endswith(suffix):
            continue
        output_file = os.path.join(output_dir, file[:-len(suffix)] + '.npy')
        if os.path.exists(output_file):
            continue
        pending.append((os.path.join(aln_dir, file), output_file))

    if not pending:
        return

    model = pconsc4.get_pconsc4()
    for input_file, output_file in pending:
        try:
            print('Processing', input_file)
            pred = pconsc4.predict(model, input_file)
            np.save(output_file, pred['cmap'])
            print(output_file, 'completed.')
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(output_file, 'error:', e)

def load_contact_map(contact_map_dir, target_key):
    """
    Read a protein's contact map from disk.

    The map is expected at ``<contact_map_dir>/<target_key>.npy``; the array
    stored in that file is returned as loaded by ``np.load``.
    """
    file_name = target_key + '.npy'
    return np.load(os.path.join(contact_map_dir, file_name))

def target_to_graph(target_key, target_sequence, contact_map_dir, aln_dir, threshold=0.5):
    """
    Convert a protein sequence into a graph using a contact map.

    Args:
        target_key (str): Protein identifier; the contact map is looked up as
            ``<contact_map_dir>/<target_key>.npy``.
        target_sequence (str): Residue sequence of the protein.
        contact_map_dir (str): Directory holding (or receiving) contact maps.
        aln_dir (str): Directory passed to ``pconsc4Prediction`` when the
            contact map has to be generated first.
        threshold (float): Minimum value (after self-loops are added) for a
            residue pair to become an edge. Defaults to 0.5.

    Returns:
        tuple: ``(num_residues, features, edge_index)`` where ``num_residues``
        is ``len(target_sequence)``, ``features`` has one row per residue and
        ``edge_index`` has one ``[row, col]`` pair per edge.
    """
    # Generate the contact map if it does not exist
    if not os.path.exists(os.path.join(contact_map_dir, target_key + '.npy')):
        print(f"Generating contact map for {target_key}...")
        pconsc4Prediction(aln_dir, contact_map_dir)

    contact_map = load_contact_map(contact_map_dir, target_key)

    # Add the identity so every residue gets a self-loop. Done out of place:
    # an in-place ``+=`` fails with a casting error when the stored map has an
    # integer or boolean dtype.
    contact_map = contact_map + np.eye(contact_map.shape[0])

    # Every entry at or above the threshold becomes a (row, col) edge.
    index_row, index_col = np.where(contact_map >= threshold)
    edge_index = np.array([index_row, index_col]).T

    # Placeholder residue features: the same 4-element vector for every residue.
    features = np.array([[1, 0, 0, 0] for _ in target_sequence])

    return len(target_sequence), features, edge_index

# 4. Integrate everything for processing the dataset
def process_dataset(file_path, contact_map_dir, aln_dir):
    """
    Load the dataset and build the molecule and protein graphs for every row.

    Args:
        file_path (str): Path of the dataset CSV, read with ``load_dataset``.
        contact_map_dir (str): Directory holding (or receiving) contact maps.
        aln_dir (str): Directory containing the alignment files.

    Returns:
        tuple: ``(molecular_graphs, protein_graphs, kiba_scores)``. The two
        graph lists have one entry per dataset row. Rows that share the same
        protein id and sequence share the same protein-graph object, because
        each distinct protein is converted only once.
    """
    # Load the dataset
    smiles, sequences, kiba_scores, protein_ids = load_dataset(file_path)

    # Process SMILES strings into molecular graphs
    molecular_graphs = process_smiles(smiles)

    # Process protein sequences into protein graphs. The same protein appears
    # on many rows, so build each distinct (id, sequence) pair only once
    # instead of reloading its contact map for every row.
    graph_cache = {}
    protein_graphs = []
    for target_key, sequence in zip(protein_ids, sequences):
        cache_key = (target_key, sequence)
        if cache_key not in graph_cache:
            graph_cache[cache_key] = target_to_graph(target_key, sequence, contact_map_dir, aln_dir)
        protein_graphs.append(graph_cache[cache_key])

    return molecular_graphs, protein_graphs, kiba_scores

# Example run of the full pipeline
file_path = 'KIBA.csv'  # Dataset CSV; change this to wherever your copy lives
contact_map_dir = '/saved'  # Contact-map .npy files are written here and read back
aln_dir = '/saved'  # Alignment files are read from here

molecular_graphs, protein_graphs, kiba_scores = process_dataset(
    file_path=file_path,
    contact_map_dir=contact_map_dir,
    aln_dir=aln_dir,
)

# Peek at the first drug/target pair
first_molecule_graph = molecular_graphs[0]
print("Molecular graph of first molecule:", first_molecule_graph)
first_protein_graph = protein_graphs[0]
print("Protein graph of first protein:", first_protein_graph)


ModuleNotFoundError: No module named 'pconsc4'

In [13]:
import numpy

# from tensorflow import tensorflow
# import keras
import cython
import pythran
import beniget
import gast
import h5py
import scipy

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [6]:
from pconsc4 import pconsc4

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [None]:
# Keep the object returned by pconsc4.get_pconsc4() in `model`.
# NOTE(review): earlier cells bind the name `pconsc4` in two different ways
# (`import pconsc4` and `from pconsc4 import pconsc4`); confirm that the
# binding active when this cell runs is the one that exposes get_pconsc4().
model = pconsc4.get_pconsc4()