In [88]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MolFromSmiles

def load_and_preprocess_dataset(file_path):
    """
    Read a KIBA-style CSV and return its de-duplicated drug/target interactions.

    Only three columns are used: the compound SMILES, the target protein
    sequence and the KIBA score. Rows repeating an already-seen
    (SMILES, sequence) pair are dropped with ``drop_duplicates`` at its
    default settings, so the first occurrence's score is the one kept.

    Parameters:
    - file_path: Path (or any object ``pd.read_csv`` accepts) of the CSV file.

    Returns a 5-tuple of NumPy arrays:
    - SMILES of every unique pair,
    - sequence of every unique pair (aligned with the SMILES array),
    - KIBA score of every unique pair (aligned with the two arrays above),
    - the distinct SMILES strings,
    - the distinct protein sequences.

    Raises ``KeyError`` (from pandas) if any of the three columns is missing.
    """
    smiles_col = 'compound_iso_smiles'
    sequence_col = 'target_sequence'
    score_col = 'Ki , Kd and IC50  (KIBA Score)'

    # Keep only the columns this pipeline needs.
    frame = pd.read_csv(file_path)[[smiles_col, sequence_col, score_col]]

    # One row per (compound, target) pair.
    pairs = frame.drop_duplicates(subset=[smiles_col, sequence_col])

    return (
        pairs[smiles_col].values,
        pairs[sequence_col].values,
        pairs[score_col].values,
        frame[smiles_col].unique(),
        frame[sequence_col].unique(),
    )

# Example usage
# 'kiba.csv' is a relative path, so it is resolved against the kernel's working directory.
file_path = 'kiba.csv'
# smiles / sequences / kiba_scores are aligned per de-duplicated pair;
# unique_smiles is reused by a later cell as the input to process_data.
smiles, sequences, kiba_scores, unique_smiles, unique_sequences = load_and_preprocess_dataset(file_path)

# Quick sanity summary of the dataset size.
print("Number of unique SMILES strings:", len(unique_smiles))
print("Number of unique protein sequences:", len(unique_sequences))
print("Total unique interactions:", len(kiba_scores))




Number of unique SMILES strings: 2068
Number of unique protein sequences: 229
Total unique interactions: 117657


In [56]:
import numpy as np
import networkx as nx
from rdkit import Chem
import torch
from torch_geometric.data import Data

def get_unique_atoms(smiles_list):
    """
    Collect the distinct atom symbols occurring in a collection of SMILES strings.

    Each string is parsed with ``Chem.MolFromSmiles``. Strings that fail to
    parse (the parser returns ``None``) are reported on stdout and skipped.

    Parameters:
    - smiles_list: Iterable of SMILES strings.

    Returns:
    - A sorted list of the atom symbols seen in the parseable molecules.
    """
    symbols = set()
    for smiles in smiles_list:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is not None:
            symbols.update(atom.GetSymbol() for atom in parsed.GetAtoms())
        else:
            print(f"Invalid SMILES string: {smiles}")
    return sorted(symbols)

def smiles_to_graph(smiles):
    """
    Build an undirected NetworkX graph for the molecule described by ``smiles``.

    Nodes are keyed by atom index and carry ``symbol`` and ``atomic_num``
    attributes. Every atom gets a self-loop (``bond_type='self'``) and every
    bond becomes an edge whose ``bond_type`` is the value returned by the
    bond's ``GetBondType()``.

    The self-loop is added per atom rather than per bond end-point, so atoms
    without any bond (a single-atom molecule, or an isolated fragment in a
    dotted SMILES) also receive one instead of ending up with no edge at all.

    Parameters:
    - smiles: SMILES string to parse with ``Chem.MolFromSmiles``.

    Returns:
    - The ``nx.Graph`` for the molecule, or ``None`` if the string cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    graph = nx.Graph()
    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        graph.add_node(idx, symbol=atom.GetSymbol(), atomic_num=atom.GetAtomicNum())
        # Self-loop for every atom, including those that take part in no bond.
        graph.add_edge(idx, idx, bond_type='self')

    for bond in mol.GetBonds():
        graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond_type=bond.GetBondType())

    return graph

def create_adjacency_matrix(graph):
    """
    Return the adjacency matrix of ``graph``.

    This is a thin wrapper: the result is whatever ``nx.adjacency_matrix``
    produces for the graph, with no nodelist or weight options supplied.
    """
    adjacency = nx.adjacency_matrix(graph)
    return adjacency

def _one_hot(position, size):
    """Return a length-``size`` list of 0/1 with a 1 at ``position`` (all zeros if out of range)."""
    encoding = [0] * size
    if 0 <= position < size:
        encoding[position] = 1
    return encoding


def create_node_features(mol, unique_atoms):
    """
    Encode every atom of ``mol`` as a fixed-width 0/1 feature vector.

    Per-atom layout, in order:
    - element one-hot over ``unique_atoms`` (all zeros for an unlisted symbol),
    - degree one-hot with 11 slots (values 0-10; all zeros above that),
    - total hydrogen count one-hot with 11 slots,
    - implicit hydrogen count one-hot with 11 slots,
    - one aromaticity flag.

    Parameters:
    - mol: Molecule object exposing ``GetAtoms()``, whose atoms provide
      ``GetSymbol``, ``GetDegree``, ``GetTotalNumHs``, ``GetNumImplicitHs`` and
      ``GetIsAromatic`` (presumably an RDKit ``Mol``).
    - unique_atoms: Sequence of atom symbols defining the element one-hot order.

    Returns:
    - Integer ``np.ndarray`` of shape ``(num_atoms, len(unique_atoms) + 34)``.
      A molecule without atoms yields an empty matrix that still has the full
      feature width, so downstream code always receives a 2-D array.
    """
    count_bins = 11  # slots used for each of degree / total H / implicit H
    num_elements = len(unique_atoms)
    feature_width = num_elements + 3 * count_bins + 1

    # Symbol -> column, resolved once; setdefault keeps the first position of a
    # repeated symbol, matching list.index().
    element_column = {}
    for column, symbol in enumerate(unique_atoms):
        element_column.setdefault(symbol, column)

    atom_features = []
    for atom in mol.GetAtoms():
        atom_features.append(
            _one_hot(element_column.get(atom.GetSymbol(), -1), num_elements)
            + _one_hot(atom.GetDegree(), count_bins)
            + _one_hot(atom.GetTotalNumHs(), count_bins)
            + _one_hot(atom.GetNumImplicitHs(), count_bins)
            + [1 if atom.GetIsAromatic() else 0]
        )

    # The reshape is a no-op for non-empty input and pins the width when there are no atoms.
    return np.array(atom_features, dtype=int).reshape(-1, feature_width)

def process_data(smiles_list):
    """
    Convert SMILES strings into graph ``Data`` objects.

    The atom vocabulary is derived once from the whole input via
    ``get_unique_atoms`` and then used for the node features of every molecule.
    Each molecule yields a ``Data`` object with:
    - ``x``: float tensor of per-atom features from ``create_node_features``,
    - ``edge_index``: long tensor of shape ``(2, E)`` holding the row/column
      indices of the non-zero adjacency entries,
    - ``unique_atoms``: the shared atom vocabulary (the same list object on
      every item).

    Parameters:
    - smiles_list: Iterable of SMILES strings. It is traversed twice, so it
      must not be a one-shot iterator.

    Returns:
    - List of ``Data`` objects. SMILES that ``Chem.MolFromSmiles`` cannot parse
      are skipped silently here, so the result can be shorter than
      ``smiles_list`` and positions do not necessarily line up with the input.
    """
    unique_atoms = get_unique_atoms(smiles_list)  # Get unique atoms dynamically
    data_list = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        graph = smiles_to_graph(smiles)
        x = torch.tensor(create_node_features(mol, unique_atoms), dtype=torch.float)
        adj_matrix = create_adjacency_matrix(graph)
        # nonzero() returns a (rows, cols) tuple of arrays. Stack it into one
        # (2, E) ndarray first: handing torch.tensor a sequence of ndarrays is
        # the slow conversion path and triggers a UserWarning.
        edge_index = torch.tensor(np.array(adj_matrix.nonzero()), dtype=torch.long)
        data = Data(x=x, edge_index=edge_index)
        data.unique_atoms = unique_atoms  # Store unique atoms in the Data object
        data_list.append(data)  # Store the Data object directly
    return data_list

# Example usage:
# unique_smiles is produced by the dataset-loading cell, which must have been run first.
smiles_list = unique_smiles  # Assuming unique_smiles is defined

# One Data object per parseable SMILES (process_data skips strings RDKit rejects).
molecular_data = process_data(smiles_list)


  return nx.adjacency_matrix(graph)


In [86]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

def _first_hot(bits):
    """Return the index of the first 1 in ``bits``, or "Unknown" when no entry is 1."""
    return bits.index(1) if 1 in bits else "Unknown"


def plot_molecular_graph_with_features(data, filename="molecular_graph.png"):
    """
    Plot a molecular graph along with its adjacency matrix, displaying detailed node features.

    Node features are decoded using the layout
    ``[element one-hot | degree (11) | total H (11) | implicit H (11) | aromatic (1)]``.
    The width of the element section is derived from the feature width
    (``width - 34``) instead of being fixed, so vectors built from any atom
    vocabulary are decoded; a 78-wide vector is still read with the 44-element
    layout. When ``data`` carries a ``unique_atoms`` list of matching length,
    the element is shown as its symbol, otherwise as its one-hot index.

    All nodes ``0 .. num_nodes - 1`` are added to the graph before the edges
    and the adjacency matrix is requested in that order, so matrix rows and
    columns match the tick labels and nodes without edges are still drawn.

    Parameters:
    - data: A Data object containing graph data (nodes and edges) and features.
      ``data.x`` and ``data.edge_index`` must provide ``.numpy()``.
    - filename: The filename to save the plot if using a non-GUI backend.

    Side effects: saves the figure to ``filename``, prints a confirmation line
    and calls ``plt.show()``. The spring layout is not seeded, so node
    positions differ between runs.
    """
    count_bins = 11                      # slots per degree / total H / implicit H section
    trailing_width = 3 * count_bins + 1  # everything after the element one-hot

    edge_index = data.edge_index.numpy()
    node_features = data.x.numpy()
    num_nodes = node_features.shape[0]
    feature_width = node_features.shape[1] if node_features.ndim == 2 else 0
    num_elements = feature_width - trailing_width

    # Use atom symbols only when the stored vocabulary matches the decoded width.
    symbols = getattr(data, "unique_atoms", None)
    if symbols is not None and len(symbols) != num_elements:
        symbols = None

    # Create a NetworkX graph; adding nodes first fixes their order and keeps
    # nodes that have no edge.
    node_order = list(range(num_nodes))
    graph = nx.Graph()
    graph.add_nodes_from(node_order)
    for start, end in edge_index.T:
        graph.add_edge(int(start), int(end))

    # Get node features to use as labels
    labels = {}
    for i in node_order:
        row = node_features[i].tolist()

        element = "Unknown"
        degree = "Unknown"
        total_h = "Unknown"
        implicit_h = "Unknown"
        aromatic = "Unknown"

        if num_elements >= 0:
            degree_start = num_elements
            total_h_start = degree_start + count_bins
            implicit_h_start = total_h_start + count_bins
            aromatic_pos = implicit_h_start + count_bins

            element = _first_hot(row[:num_elements])
            if symbols is not None and element != "Unknown":
                element = symbols[element]
            degree = _first_hot(row[degree_start:total_h_start])
            total_h = _first_hot(row[total_h_start:implicit_h_start])
            implicit_h = _first_hot(row[implicit_h_start:aromatic_pos])
            aromatic = row[aromatic_pos]

        labels[i] = f"Node {i + 1}\n" + f"Element: {element}\n" + \
                    f"Degree: {degree}\n" + \
                    f"Total H: {total_h}\n" + \
                    f"Implicit H: {implicit_h}\n" + \
                    f"Aromatic: {aromatic}"

    # Plot the adjacency matrix and the molecular graph
    plt.figure(figsize=(12, 6))

    # Plot adjacency matrix (rows/columns in node index order)
    ax1 = plt.subplot(1, 2, 1)
    adj_matrix = nx.adjacency_matrix(graph, nodelist=node_order).toarray()
    ax1.imshow(adj_matrix, cmap='RdYlBu_r', interpolation='nearest')
    ax1.set_title('Adjacency Matrix with Node Symbols')
    ax1.set_xticks(np.arange(num_nodes))
    ax1.set_yticks(np.arange(num_nodes))
    ax1.set_xticklabels(node_order, rotation=90)
    ax1.set_yticklabels(node_order)
    for i in node_order:
        for j in node_order:
            ax1.text(j, i, str(adj_matrix[i, j]), ha='center', va='center', color='black')

    # Plot molecular graph (nx.draw already renders the edges)
    ax2 = plt.subplot(1, 2, 2)
    pos = nx.spring_layout(graph)
    nx.draw(graph, pos=pos, with_labels=True, labels=labels, ax=ax2, node_size=700, node_color='skyblue', font_size=8)
    ax2.set_title('Molecular Graph')

    plt.tight_layout()

    # Save the plot to a file
    plt.savefig(filename)
    print(f"Plot saved as {filename}")

    # If running in a non-GUI backend, this won't display the plot, but saves the image.
    plt.show()

# Example usage
# Plots the first molecule built by process_data; requires molecular_data from the cell above.
plot_molecular_graph_with_features(molecular_data[0], filename="molecular_graph.png")




Plot saved as molecular_graph.png


  plt.show()
