In [1]:
import pandas as pd

def load_dataset(file_path):
    """
    Read the KIBA CSV and hand back its four columns of interest.

    The CSV must contain the columns 'compound_iso_smiles', 'target_sequence',
    'ProteinID' and 'Ki , Kd and IC50  (KIBA Score)'; a missing column raises
    KeyError from the pandas column lookup.

    Parameters
    ----------
    file_path : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(smiles, sequences, protein_ids, kiba_scores)`` -- the ``.values`` of
        the corresponding columns, one entry per CSV row, in file order.
    """
    frame = pd.read_csv(file_path)

    # Listed in the order the caller unpacks the result.
    wanted_columns = (
        'compound_iso_smiles',
        'target_sequence',
        'ProteinID',
        'Ki , Kd and IC50  (KIBA Score)',
    )
    return tuple(frame[column].values for column in wanted_columns)

# Example usage
# The file name is spelled in lower case to match the 'kiba.csv' used by the
# preprocessing cell of this notebook: both cells read the same dataset, and a
# case mismatch raises FileNotFoundError on case-sensitive filesystems.
file_path = 'kiba.csv'  # Replace with the actual path to your KIBA CSV file
smiles, sequences, protein_ids, kiba_scores = load_dataset(file_path)


In [18]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MolFromSmiles

def load_and_preprocess_dataset(file_path):
    """
    Read the KIBA CSV and collapse repeated (SMILES, sequence) pairs.

    Only the columns 'compound_iso_smiles', 'target_sequence' and
    'Ki , Kd and IC50  (KIBA Score)' are used. When the same
    (SMILES, sequence) pair occurs on several rows, the first row is kept
    (pandas ``drop_duplicates`` default) and its score is the one returned.

    Parameters
    ----------
    file_path : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(smiles, sequences, kiba_scores, unique_smiles, unique_sequences)``:
        the first three are aligned, one entry per unique pair; the last two
        are the distinct SMILES strings and distinct protein sequences in
        order of first appearance.
    """
    smiles_col = 'compound_iso_smiles'
    sequence_col = 'target_sequence'
    score_col = 'Ki , Kd and IC50  (KIBA Score)'

    # Narrow the frame to the three columns this function works with.
    table = pd.read_csv(file_path)[[smiles_col, sequence_col, score_col]]

    # One row per distinct compound/target combination.
    pairs = table.drop_duplicates(subset=[smiles_col, sequence_col])

    return (
        pairs[smiles_col].values,
        pairs[sequence_col].values,
        pairs[score_col].values,
        table[smiles_col].unique(),
        table[sequence_col].unique(),
    )

# Example usage: load the deduplicated interactions, then report dataset sizes.
file_path = 'kiba.csv'
(
    smiles,
    sequences,
    kiba_scores,
    unique_smiles,
    unique_sequences,
) = load_and_preprocess_dataset(file_path)

# One summary line per count; the generator keeps helper names out of the
# notebook namespace.
print(
    *(
        f"{label} {count}"
        for label, count in (
            ("Number of unique SMILES strings:", len(unique_smiles)),
            ("Number of unique protein sequences:", len(unique_sequences)),
            ("Total unique interactions:", len(kiba_scores)),
        )
    ),
    sep="\n",
)


Number of unique SMILES strings: 2068
Number of unique protein sequences: 229
Total unique interactions: 117657
