In [15]:
#!/usr/bin/env python
# coding: utf-8

import os
import sys
sys.path.append('/home/ss2686/03_DICTrank')

from scripts.stratified_split_helper import stratified_data_split

import pandas as pd
import pickle

# --- Load the pickled inputs used by the splitting step ---------------------
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# these paths at files produced by this project's own pipeline.

# Mapping consulted by process_datasets via `.get(dataset_name, [])` to find
# the activity columns of each dataset.
pickle_file_path = '../01_standardise_datasets/activity_columns_mapping_selected.pkl'

# SMILES reserved as the external test set: rows whose 'Standardized_SMILES'
# appears in this collection are held out for testing in process_datasets.
file_ext_DICT_path_pkl = "smiles_list_ext_DICT_test.pkl"

with open(file_ext_DICT_path_pkl, "rb") as file:
    smiles_list_ext_DICT = pickle.load(file)

with open(pickle_file_path, 'rb') as file:
    activity_columns_mapping = pickle.load(file)
    
# Datasets whose test set is the fixed external SMILES list
# (`smiles_list_ext_DICT`) instead of a stratified split.
_EXT_TEST_DATASETS = ("cardiotox_with_sider_inactives", "cardiotox_with_sider_actives",
                      "cardiotox_with_sider_all", "DICTrank")


def _load_binarised_datasets(directory):
    """Read `<directory>/<name>/<name>_binarised.csv.gz` for every non-hidden entry of `directory`."""
    datasets = {}
    for foldername in os.listdir(directory):
        if foldername.startswith('.'):  # Ignore entries starting with a dot
            continue
        file_path = os.path.join(directory, foldername, f"{foldername}_binarised.csv.gz")
        if os.path.exists(file_path):
            datasets[foldername] = pd.read_csv(file_path, compression='gzip')
        else:
            print(f"No matching file found for folder: {foldername}")
    return datasets


def _split_dataset(name, df, activity_cols):
    """Build `{activity column: {'train': DataFrame, 'test': DataFrame}}` for one dataset."""
    dataset_splits = {}
    for col in activity_cols:
        if name in _EXT_TEST_DATASETS and smiles_list_ext_DICT:
            print("Using ext smiles 65 tox and 25 nontox for this dataset ", name)
            # Compute the membership mask once and reuse it for both halves.
            in_ext = df['Standardized_SMILES'].isin(smiles_list_ext_DICT)
            test_df = df[in_ext]
            print(test_df[col].value_counts())
            train_df = df[~in_ext]
        else:
            print("Using 10% train test stratified split for this dataset ", name)
            train_df, test_df = stratified_data_split(df, activity_col=col, dataset_name=name)
            print(test_df[col].value_counts())

        # Only the structure identifiers and the current label are saved.
        cols_to_keep = ['Standardized_SMILES', 'Standardized_InChI', col]
        dataset_splits[col] = {'train': train_df[cols_to_keep], 'test': test_df[cols_to_keep]}
    return dataset_splits


def _save_splits(splits, output_dir):
    """Write each split to `<output_dir>/<dataset>/<activity>_<train|test>.csv.gz`."""
    os.makedirs(output_dir, exist_ok=True)
    for dataset_name, activities in splits.items():
        print(dataset_name)
        if not activities:
            # No activity columns were mapped for this dataset: nothing to
            # write, and (as before) no dataset directory is created.
            continue
        dataset_path = os.path.join(output_dir, dataset_name)
        os.makedirs(dataset_path, exist_ok=True)
        for activity, split_data in activities.items():
            for data_type, split_df in split_data.items():
                output_path = os.path.join(dataset_path, f"{activity}_{data_type}.csv.gz")
                split_df.to_csv(output_path, index=False, compression='gzip')


def process_datasets(directory='../data/binarised/',
                     output_dir="../data/processed_binarised__splits/"):
    """Split every binarised dataset into train/test sets and save them as gzipped CSVs.

    For each non-hidden entry `<name>` of `directory`, the file
    `<name>/<name>_binarised.csv.gz` is loaded if it exists. The activity
    columns of a dataset are looked up in the module-level
    `activity_columns_mapping`; datasets without an entry produce no files.

    For the datasets listed in `_EXT_TEST_DATASETS` (when the module-level
    `smiles_list_ext_DICT` is non-empty) the test set is every row whose
    'Standardized_SMILES' is in that list and the train set is the rest.
    All other datasets are split by `stratified_data_split`.

    Parameters
    ----------
    directory : str
        Folder containing one sub-folder per dataset.
    output_dir : str
        Folder the splits are written to; created if missing. Defaults to the
        location this function has always used.

    Returns
    -------
    None
        Results are written to disk; progress and class counts are printed.

    Raises
    ------
    KeyError
        If a dataset lacks 'Standardized_SMILES', 'Standardized_InChI' or one
        of its mapped activity columns.
    """
    datasets = _load_binarised_datasets(directory)

    # Split for each dataset and each activity column
    splits = {
        name: _split_dataset(name, df, activity_columns_mapping.get(name, []))
        for name, df in datasets.items()
    }

    _save_splits(splits, output_dir)

# Entry point: run the split-and-save pipeline with its default paths.
if __name__ == "__main__":
    process_datasets()

Using ext smiles 65 tox and 25 nontox for this dataset  cardiotox_with_sider_inactives
1    65
0    25
Name: Cardiotox (with SIDER inactives), dtype: int64
Using ext smiles 65 tox and 25 nontox for this dataset  cardiotox_with_sider_actives
1    65
0    25
Name: Cardiotox (with SIDER actives), dtype: int64
Using ext smiles 65 tox and 25 nontox for this dataset  cardiotox_with_sider_all
1    65
0    25
Name: Cardiotox (with SIDER all), dtype: int64
Using 10% train test stratified split for this dataset  sider_cardiacdisorders
1    93
0    40
Name: Cardiac disorders, dtype: int64
Using ext smiles 65 tox and 25 nontox for this dataset  DICTrank
1    65
0    25
Name: DICTrank, dtype: int64
cardiotox_with_sider_inactives
sider
cardiotox_with_sider_actives
cardiotox_with_sider_all
sider_cardiacdisorders
DICTrank


In [16]:
from tqdm import tqdm
# Sanity check: reload every saved split and print its size and class balance.
data_path = '../data/processed_binarised__splits/'
TRAIN_SUFFIX = "_train.csv.gz"
TEST_SUFFIX = "_test.csv.gz"

# os.listdir returns entries in arbitrary order; sort so the report is reproducible.
for dataset in sorted(os.listdir(data_path)):
    dataset_dir = os.path.join(data_path, dataset)

    # Skip hidden entries such as .ipynb_checkpoints, and stray files that are
    # not dataset folders.
    if dataset.startswith('.') or not os.path.isdir(dataset_dir):
        continue
    print(dataset)

    # Recover activity names by stripping the split suffix from the END of each
    # file name; files that carry neither suffix are ignored.
    activity_names = set()
    for file_name in os.listdir(dataset_dir):
        if file_name.startswith('.'):
            continue
        for suffix in (TRAIN_SUFFIX, TEST_SUFFIX):
            if file_name.endswith(suffix):
                activity_names.add(file_name[:-len(suffix)])
    activity_names = sorted(activity_names)

    print(activity_names)

    for activity in tqdm(activity_names, desc="Processing activities"):

        train_path = os.path.join(dataset_dir, f"{activity}{TRAIN_SUFFIX}")
        test_path = os.path.join(dataset_dir, f"{activity}{TEST_SUFFIX}")

        train_df = pd.read_csv(train_path, compression='gzip')
        test_df = pd.read_csv(test_path, compression='gzip')

        # Row count followed by label distribution, train first, then test.
        print(len(train_df))
        print(train_df[activity].value_counts())
        print(len(test_df))
        print(test_df[activity].value_counts())


cardiotox_with_sider_inactives
['Cardiotox (with SIDER inactives)']


Processing activities: 100%|█████████████████████| 1/1 [00:00<00:00, 135.57it/s]


1163
1    677
0    486
Name: Cardiotox (with SIDER inactives), dtype: int64
90
1    65
0    25
Name: Cardiotox (with SIDER inactives), dtype: int64
cardiotox_with_sider_actives
['Cardiotox (with SIDER actives)']


Processing activities: 100%|█████████████████████| 1/1 [00:00<00:00, 129.01it/s]


1243
1    990
0    253
Name: Cardiotox (with SIDER actives), dtype: int64
90
1    65
0    25
Name: Cardiotox (with SIDER actives), dtype: int64
cardiotox_with_sider_all
['Cardiotox (with SIDER all)']


Processing activities: 100%|█████████████████████| 1/1 [00:00<00:00, 119.09it/s]


1476
1    990
0    486
Name: Cardiotox (with SIDER all), dtype: int64
90
1    65
0    25
Name: Cardiotox (with SIDER all), dtype: int64
sider_cardiacdisorders
['Cardiac disorders']


Processing activities: 100%|█████████████████████| 1/1 [00:00<00:00, 133.64it/s]


1189
1    829
0    360
Name: Cardiac disorders, dtype: int64
133
1    93
0    40
Name: Cardiac disorders, dtype: int64
DICTrank
['DICTrank']


Processing activities: 100%|█████████████████████| 1/1 [00:00<00:00, 130.34it/s]

930
1    677
0    253
Name: DICTrank, dtype: int64
90
1    65
0    25
Name: DICTrank, dtype: int64



