In [None]:
#molecule_dict = create_molecule_dict(csv_file_path)

In [30]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm


from scipy.stats import randint, uniform
from pandarallel import pandarallel
from sklearn.neighbors import NearestNeighbors
from scipy.stats import pearsonr
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import ptitprince as pt
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import ttest_ind, ttest_rel
from statannotations.Annotator import Annotator

import math
import sys
sys.path.append('/home/ss2686/JUMPCP')

import argparse
from scripts.evaluation_functions import evaluate_classifier, evaluate_regression, fold_error, optimize_threshold_j_statistic

# Initialize pandarallel for parallel processing
pandarallel.initialize()
pandarallel.initialize(progress_bar=True)

import gzip


data_path = '../data/processed_splits/'
# Define the path to your gzip-compressed image_features.csv.gz file
csv_file_path = '../data/JUMP_features/JUMP_features.csv.gz'


def create_molecule_dict(csv_file_path):
    molecule_dict = {}

    with gzip.open(csv_file_path, 'rt') as f:
        next(f)  # Skip the first line (header)
        for line in f:
            data = line.strip().split(',')
            smiles = data[0]
            features = np.array(data[1:299], dtype=float)
            molecule_dict[smiles] = features

    return molecule_dict

# Call create_molecule_dict once to create the dictionary
molecule_dict = create_molecule_dict(csv_file_path)

# Create a function to calculate Tanimoto similarities and means
def calculate_tanimoto_and_mean(row, combined_df, activity, knn, boolean_fingerprints):
    
    i = row.name
    if combined_df.iloc[i][activity] != 1:
        return None, None
    
    active_active_similarities = []
    active_inactive_similarities = []

    for j, index in enumerate(knn.kneighbors([boolean_fingerprints[i]])[1][0]):
        if i != index:
            similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]

            if combined_df.iloc[index][activity] == 1:
                active_active_similarities.append(similarity)
            elif combined_df.iloc[index][activity] == 0:
                active_inactive_similarities.append(similarity)

    if active_active_similarities:
        mean_active_active = np.median(sorted(active_active_similarities, reverse=True)[:5])
    else:
        mean_active_active = None

    if active_inactive_similarities:
        mean_active_inactive = np.median(sorted(active_inactive_similarities, reverse=True)[:5])
    else:
        mean_active_inactive = None

    return (1-mean_active_active), (1-mean_active_inactive)


from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance


def calculate_eucledian_and_mean(row, combined_df, activity, knn, descriptors):
    
    i = row.name
    if combined_df.iloc[i][activity] != 1:
        return None, None
    
    active_active_correlations = []
    active_inactive_correlations = []

    for j, index in enumerate(knn.kneighbors([descriptors[i]])[1][0]):
        if i != index:
            descriptor1 = descriptors[i]
            descriptor2 = descriptors[index]
            # Reshape data to fit the scaler's expected input
            x = [[i] for i in descriptor1]
            y = [[i] for i in descriptor2]
            # Normalize using Z-score normalization
            scaler = StandardScaler()
            x_normalized = scaler.fit_transform(x)
            y_normalized = scaler.transform(y)  # Use the same scaler to transform y
            # Flatten the data back to 1-dimensional arrays
            x_normalized = x_normalized.flatten()
            y_normalized = y_normalized.flatten()

            if combined_df.iloc[index][activity] == 1:
                # Compute Euclidean distance
                euclidean_dist = distance.euclidean(x_normalized, y_normalized)
                active_active_correlations.append(euclidean_dist)
                
            elif combined_df.iloc[index][activity] == 0:
                euclidean_dist = distance.euclidean(x_normalized, y_normalized)
                active_inactive_correlations.append(euclidean_dist)

    if active_active_correlations:
        mean_active_active = np.median(sorted(active_active_correlations, reverse=True)[:5])
    else:
        mean_active_active = None

    if active_inactive_correlations:
        mean_active_inactive = np.median(sorted(active_inactive_correlations, reverse=True)[:5])
    else:
        mean_active_inactive = None

    return (mean_active_active), (mean_active_inactive)


'''def calculate_pearson_and_mean(row, combined_df, activity, knn, descriptors):
    
    i = row.name
    if combined_df.iloc[i][activity] != 1:
        return None, None
    
    active_active_correlations = []
    active_inactive_correlations = []

    for j, index in enumerate(knn.kneighbors([descriptors[i]])[1][0]):
        if i != index:
            descriptor1 = descriptors[i]
            descriptor2 = descriptors[index]

            if combined_df.iloc[index][activity] == 1:
                correlation, _ = pearsonr(descriptor1, descriptor2)
                active_active_correlations.append(correlation)
            elif combined_df.iloc[index][activity] == 0:
                correlation, _ = pearsonr(descriptor1, descriptor2)
                active_inactive_correlations.append(correlation)

    if active_active_correlations:
        mean_active_active = np.median(sorted(active_active_correlations, reverse=True)[:])
    else:
        mean_active_active = None

    if active_inactive_correlations:
        mean_active_inactive = np.median(sorted(active_inactive_correlations, reverse=True)[:])
    else:
        mean_active_inactive = None

    return mean_active_active, mean_active_inactive
    
'''


def generate_cellpainting(smiles):
    return molecule_dict.get(smiles, np.zeros(298, dtype=float))

def generate_fingerprints(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    
    return np.array(fp)


results_fp = {}
results_cp = {}

results_significance = {}

# Initialize lists to store results
mean_tanimoto_active_active = []
mean_tanimoto_active_inactive = []

mean_eucledian_active_active= []
mean_eucledian_active_inactive= []

data = []

for dataset in os.listdir(data_path):   
    
    if dataset not in results_significance:
        results_significance[dataset] = {}
    
    
    if dataset != "PK_Lombardo":
        print(dataset)
        
        # Get all the file names for this dataset
        all_files = os.listdir(os.path.join(data_path, dataset))

        # Extract activity names by removing the _train.csv.gz or _test.csv.gz from file names
        activity_names = list(set([f.replace("_train.csv.gz", "").replace("_test.csv.gz", "") for f in all_files]))

        for activity in tqdm(activity_names, desc="Processing activities"):

            if activity not in results_significance[dataset]:
                results_significance[dataset][activity] = {}
            print(activity)

            train_path = os.path.join(data_path, dataset, f"{activity}_train.csv.gz")
            test_path = os.path.join(data_path, dataset, f"{activity}_test.csv.gz")

            train_df = pd.read_csv(train_path, compression='gzip')
            test_df = pd.read_csv(test_path, compression='gzip')

            # Combine train and test data
            combined_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
            
            print(len(combined_df))

            #STRUCTURAL
            
            # Generate Morgan fingerprints for the combined data
            fingerprints = combined_df['Standardized_SMILES'].parallel_apply(generate_fingerprints)
            fingerprints = np.array(fingerprints.to_list())

            threshold = 0.5  #Binarisation
            boolean_fingerprints = fingerprints > threshold
            #print("boolean_fingerprints complete")

            # Calculate Tanimoto similarity using Jaccard distance
            knn_fp = NearestNeighbors(n_neighbors=len(combined_df) - 1, metric='jaccard', n_jobs=1)  # Use Jaccard distance for Tanimoto similarity
            knn_fp.fit(boolean_fingerprints)
            #print("knn_fit complete")

            # Initialize lists to store mean similarities
            mean_tanimoto_active_active_activity = []
            mean_tanimoto_active_inactive_activity = []

            def apply_func_calculate_tanimoto_and_mean(row):
                return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)

            # Apply the function to each row of combined_df in parallel
            results_fp = combined_df.parallel_apply(apply_func_calculate_tanimoto_and_mean, axis=1)

            # Separate the results into two lists
            mean_tanimoto_active_active = [result[0] for result in results_fp if result[0] is not None]
            mean_tanimoto_active_inactive = [result[1] for result in results_fp if result[1] is not None]

            # Raincloud plots
            #pal = "Set2"
            #sns.set(rc={'figure.figsize':(10,5), "figure.dpi":200}, font_scale=1)
            #sns.set_style("white")

            df_plot = pd.DataFrame({
                'Category': ['Active vs Active'] * len(mean_tanimoto_active_active) + ['Active vs Inactive'] * len(mean_tanimoto_active_inactive),
                'Mean Tanimoto Distance': mean_tanimoto_active_active + mean_tanimoto_active_inactive
            })

            '''
            pal = "colorblind"
            sns.set_style("white")

            ax=pt.half_violinplot(x = 'Mean Tanimoto Distance', y = 'Category', data = df_plot, palette = pal,
                 bw = .2, cut = 0.,scale = "area", width = .6, 
                 inner = None, orient = 'h')

            ax=sns.stripplot( x = 'Mean Tanimoto Distance', y = 'Category', data = df_plot, palette = pal,
                  edgecolor = "white",size = 3, jitter = 1, zorder = 0,
                  orient = 'h')

            ax=sns.boxplot( x = 'Mean Tanimoto Distance', y = 'Category', data = df_plot, color = "black",
                  width = .15, zorder = 10, showcaps = True,
                  boxprops = {'facecolor':'none', "zorder":10}, showfliers=True,
                  whiskerprops = {'linewidth':2, "zorder":10}, 
                  saturation = 1, orient = 'h')

            # Add significance annotations
            annotator = Annotator(ax, data=df_plot, y='Category', x='Mean Tanimoto Distance',
                                  pairs=[("Active vs Active", "Active vs Inactive")],
                                  order=['Active vs Active', 'Active vs Inactive'],
                                 orient='h')

            annotator.configure(test='t-test_ind', text_format='star', loc='outside')
            annotator.apply_and_annotate()
            '''

            # Extract data for both categories
            active_active_values = df_plot[df_plot['Category'] == 'Active vs Active']['Mean Tanimoto Distance']
            active_inactive_values = df_plot[df_plot['Category'] == 'Active vs Inactive']['Mean Tanimoto Distance']
            
            # Perform t-test
            t_stat, p_value = ttest_ind(active_active_values, active_inactive_values)
            # Print p-value
            results_significance[dataset][activity]['structural'] = {'t-statistic': t_stat, 'p-value': p_value}

            
            # Customize the plot
            #plt.xlabel("Mean Tanimoto Distance")
            #plt.ylabel("")
            #plt.xticks(rotation=0)
            #plt.show()
            
            
            #CELL PAINTING
            
            # Generate Cell Painting descriptors for the combined data
            cp_descriptors = combined_df['Standardized_SMILES'].parallel_apply(generate_cellpainting)
            cp_descriptors = np.array(cp_descriptors.to_list())
            
            # Initialize the K-nearest neighbors model for Eucledian distance
            knn_cp = NearestNeighbors(n_neighbors=len(combined_df) - 1, metric='correlation', n_jobs=1)  # Use Euclidean distance for correlations
            knn_cp.fit(cp_descriptors)  
            
            # Initialize lists to store mean correlations
            mean_eucledian_active_active_activity = []
            mean_eucledian_active_inactive_activity = []
            
            def apply_func_calculate_eucledian_and_mean(row):
                return calculate_eucledian_and_mean(row, combined_df, activity, knn_cp, cp_descriptors)

            # Apply the function to each row of combined_df in parallel
            results_cp = combined_df.parallel_apply(apply_func_calculate_eucledian_and_mean, axis=1)
            
            # Separate the results into two lists
            mean_eucledian_active_active = [result[0] for result in results_cp if result[0] is not None]
            mean_eucledian_active_inactive = [result[1] for result in results_cp if result[1] is not None]
            
            # Raincloud plots

            #print(activity)

            #pal = "Set2"
            #sns.set(rc={'figure.figsize':(10,5), "figure.dpi":200}, font_scale=1)
            #sns.set_style("white")

            df_plot = pd.DataFrame({
                'Category': ['Active vs Active'] * len(mean_eucledian_active_active) + ['Active vs Inactive'] * len(mean_eucledian_active_inactive),
                'Mean Eucledian Distance': mean_eucledian_active_active + mean_eucledian_active_inactive
            })

            '''pal = "colorblind"
            sns.set_style("white")

            ax=pt.half_violinplot( x = 'Mean Eucledian Distance', y = 'Category', data = df_plot, palette = pal,
                 bw = .2, cut = 0.,scale = "area", width = .6, 
                 inner = None, orient = 'h')

            ax=sns.stripplot( x = 'Mean Eucledian Distance', y = 'Category', data = df_plot, palette = pal,
                  edgecolor = "white",size = 3, jitter = 1, zorder = 0,
                  orient = 'h')

            ax=sns.boxplot( x = 'Mean Eucledian Distance', y = 'Category', data = df_plot, color = "black",
                  width = .15, zorder = 10, showcaps = True,
                  boxprops = {'facecolor':'none', "zorder":10}, showfliers=True,
                  whiskerprops = {'linewidth':2, "zorder":10}, 
                  saturation = 1, orient = 'h')

            # Add significance annotations
            annotator = Annotator(ax, data=df_plot, y='Category', x='Mean Eucledian Distance',
                                  pairs=[("Active vs Active", "Active vs Inactive")],
                                  order=['Active vs Active', 'Active vs Inactive'],
                                 orient='h')

            annotator.configure(test='t-test_ind', text_format='star', loc='outside')
            annotator.apply_and_annotate()
            
            '''
            
            # Extract data for both categories
            active_active_values = df_plot[df_plot['Category'] == 'Active vs Active']['Mean Eucledian Distance']
            active_inactive_values = df_plot[df_plot['Category'] == 'Active vs Inactive']['Mean Eucledian Distance']

            # Perform t-test
            t_stat, p_value = ttest_ind(active_active_values, active_inactive_values)
            # Print p-value
            results_significance[dataset][activity]['image'] = {'t-statistic': t_stat, 'p-value': p_value}


            # Customize the plot
            #plt.xlabel("Mean Eucledian Distance")
            #plt.ylabel("")
            #plt.xticks(rotation=0)
            #plt.show()

            # Create a list to hold the rows of the dataframe


            # Iterate through the dictionary to extract the data
            for task, activities in results_significance.items():
                for activity, features in activities.items():
                    for featureset, values in features.items():
                        row = {
                            'dataset': dataset,
                            'activity': activity,
                            'featureset': featureset,
                            't-statistic': values['t-statistic'],
                            'p-value': values['p-value']
                        }
                        data.append(row)

            # Convert the list of rows to a dataframe
            df = pd.DataFrame(data)
            df.to_csv("Plot_comparsions_similarity.csv", index=False)



INFO: Pandarallel will run on 76 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
toxcast


Processing activities:   0%|                            | 0/330 [00:00<?, ?it/s]

TOX21_p53_BLA_p2_ch1


Processing activities:   0%|                  | 1/330 [00:39<3:36:02, 39.40s/it]

CEETOX_H295R_TESTO_dn


Processing activities:   1%|                  | 2/330 [00:42<1:38:56, 18.10s/it]

ATG_PPRE_CIS_up


Processing activities:   1%|▏                 | 3/330 [00:50<1:14:10, 13.61s/it]

TOX21_MMP_ratio_up


Processing activities:   1%|▏                   | 4/330 [00:57<58:12, 10.71s/it]

BSK_CASM3C_MIG_down


Processing activities:   2%|▎                   | 5/330 [01:00<44:16,  8.17s/it]

ATG_Sox_CIS_up


Processing activities:   2%|▎                   | 6/330 [01:06<39:31,  7.32s/it]

TOX21_ARE_BLA_Agonist_ch1


Processing activities:   2%|▍                 | 7/330 [01:31<1:09:58, 13.00s/it]

BSK_4H_uPAR_down


Processing activities:   2%|▍                   | 8/330 [01:35<55:11, 10.29s/it]

Tanguay_ZF_120hpf_MORT_up


Processing activities:   3%|▌                   | 9/330 [01:38<43:26,  8.12s/it]

ATG_STAT3_CIS_dn


Processing activities:   3%|▌                  | 10/330 [01:43<37:37,  7.05s/it]

ATG_Oct_MLP_CIS_up


Processing activities:   3%|▋                  | 11/330 [01:51<39:15,  7.39s/it]

CEETOX_H295R_ANDR_dn


Processing activities:   4%|▋                  | 12/330 [01:54<31:54,  6.02s/it]

OT_ER_ERbERb_1440


Processing activities:   4%|▋                  | 13/330 [01:58<28:52,  5.46s/it]

TOX21_ERa_BLA_Antagonist_ratio


Processing activities:   4%|▋                | 14/330 [03:16<2:24:08, 27.37s/it]

BSK_SAg_Proliferation_down


Processing activities:   5%|▊                | 15/330 [03:20<1:46:55, 20.37s/it]

CEETOX_H295R_OHPROG_up


Processing activities:   5%|▊                | 16/330 [03:24<1:19:16, 15.15s/it]

ATG_BRE_CIS_dn


Processing activities:   5%|▉                | 17/330 [03:28<1:02:28, 11.98s/it]

ATG_RORE_CIS_up


Processing activities:   5%|█                  | 18/330 [03:36<56:00, 10.77s/it]

BSK_SAg_SRB_down


Processing activities:   6%|█                  | 19/330 [03:40<45:25,  8.76s/it]

TOX21_PPARg_BLA_Agonist_ch2


Processing activities:   6%|█                | 20/330 [04:18<1:30:59, 17.61s/it]

OT_AR_ARSRC1_0480


Processing activities:   6%|█                | 21/330 [04:23<1:10:50, 13.76s/it]

BSK_hDFCGF_MMP1_down


Processing activities:   7%|█▎                 | 22/330 [04:28<56:33, 11.02s/it]

ATG_PPARd_TRANS_up


Processing activities:   7%|█▎                 | 23/330 [04:33<47:15,  9.23s/it]

TOX21_ERa_BLA_Antagonist_ch1


Processing activities:   7%|█▏               | 24/330 [05:33<2:04:50, 24.48s/it]

ATG_TA_CIS_up


Processing activities:   8%|█▎               | 25/330 [05:39<1:35:44, 18.83s/it]

BSK_LPS_MCP1_down


Processing activities:   8%|█▎               | 26/330 [05:43<1:13:06, 14.43s/it]

TOX21_Aromatase_Inhibition


Processing activities:   8%|█▎               | 26/330 [05:49<1:08:12, 13.46s/it]Process ForkPoolWorker-9540:
Process ForkPoolWorker-9491:
Process ForkPoolWorker-9500:
Process ForkPoolWorker-9535:
Process ForkPoolWorker-9531:
Process ForkPoolWorker-9541:
Process ForkPoolWorker-9557:
Process ForkPoolWorker-9554:



KeyboardInterrupt: 

Process ForkPoolWorker-9530:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-9489:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
Traceback (most recent call last):
Process ForkPoolWorker-9520:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 108, in run
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  File "

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 851, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 851, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 851, in apply_standard
    results, res_ind

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
Process ForkPoolWorker-9507:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
Process ForkPoolWorker-9499:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/n

Process ForkPoolWorker-9515:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 108, in run
Process ForkPoolWorker-9536:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 125, in worker
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
Process ForkPoolWorker-9539:
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerp

Process ForkPoolWorker-9544:
Traceback (most recent call last):
Process ForkPoolWorker-9483:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
Process ForkPoolWorker-9492:
Process ForkPoolWorker-9549:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 125, in worker
Process ForkPoolWorker-9496:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 108, in run
  File "/home/ss2686/mini

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/scipy/spatial/distance.py", line 2947, in cdist
    return cdist_fn(XA, XB, out=out, **kwargs)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return dat

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
KeyboardInterrupt
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 125, in worker
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
Process ForkPoolWorker-9551:
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833

Traceback (most recent call last):
  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 108, in run
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1989, in pairwise_distances
    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 125, in worker
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 125, in worker
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  Fi

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1530, in _parallel_pairwise
    return func(X, Y, **kwds)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 108, in run
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 851, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
  File "/home/s

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 125, in worker
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 867, in apply_series_generator
    results[i] = self.f(v)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return ca

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][

  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_stand

  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
Traceback (most recent call last):
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/data_types/dataframe.py", line 35, in work
    return data.apply(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]
 

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 851, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/generic.py", line 3943, in _set_is_copy
    self._is_copy = weakref.ref(ref)
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1530, in _parallel_pairwise
    return func(X, Y, **kwds)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1989, in pairwise_distances
    return _parallel_pai

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 727, in apply
    return self.apply_standard()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/frame.py", line 8833, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 851, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandas/core/apply.py", line 867, in apply_series_generator
    results[i] = self.f(v)
  File "/

  File "/tmp/ipykernel_2666452/3804751970.py", line 65, in calculate_tanimoto_and_mean
    similarity = 1 - knn.kneighbors([boolean_fingerprints[i]])[0][0][j]
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/scipy/spatial/distance.py", line 2947, in cdist
    return cdist_fn(XA, XB, out=out, **kwargs)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/scipy/spatial/distance.py", line 1676, in __call__
    cdist_fn(XA, XB, dm, **kwargs)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/scipy/spatial/distance.py", line 2947, in cdist
    return cdist_fn(XA, XB, out=out, **kwargs)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1817, in pairwise_distances_chunked
    D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairw

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/scipy/spatial/distance.py", line 2947, in cdist
    return cdist_fn(XA, XB, out=out, **kwargs)
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/multiprocessing/pool.py", line 51, in starmapstar
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1817, in pairwise_distances_chunked
    D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/metrics/pairwise.py", line 1817, in pair

  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/tmp/ipykernel_2666452/3804751970.py", line 245, in apply_func_calculate_tanimoto_and_mean
    return calculate_tanimoto_and_mean(row, combined_df, activity, knn_fp, boolean_fingerprints)
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
  File "/home/ss2686/miniconda3/envs/my-rdkit-env/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 796, in kneighbors
    chunked_results = list(
  File "/home/ss2686/min

In [None]:
combined_df

In [None]:
def apply_func_calculate_eucledian_and_mean(row):
                return calculate_eucledian_and_mean(row, combined_df, activity, knn_cp, cp_descriptors)

# Apply the function to each row of combined_df in parallel
results_fp = combined_df.parallel_apply(apply_func_calculate_eucledian_and_mean, axis=1)