In [8]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, matthews_corrcoef, average_precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline


def generate_fingerprints(smiles_list):
    fps = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
    return np.array(fps)

def evaluate_classifier(true_labels, predictions, probs):
    ba 
    auc = roc_auc_score(true_labels, probs)
    mcc = matthews_corrcoef(true_labels, predictions)
    avg_precision = average_precision_score(true_labels, probs)
    tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()
    spe = tn / (tn + fp)
    sen = tp / (tp + fn)
    ba = (spe + sen)/2
    return {'Held_out_TP': tp, 'Held_out_TN': tn,
            'Held_out_FP': fp, 'Held_out_FN': fn,
            'Held_out_BA': ba,
            'Held_out_AUC': auc, 'Held_out_MCC': mcc, 
            'Held_out_AUCPR': avg_precision, 'Held_out_Specificity': spe,
            'Held_out_Sensitivity': sen}

def fold_error(true_values, predictions):
    ratio = predictions / true_values
    adjusted_ratio = np.where(ratio < 1, 1/ratio, ratio)
    return adjusted_ratio

def evaluate_regression(true_values, predictions):
    rmse = np.sqrt(mean_squared_error(true_values, predictions))
    r2 = np.corrcoef(true_values, predictions)[0, 1] ** 2
    ratio = predictions / true_values
    avg_fold_error = np.mean(fold_error(true_values, predictions))

    return {'Held_out_R2': r2, 'Held_out_RMSE': rmse, "Held_out_average_fold_error": avg_fold_error}

# Path where your data is stored
data_path = '../data/processed_splits/'

results = {}

# Assuming PK dataset is regression and others are classification
for dataset in os.listdir(data_path):
    print(dataset)
    
    # Get all the file names for this dataset
    all_files = os.listdir(os.path.join(data_path, dataset))

    # Extract activity names by removing the _train.csv.gz or _test.csv.gz from file names
    activity_names = list(set([f.replace("_train.csv.gz", "").replace("_test.csv.gz", "") for f in all_files]))

    for activity in tqdm(activity_names, desc="Processing activities"):
        print(activity)
        
        train_path = os.path.join(data_path, dataset, f"{activity}_train.csv.gz")
        test_path = os.path.join(data_path, dataset, f"{activity}_test.csv.gz")

        train_df = pd.read_csv(train_path, compression='gzip')
        test_df = pd.read_csv(test_path, compression='gzip')

        X_train = generate_fingerprints(train_df['Standardized_SMILES'])
        X_test = generate_fingerprints(test_df['Standardized_SMILES'])

        y_train = train_df[activity]
        y_test = test_df[activity]

        if dataset == "PK_Lombardo":
            model = RandomForestRegressor(n_jobs=-1)
            model.fit(X_train, y_train)
            predictions_train = model.predict(X_train)
            predictions_test = model.predict(X_test)

            cv_scores = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5, scoring='r2')

            results[activity] = {
                'CV_R2_mean': np.mean(cv_scores),
                'CV_R2_std': np.std(cv_scores),
                **evaluate_regression(y_test, predictions_test)
            }
        else:
            model = RandomForestClassifier(n_jobs=-1)
            model.fit(X_train, y_train)
            predictions_train = model.predict(X_train)
            predictions_test = model.predict(X_test)
            probs_test = model.predict_proba(X_test)[:, 1]

            cv_scores = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5, scoring='roc_auc')

            results[activity] = {
                'CV_AUC_mean': np.mean(cv_scores),
                'CV_AUC_std': np.std(cv_scores),
                **evaluate_classifier(y_test, predictions_test, probs_test)
            }
            
        # Save results at each step
        pd.DataFrame(results).T.to_csv('./structural_model_results.csv')
            
        #break
    #break

# Save results
pd.DataFrame(results).T.to_csv('./structural_model_results.csv')

toxcast
ATG_TGFb_CIS_dn
OT_ER_ERbERb_0480
BSK_LPS_SRB_down
BSK_hDFCGF_EGFR_down
ATG_VDRE_CIS_up
CEETOX_H295R_11DCORT_dn
CEETOX_H295R_ESTRADIOL_up
Tanguay_ZF_120hpf_SNOU_up
ATG_GATA_CIS_dn
BSK_CASM3C_HLADR_down
BSK_CASM3C_uPAR_down
Tanguay_ZF_120hpf_MORT_up
ATG_E_Box_CIS_up
BSK_LPS_MCSF_down
TOX21_ERa_BLA_Antagonist_ch1
ATG_TAL_CIS_up
BSK_KF3CT_IP10_down
TOX21_NFkB_BLA_agonist_ch2
NCCT_QuantiLum_inhib_2_dn
APR_HepG2_MitoMass_72h_dn
BSK_3C_IL8_down
APR_HepG2_StressKinase_24h_up
BSK_KF3CT_TIMP2_down
NVS_NR_bER
BSK_hDFCGF_IP10_down
BSK_hDFCGF_TIMP1_down
ATG_DR4_LXR_CIS_dn
Tanguay_ZF_120hpf_PE_up
CEETOX_H295R_ANDR_dn
TOX21_HSE_BLA_agonist_ch2
Tanguay_ZF_120hpf_JAW_up
TOX21_GR_BLA_Agonist_ch1
APR_HepG2_CellLoss_24h_dn
ATG_PPARd_TRANS_up
NVS_NR_hPPARg
TOX21_VDR_BLA_Agonist_viability
ATG_RXRb_TRANS_up
APR_HepG2_MitoMembPot_72h_dn
TOX21_p53_BLA_p5_ch1
BSK_LPS_MCP1_down
TOX21_AutoFluor_HEPG2_Media_blue
BSK_KF3CT_MMP9_down
ATG_CRE_CIS_up
TOX21_p53_BLA_p2_viability
APR_HepG2_CellCycleArrest_24h_dn

KeyboardInterrupt: 

In [14]:
pd.DataFrame(results).T.to_csv('./structural_model_results.csv')

In [17]:
df = pd.read_csv("structural_model_results.csv")
df

Unnamed: 0.1,Unnamed: 0,CV_AUC_mean,CV_AUC_std,Held_out_TP,Held_out_TN,Held_out_FP,Held_out_FN,Held_out_AUC,Held_out_MCC,Held_out_AUCPR,Held_out_Specificity,Held_out_Sensitivity
0,ATG_TGFb_CIS_dn,0.665075,0.160983,0.0,137.0,0.0,5.0,0.468613,0.000000,0.036171,1.000000,0.000000
1,OT_ER_ERbERb_0480,0.756698,0.080058,4.0,72.0,0.0,9.0,0.948184,0.522976,0.774826,1.000000,0.307692
2,BSK_LPS_SRB_down,0.676473,0.049920,3.0,46.0,6.0,16.0,0.655870,0.056566,0.396557,0.884615,0.157895
3,BSK_hDFCGF_EGFR_down,0.521239,0.096812,0.0,64.0,2.0,5.0,0.531818,-0.046860,0.113358,0.969697,0.000000
4,ATG_VDRE_CIS_up,0.727780,0.045614,13.0,88.0,10.0,31.0,0.687152,0.242767,0.465068,0.897959,0.295455
...,...,...,...,...,...,...,...,...,...,...,...,...
252,TOX21_MMP_ratio_down,0.772067,0.061959,8.0,158.0,2.0,32.0,0.856641,0.344124,0.627194,0.987500,0.200000
253,CEETOX_H295R_OHPREG_up,0.866917,0.139270,1.0,18.0,1.0,5.0,0.684211,0.179520,0.457388,0.947368,0.166667
254,NVS_NR_hER,0.706179,0.089204,6.0,43.0,2.0,12.0,0.780247,0.391965,0.621444,0.955556,0.333333
255,ATG_ERE_CIS_dn,0.637666,0.072744,0.0,131.0,0.0,11.0,0.523942,0.000000,0.172752,1.000000,0.000000


In [20]:
df["BA"] = (df["Held_out_Specificity"]+df["Held_out_Sensitivity"])/2

In [23]:
df.sort_values("BA")

Unnamed: 0.1,Unnamed: 0,CV_AUC_mean,CV_AUC_std,Held_out_TP,Held_out_TN,Held_out_FP,Held_out_FN,Held_out_AUC,Held_out_MCC,Held_out_AUCPR,Held_out_Specificity,Held_out_Sensitivity,BA
155,NCCT_QuantiLum_inhib_dn,0.722778,0.126442,6.0,3.0,6.0,6.0,0.324074,-0.166667,0.476993,0.333333,0.500000,0.416667
55,NVS_NR_hGR,0.447628,0.159549,15.0,0.0,5.0,1.0,0.618750,-0.125000,0.823696,0.000000,0.937500,0.468750
78,APR_HepG2_MitoMass_24h_dn,0.529304,0.101934,0.0,46.0,3.0,9.0,0.664399,-0.100093,0.242604,0.938776,0.000000,0.469388
91,OT_NURR1_NURR1RXRa_0480,0.591394,0.122119,0.0,71.0,3.0,12.0,0.746622,-0.076559,0.268119,0.959459,0.000000,0.479730
28,CEETOX_H295R_ANDR_dn,0.596118,0.107801,1.0,14.0,2.0,10.0,0.329545,-0.053300,0.349558,0.875000,0.090909,0.482955
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,OT_ER_ERaERa_1440,0.798622,0.105112,3.0,72.0,0.0,5.0,0.967014,0.592157,0.829657,1.000000,0.375000,0.687500
82,ATG_NRF2_ARE_CIS_up,0.694991,0.013168,30.0,74.0,12.0,26.0,0.769830,0.424239,0.626114,0.860465,0.535714,0.698090
178,TOX21_GR_BLA_Agonist_ch2,0.808838,0.058552,10.0,468.0,1.0,14.0,0.755730,0.603994,0.517021,0.997868,0.416667,0.707267
211,ATG_PXRE_CIS_up,0.718595,0.039401,43.0,64.0,9.0,26.0,0.840679,0.518617,0.843551,0.876712,0.623188,0.749950


In [24]:
df[df["Unnamed: 0"]=="ATG_C_EBP_CIS_up"]


Unnamed: 0.1,Unnamed: 0,CV_AUC_mean,CV_AUC_std,Held_out_TP,Held_out_TN,Held_out_FP,Held_out_FN,Held_out_AUC,Held_out_MCC,Held_out_AUCPR,Held_out_Specificity,Held_out_Sensitivity,BA
136,ATG_C_EBP_CIS_up,0.718569,0.087868,1.0,128.0,1.0,12.0,0.5322,0.169283,0.150225,0.992248,0.076923,0.534586
