In [26]:
import os
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, matthews_corrcoef, average_precision_score, confusion_matrix

def generate_fingerprints(smiles_list):
    fps = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
    return np.array(fps)

def evaluate_classifier(true_labels, predictions, probs):
    auc = roc_auc_score(true_labels, probs)
    mcc = matthews_corrcoef(true_labels, predictions)
    avg_precision = average_precision_score(true_labels, probs)
    tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()
    spe = tn / (tn + fp)
    sen = tp / (tp + fn)
    return {'AUC': auc, 'MCC': mcc, 'AUCPR': avg_precision, 'Specificity': spe, 'Sensitivity': sen}

def fold_error(true_values, predictions):
    ratio = predictions / true_values
    adjusted_ratio = np.where(ratio < 1, 1/ratio, ratio)
    return adjusted_ratio

def evaluate_regression(true_values, predictions):
    rmse = np.sqrt(mean_squared_error(true_values, predictions))
    r2 = np.corrcoef(true_values, predictions)[0, 1] ** 2
    ratio = predictions / true_values
    avg_fold_error = np.mean(fold_error(true_values, predictions))

    return {'R2': r2, 'RMSE': rmse, "average_fold_error": avg_fold_error}

# Path where your data is stored
data_path = '../data/processed_splits/'

results = {}

# Assuming PK dataset is regression and others are classification
for dataset in os.listdir(data_path):
    for activity in os.listdir(os.path.join(data_path, dataset)):
        train_df = pd.read_csv(os.path.join(data_path, dataset, f"{activity}_train.csv.gz"), compression='gzip')
        test_df = pd.read_csv(os.path.join(data_path, dataset, f"{activity}_test.csv.gz"), compression='gzip')

        X_train = generate_fingerprints(train_df['SMILES'])
        X_test = generate_fingerprints(test_df['SMILES'])

        y_train = train_df[activity]
        y_test = test_df[activity]

        if dataset == "PK_Lombardo":
            model = RandomForestRegressor()
            model.fit(X_train, y_train)
            predictions_train = model.predict(X_train)
            predictions_test = model.predict(X_test)

            cv_scores = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5, scoring='r2')

            results[activity] = {
                'CV_R2_mean': np.mean(cv_scores),
                'CV_R2_std': np.std(cv_scores),
                **evaluate_regression(y_test, predictions_test)
            }
        else:
            model = RandomForestClassifier()
            model.fit(X_train, y_train)
            predictions_train = model.predict(X_train)
            predictions_test = model.predict(X_test)
            probs_test = model.predict_proba(X_test)[:, 1]

            cv_scores = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5, scoring='roc_auc')

            results[activity] = {
                'CV_AUC_mean': np.mean(cv_scores),
                'CV_AUC_std': np.std(cv_scores),
                **evaluate_classifier(y_test, predictions_test, probs_test)
            }
            
        break
    break

# Save results
pd.DataFrame(results).T.to_csv('./structural_model_results.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed_splits/toxcast/BSK_BE3C_tPA_down_test.csv.gz_train.csv.gz'

In [8]:
import numpy as np



In [21]:
predictions  = [2,4,6]

In [22]:
true_values = [1,2,3]

In [23]:
true_values = np.array(true_values)
predictions = np.array(predictions)

In [24]:
avg_fold_error = np.mean(fold_error(true_values, predictions))

In [25]:
avg_fold_error

2.0