In [None]:
# Quick test: How long does it take to spawn workers?
from multiprocessing import Pool
import time

def dummy_task(x):
    """Return ``x * 2``.

    Deliberately trivial: it is mapped over a pool only so that the measured
    time is dominated by worker start-up rather than by the work itself.
    """
    doubled = x * 2
    return doubled

# Time how long it takes to create a 4-worker pool and run one trivial task per worker.
# NOTE(review): under the "spawn" start method (the default on Windows) worker
# processes must be able to import the mapped function; a function defined in a
# notebook cell may not be importable by them, in which case this call can
# block instead of returning — confirm on the target platform.
print("Testing worker spawn time...")
start = time.perf_counter()  # monotonic clock: unaffected by system clock adjustments
with Pool(4) as p:
    p.map(dummy_task, range(4))
end = time.perf_counter()
print(f"Time to spawn 4 workers with dummy task: {end-start:.2f} seconds")
print("If this takes >30 seconds, the issue is Python/conda environment, not your code")


Testing worker spawn time...


In [None]:
from multiprocessing import Pool
import sys
import os
from datetime import datetime

# Number of worker processes passed to Pool() in the main-execution block of this cell.
# (The name is spelled "paralell" and is referenced with that spelling below.)
paralell_level = 4

def create_random_medium_cobra(expname: str) -> str:
    """
    Build and save the COBRA training set for one experiment.

    Every setting is hardcoded so that the experiment name is the only
    argument sent to a worker. The steps performed are:

    1. Read ``df_amn_dataset.csv`` for ``expname`` and build a ``TrainingSet``
       with ``method='EXP'`` to obtain the experimental matrix ``X``.
    2. Build a second ``TrainingSet`` from ``df_amn_dataset_levels`` with
       ``method='pFBA'``.
    3. For each row of ``X``, collect the medium entries whose level is > 1
       and whose value in that row is > 0, then call ``parameter.get`` with
       that list.
    4. Save the result to ``./Dataset_model/<expname>_<mediumbound>``.

    Progress and errors are written to ``./logs/<expname>_<timestamp>.log``.

    Parameters
    ----------
    expname : str
        Name of the experiment; used as a sub-directory of the hardcoded
        experimental data root and as the prefix of the output and log files.

    Returns
    -------
    str
        ``"<expname>: SUCCESS"`` on completion, otherwise
        ``"<expname>: FAILED - <error message>"``. Any ``Exception`` raised
        after the log file is open (including a failed import of the heavy
        libraries) is logged with its traceback and reported this way rather
        than propagated, so one bad experiment does not abort a ``Pool.map``.

    Raises
    ------
    OSError
        If the log directory or log file cannot be created; this happens
        before the error handler is active.
    """
    # Hardcoded parameters
    cobraname = 'iML1515_duplicated_Lab_Data'
    mediumname = 'df_amn_dataset_levels'
    mediumbound = 'UB'
    exp_df_name = 'df_amn_dataset'
    method = 'pFBA'
    size_i = 100
    reduce = True
    verbose = True
    DIRECTORY = './'

    # Setup logging: one file per call; the microsecond timestamp keeps names distinct
    log_dir = './logs'
    os.makedirs(log_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    log_file = f'{log_dir}/{expname}_{timestamp}.log'

    # The context manager guarantees the log file is closed on every exit path.
    with open(log_file, 'w', buffering=1) as log_f:

        def log(message):
            log_f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {message}\n")
            log_f.flush()

        try:
            log(f"Starting processing for {expname}")

            # Heavy imports are done here rather than at notebook level. The
            # statements run on every call, but Python caches imported modules,
            # so the expensive load happens once per worker process. Keeping
            # them inside the try block turns an import failure into a logged
            # FAILED status instead of an exception in the worker.
            import pandas as pd
            from Library.Build_Dataset_lite import TrainingSet

            # Get X from experimental data set
            cobrafile = DIRECTORY+'Dataset_input/'+cobraname
            exp_data_path = f"H:/ROBOT_SCIENTIST/E_coli/Growth_rates/2025-10-31-27/processed/no_replicates/{expname}/AMN_dataset/"
            expfile = exp_data_path + exp_df_name

            log(f"Reading experimental data from {expfile}")
            df_exp = pd.read_csv(expfile+".csv")
            # Every column except one is counted as a medium component.
            mediumsize = len(df_exp.columns) - 1

            log(f"Creating TrainingSet with mediumsize={mediumsize}")
            parameter = TrainingSet(cobraname=cobrafile,
                                    mediumname=expfile,
                                    mediumbound=mediumbound,
                                    mediumsize=mediumsize,
                                    method='EXP', verbose=False)
            X = parameter.X.copy()
            log(f"X shape: {X.shape}")

            # Get other parameters from medium file
            mediumfile = exp_data_path + mediumname
            log(f"Reading medium file from {mediumfile}")
            parameter = TrainingSet(cobraname=cobrafile,
                                    mediumname=mediumfile,
                                    mediumbound=mediumbound,
                                    method=method, verbose=False)

            # Create varmed list: for each sample, the medium entries that are
            # both variable (level > 1) and switched on (value > 0) in that sample.
            log("Creating variable medium list")
            n_samples, n_media = X.shape[0], X.shape[1]
            varmed = [
                [parameter.medium[j]
                 for j in range(n_media)
                 if parameter.levmed[j] > 1 and X[i, j] > 0]
                for i in range(n_samples)
            ]
            log(f"Variable medium created with {len(varmed)} entries")

            # Get COBRA training set
            log(f"Starting COBRA training set generation for {X.shape[0]} samples with size_i={size_i}")
            for i in range(n_samples):
                log(f"Processing sample {i+1}/{X.shape[0]}")
                parameter.get(sample_size=size_i, varmed=varmed[i], verbose=verbose, reduce=reduce)
                log(f"Sample {i+1}/{X.shape[0]} completed")

            # Saving file
            trainingfile = DIRECTORY+'Dataset_model/'+expname+'_'+parameter.mediumbound
            log(f"Saving training file to {trainingfile}")
            parameter.save(trainingfile, reduce=reduce)
            log(f"Successfully completed processing for {expname}")

            return f"{expname}: SUCCESS"

        except Exception as e:
            import traceback
            log(f"ERROR processing {expname}: {str(e)}")
            log(traceback.format_exc())
            return f"{expname}: FAILED - {str(e)}"


# Main execution
# Runs create_random_medium_cobra once per experiment directory, spread over
# `paralell_level` worker processes, then prints one status line per experiment.
# NOTE(review): under the "spawn" start method (the default on Windows) workers
# must be able to import the mapped function; a function defined in a notebook
# cell may not be importable by them — confirm this cell completes on the
# target platform, or move the function into an importable module.
if __name__ == '__main__' or 'ipykernel' in sys.modules:
    # Create logs directory
    log_dir = './logs'
    os.makedirs(log_dir, exist_ok=True)

    # One experiment per sub-directory. Stray files are skipped so they are not
    # treated as experiments, and the list is sorted so the order of the
    # summary is the same on every run (os.listdir order is arbitrary).
    exp_root = 'H:/ROBOT_SCIENTIST/E_coli/Growth_rates/2025-10-31-27/processed/no_replicates'
    expnames = sorted(
        entry for entry in os.listdir(exp_root)
        if os.path.isdir(os.path.join(exp_root, entry))
    )

    print(f"Starting parallel processing of {len(expnames)} experiments with {paralell_level} workers")
    print(f"Log files will be created in {os.path.abspath(log_dir)}/ directory")
    print(f"Starting at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("Workers are being spawned...")
    print("(This may take 30-60 seconds for workers to load libraries)")
    # Flush so the banner is shown before the blocking map call below.
    sys.stdout.flush()

    # Use Pool without wrapper function
    with Pool(paralell_level) as p:
        results = p.map(create_random_medium_cobra, expnames)

    print(f"\nCompleted at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\n=== Processing Summary ===")
    for result in results:
        print(result)
    print(f"Check individual log files in {os.path.abspath(log_dir)}/ for detailed progress")


Starting parallel processing of 13 experiments with 4 workers
Log files will be created in c:\Users\rh2310\projects\amn_release\logs/ directory
Starting at: 2025-12-03 16:17:29
Workers are being spawned and libraries loaded...


In [None]:

# Verifying: load a saved training set from `trainingfile` and print its summary.
# NOTE(review): in the cells shown above, `TrainingSet` is imported and
# `trainingfile` is assigned only inside create_random_medium_cobra, not at
# notebook scope — confirm both names are defined before this cell runs on a
# fresh kernel.
parameter = TrainingSet()
parameter.load(trainingfile)
print(trainingfile)
parameter.printout()

reduced numbers of metabolites and reactions: 1078 507
./Dataset_model/mediabotJLF1_UB
model file name: ./Dataset_model/mediabotJLF1_UB
reduced model: True
medium file name: H:/ROBOT_SCIENTIST/E_coli/Growth_rates/2025-10-31-27/processed/no_replicates/mediabotJLF1/AMN_dataset/df_amn_dataset_levels
medium bound: UB
list of reactions in objective: ['BIOMASS_Ec_iML1515_core_75p37M']
method: pFBA
trainingsize: 66
list of medium reactions: 34
list of medium levels: 34
list of medium values: 34
ratio of variable medium turned on: 0.0
list of measured reactions: 507
Stoichiometric matrix (1078, 507)
Boundary matrix from reactions to medium: (34, 507)
Measurement matrix from reaction to measures: (507, 507)
Reaction to metabolite matrix: (1078, 507)
Metabolite to reaction matrix: (507, 1078)
Training set X: (66, 34)
Training set Y: (66, 507)
S_int matrix (478, 507)
S_ext matrix (507, 2663)
Q matrix (507, 478)
P matrix (507, 507)
b_int vector (478,)
b_ext vector (66, 2663)
Sb matrix (507, 1078)


The following cell has a completely different purpose from the rest of the notebook: it runs Cobrapy with provided values as inputs. These inputs are extracted from Reservoir Computing; see the notebook `Build_Model_RC.ipynb` for an example.

In [None]:
# This cell runs FBA on a provided training set and computes R2 between
# the provided objective and the calculated objective.
# R2 = 1 when the training set was generated by FBA, but may differ from 1
# when the training set is an experimental one.
# For an experimental training set, medium input fluxes can be scaled by a value.
# NOTE(review): `np`, `DIRECTORY`, `TrainingSet` and `run_cobra` are not defined
# in this cell — they must already be in scope when it runs.

from sklearn.metrics import r2_score

# What you can change 
seed = 10
np.random.seed(seed=seed)  
cobraname = 'iML1515_EXP'  # name of the model 
mediumbound = 'UB' # a must, exact bounds unknown
mediumname = 'iML1515_EXP' # name of experimental file, for out-of-the-box FBA
# mediumname = 'iML1515_UB_AMN_QP_RC_AMN_solution_for_Cobra_train' # for running Cobra with RC training points as inputs
# mediumname = 'iML1515_UB_AMN_QP_RC_AMN_solution_for_Cobra_pred' # for running Cobra with RC predictions as inputs
method = 'EXP' # FBA, pFBA or EXP
# End of What you can change

# Get data
cobrafile =  DIRECTORY+'Dataset_input/'+cobraname
mediumfile = DIRECTORY+'Dataset_input/'+mediumname
parameter = TrainingSet(cobraname=cobrafile, 
                        mediumname=mediumfile, mediumbound=mediumbound, mediumsize=38, 
                        method=method,verbose=False)
# scaler_list = [2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0] # test different scalers
scaler_list = [2.5] # best scaler for out-of-the box FBA
# scaler_list = [1] # for running Cobra with RC training inputs, see mediumname

# regression cobra vs. true values
L = parameter.X.shape[0]  # number of samples (rows of X)
for scaler in scaler_list:
    Y = {}  # sample index -> objective value returned by run_cobra
    for i in range(L):
        # Start with every model reaction set to 0, then fill in the medium reactions.
        inf = {r.id: 0 for r in parameter.model.reactions}
        for j in range(len(parameter.medium)):
            #print(j, parameter.medium[j],parameter.X[i,j], len(parameter.model.reactions))
            # Add a 1e-4 offset to any input below 1e-4 so it is never exactly zero.
            eps = 1.0e-4 if parameter.X[i,j] < 1.0e-4 else 0
            inf[parameter.medium[j]] = scaler * parameter.X[i,j] + eps
        # NOTE(review): run_cobra is called with method='pFBA' regardless of the
        # `method` variable set above — confirm this is intended.
        out,Y[i] = run_cobra(parameter.model, parameter.objective, inf, method='pFBA', verbose=False)
        # Columns printed: sample index, provided objective, computed objective.
        print("%d %.4f %.4f" % (i, parameter.Y[i], Y[i]))

    # Convert to a list (insertion order = sample order) for r2_score.
    Y = list(Y.values())
    r2 = r2_score(parameter.Y[0:L], Y[0:L], multioutput='variance_weighted')
    print('scaler %.2f R2 %.4f ' % (scaler, r2))
# np.array(Y).tofile("Result/Cobra_alone.csv") # to uncomment if cobra alone saved in file

0 0.1696 0.1542
1 0.1340 0.1609
2 0.1886 0.2010
3 0.1990 0.1943
4 0.0720 0.1135
5 0.0924 0.1040
6 0.0881 0.1068
7 0.0900 0.1251
8 0.1989 0.2010
9 0.1054 0.1046
10 0.2681 0.2412
11 0.1576 0.1135
12 0.1209 0.1943
13 0.2729 0.2546
14 0.2945 0.2479
15 0.2386 0.2010
16 0.2531 0.1787
17 0.2606 0.2947
18 0.2816 0.1542
19 0.1351 0.1675
20 0.1449 0.1675
21 0.2409 0.2546
22 0.2437 0.2657
23 0.1059 0.1135
24 0.1082 0.2010
25 0.2451 0.3416
26 0.3099 0.2189
27 0.2000 0.2881
28 0.2077 0.1318
29 0.3837 0.3014
30 0.2247 0.2256
31 0.3520 0.2256
32 0.2255 0.1943
33 0.1340 0.2613
34 0.2397 0.3126
35 0.3654 0.3126
36 0.1863 0.2144
37 0.1612 0.2613
38 0.3442 0.2657
39 0.2964 0.3059
40 0.4135 0.2724
41 0.2561 0.1720
42 0.3949 0.3014
43 0.4205 0.2479
44 0.3050 0.3349
45 0.2315 0.1675
46 0.2708 0.2546
47 0.3351 0.3818
48 0.2785 0.3014
49 0.0765 0.2010
50 0.0704 0.1068
51 0.2095 0.2010
52 0.1135 0.1251
53 0.2193 0.2010
54 0.3316 0.2479
55 0.1368 0.1068
56 0.1362 0.1318
57 0.1074 0.1609
58 0.2277 0.1943
59 0.20

In [None]:
# This cell runs FBA for the P. putida model on a provided training set and
# computes the accuracy between the provided objective and the calculated objective.
# NOTE(review): `np`, `DIRECTORY`, `TrainingSet` and `run_cobra` are not defined
# in this cell — they must already be in scope when it runs.

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# What you can change 
seed = 10
np.random.seed(seed=seed)
cobraname = 'IJN1463_10_UB'  # name of the model 
mediumbound = 'UB' # a must, exact bounds unknown
# The second assignment wins; comment it out to run with the Exp file instead.
mediumname = 'IJN1463_EXP' # for running Cobra with Exp file
mediumname = 'IJN1463_10_UB_AMN_QP_for_Cobra_train' # for running Cobra with RC file
method = 'EXP' # FBA, pFBA or EXP
L = 166 # split nitrogen (nh4) carbon (glucose)
# End of What you can change

# Get data
cobrafile =  DIRECTORY+'Dataset_input/'+cobraname
mediumfile = DIRECTORY+'Dataset_input/'+mediumname
parameter = TrainingSet(cobraname=cobrafile, 
                        mediumname=mediumfile,
                        mediumbound=mediumbound, mediumsize=196, 
                        method=method,verbose=False)

# Input medium are scaled by 10 for EXP file
scalerX = 1 if 'AMN' in mediumname else 10
Y = {}  # sample index -> objective value returned by run_cobra
for i in range(parameter.X.shape[0]):
    # Start with every model reaction set to 0, then fill in the medium reactions.
    inf = {r.id: 0 for r in parameter.model.reactions}
    for j in range(len(parameter.medium)):
        # Add a 1e-4 offset to any input below 1e-4 so it is never exactly zero.
        eps = 1.0e-4 if parameter.X[i,j] < 1.0e-4 else 0
        inf[parameter.medium[j]] = scalerX * parameter.X[i,j] + eps
    try:
        _, Y[i] = run_cobra(parameter.model, parameter.objective, inf, method='FBA', verbose=False)
    except Exception:
        # Best effort: a sample for which run_cobra fails gets objective 0, i.e.
        # it is predicted as "no growth" below. Only Exception is caught so that
        # interrupting the kernel still stops the loop.
        _, Y[i] = 0, 0
    #print("%d %.0f %.4f" % (i, parameter.Y[i], Y[i]))


# Accuracies corrected with reactions not in the model
# TN: 23 (28) for C (N) total=51  (reaction not in the model and no grow)
# FN: 3 (1) for C (N) total=4 (reaction not in the model and but grow)
y_true = np.transpose(parameter.Y)[0]
# A sample is predicted to grow when its computed objective exceeds 0.01.
y_pred = np.asarray([1 if Y[i] > 0.01 else 0 for i in range(len(Y))])
# All samples: add the 51 extra TN and 4 extra FN listed above.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accall = (tp+tn+51)/(tn+51+fp+fn+4+tp)
# First L samples (nitrogen / nh4): 28 extra TN, 1 extra FN.
tn, fp, fn, tp = confusion_matrix(y_true[:L], y_pred[:L]).ravel()
accnh4 = (tp+tn+28)/(tn+28+fp+fn+1+tp)
# Remaining samples (carbon / glucose): 23 extra TN, 3 extra FN.
tn, fp, fn, tp = confusion_matrix(y_true[L:], y_pred[L:]).ravel()
accglu = (tp+tn+23)/(tn+23+fp+fn+3+tp)
print('Acc %.4f %.4f %.4f' % (accall, accnh4, accglu))

Acc 0.9597 0.9538 0.9709
