In [1]:
import sys

# Make the parent directory importable so project packages located one level
# above this notebook can be imported. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import torch

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('Available devices ', torch.cuda.device_count())
if device.type == 'cuda':
    # These queries need an initialised CUDA runtime, so they are only issued
    # when a CUDA device was actually selected above.
    print ('Current cuda device ', torch.cuda.current_device())
    print(torch.cuda.get_device_name(device))
else:
    print('CUDA is not available; running on CPU')

Available devices  4
Current cuda device  0
NVIDIA A100-PCIE-40GB


In [3]:
# Shell escape: run nvidia-smi to list the GPUs with their memory usage and utilization.
! nvidia-smi

Fri Dec 29 13:54:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.223.02   Driver Version: 470.223.02   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  Off  | 00000000:01:00.0 Off |                    0 |
| N/A   39C    P0    38W / 250W |  38629MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  Off  | 00000000:24:00.0 Off |                    0 |
| N/A   39C    P0    37W / 250W |   4533MiB / 40536MiB |      0%      Default |
|       

In [4]:
GPU_NUM = 3 # index of the GPU to use
# Select the requested GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    # All torch.cuda.* calls below require a CUDA device; on the CPU fallback
    # they would raise, so they live inside this branch.
    torch.cuda.set_device(device) # make the selected GPU the current CUDA device
    print ('Current cuda device ', torch.cuda.current_device()) # check

    # Additional Infos
    print(torch.cuda.get_device_name(GPU_NUM))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(GPU_NUM)/1024**3,1), 'GB')

    print ('Current cuda device ', torch.cuda.current_device())
else:
    print('CUDA is not available; running on CPU')

Current cuda device  3
NVIDIA A100-PCIE-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
Current cuda device  3




In [5]:
# Re-check which CUDA device is current. The query is skipped on CPU-only
# machines, where torch.cuda.current_device() raises instead of returning an index.
if torch.cuda.is_available():
    print ('Current cuda device ', torch.cuda.current_device())
else:
    print('CUDA is not available; running on CPU')

Current cuda device  3


global_settings

In [6]:
from logics_pack_original import global_settings, chemistry, predictor
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# Key of the fold that is held out as the test set.
TEST_FOLD_IDX = 5

# Activity thresholds.
KOR_ACT_THRS = 7.0  # threshold for being KOR active
PIK3CA_ACT_THRS = 8.0  # threshold for being PIK3CA active

# Every path below is given relative to the project root and is prefixed with
# project_dir while the dictionary is built.
project_dir='../'
logics_paths = {
    key: project_dir + path
    for key, path in (
        ("EXPERIMENT_SETTINGS_JSON", "logics_pack/experiment_settings.json"),
        # The files below are expected to be generated by the notebooks of the
        # pre-training phase (initial data).
        ("PIK3CA_DATA_PATH", "data/pik3ca/pik3ca_affinity_new.csv"),
        ("PIK3CA_FOLD_JSON", "data/pik3ca/pik3ca_fold_splits.json"),
        ("PIK3CA_DATA_FP", "data/pik3ca/pik3ca_aff_npfps.npy"),
    )
}
# The project root itself is stored as well, for building output paths later.
logics_paths["PROJECT_DIR"] = project_dir

In [12]:
# Build the ExperimentSettings object from the settings JSON path.
# NOTE(review): presumably the constructor reads that file — confirm in global_settings.
# A later cell calls expset_obj.update_setting(...) to record the best CV fold.
expset_obj = global_settings.ExperimentSettings(logics_paths['EXPERIMENT_SETTINGS_JSON'])

Training PIK3CA activity predictor (Random Forest Regressor)

In [None]:
# Load the regression targets (the 'affinity' column) and the precomputed
# feature matrix.
affinity_df = pd.read_csv(logics_paths['PIK3CA_DATA_PATH'])
pred_labels = np.array(affinity_df['affinity'])
fp_features = np.load(logics_paths['PIK3CA_DATA_FP'])
# Labels and features are indexed with the same fold indices below, so they
# must describe the same rows.
assert len(pred_labels) == len(fp_features), "labels and features differ in length"

# fold_split maps a fold id (string key) to the list of row indices in that fold.
with open(logics_paths['PIK3CA_FOLD_JSON'], 'r') as f:
    fold_split = json.load(f)

test_fold_id=str(TEST_FOLD_IDX)
tf_ids = np.array(fold_split[test_fold_id]) # test fold data indices

# Every fold except the test fold takes one turn as the validation fold.
vf_keys = list(fold_split.keys())
vf_keys.remove(test_fold_id)
print(vf_keys)

# Fixed seed: without it each run grows different forests, so the validation
# metrics (and the best fold selected from them later) are not reproducible.
RFR_SEED = 42

rfr_fold = []
vmse = [] # validation mse
vr2 = [] # validation r2
for key in vf_keys:
    # the fold named `key` is used as the validation set.
    vf_ids = np.array(fold_split[key]) # validation fold
    v_labels = pred_labels[vf_ids]
    v_features = fp_features[vf_ids]

    # Training rows are all rows that are in neither the test nor the validation fold.
    # np.delete returns a new array, so the inputs need no defensive copy.
    nt_ids = np.append(tf_ids, vf_ids) # non-training indices
    tr_labels = np.delete(pred_labels, nt_ids, axis=0)
    tr_features = np.delete(fp_features, nt_ids, axis=0)

    # train RFR in a default setup (apart from the fixed seed)
    rfr = RandomForestRegressor(n_estimators=100, random_state=RFR_SEED) # n_estimators=100 is default for sklearn version>=0.22
    rfr.fit(tr_features, tr_labels)
    rfr_fold.append(rfr)

    # validation performance
    v_preds = rfr.predict(v_features)
    vmse.append(mean_squared_error(v_labels, v_preds))
    vr2.append(r2_score(v_labels, v_preds))

['0', '1', '2', '3', '4']


In [None]:
# Alias the cross-validation outputs under the names used by the cells below
# (nothing is trained in this cell).
rfr_cvs, cv_fold_keys = rfr_fold, vf_keys

In [None]:
# Gather the per-fold validation metrics in one table and write it to disk.
cv_folds = list(map(int, cv_fold_keys))
pred_result = pd.DataFrame({'cv_fold': cv_folds, 'vmse': vmse, 'vr2': vr2})
results_csv = logics_paths['PROJECT_DIR'] + "model-pretrain/predictor_pik3ca_rfr_cv_results.csv"
pred_result.to_csv(results_csv, index=False)

In [None]:
# find the best performing cv fold by validation R2
best_cv_idx = pred_result['vr2'].idxmax()  # index label of the row with the highest R2
# idxmax() returns a label, so the row is looked up with .loc rather than the
# positional .iloc. The value is cast to a plain int because pandas hands back
# a numpy integer, which the standard json module cannot serialise.
best_cv = int(pred_result.loc[best_cv_idx, 'cv_fold'])
# add best cv info to the experiment setting json file, and overwrite it
expset_obj.update_setting("pik3ca-pred-best-cv", best_cv)
print("best CV fold of PIK3CA predictor: ", best_cv)

In [None]:
# Pickle each cross-validation model into its own file, named after the fold
# that served as its validation set.
for cv_idx, fold_model in zip(cv_folds, rfr_cvs):
    model_path = logics_paths['PROJECT_DIR'] + "model-pretrain/predictor_pik3ca_rfr_cv%d.pkl"%cv_idx
    with open(model_path, 'wb') as model_file:
        pickle.dump(fold_model, model_file)