In [1]:
import pandas as pd 
import numpy as np 
from flaml import AutoML
import os

In [2]:
base_path = "data/ROSMAP"
methy_path = os.path.join(base_path, "methy.csv")
mirna_path = os.path.join(base_path, "mirna.csv")
mrna_path = os.path.join(base_path, "mrna.csv")
if not os.path.exists(methy_path) or not os.path.exists(mirna_path) or not os.path.exists(mrna_path):
    raise ValueError("Data Not found")

In [3]:
methy_df = pd.read_csv(methy_path, index_col=0)
mirna_df = pd.read_csv(mirna_path, index_col=0)
mrna_df = pd.read_csv(mrna_path, index_col=0)
print(methy_df.shape, mirna_df.shape, mrna_df.shape)

(351, 202) (351, 202) (351, 202)


In [15]:
methy_df.head()

Unnamed: 0,ENSG00000161896.6,ENSG00000248714.2,ENSG00000242349.1,ENSG00000006831.9,ENSG00000261195.1,ENSG00000143409.11,ENSG00000140931.15,ENSG00000185499.11,ENSG00000101265.10,ENSG00000153048.6,...,ENSG00000168209.3,ENSG00000203565.2,ENSG00000186635.10,ENSG00000157978.6,ENSG00000138207.7,ENSG00000236609.3,ENSG00000136451.4,ENSG00000158715.5,Label,Split
1,0.181381,0.125337,0.094807,0.480227,0.246111,0.370208,0.214085,0.230799,0.42763,0.518057,...,0.64037,0.0,0.491023,0.256043,0.492993,0.405093,0.322018,0.213731,0.0,1.0
2,0.132848,0.099462,0.149573,0.487058,0.37794,0.358607,0.300085,0.293175,0.400915,0.587203,...,0.688326,0.117427,0.511048,0.282617,0.371442,0.394418,0.331158,0.216541,1.0,1.0
3,0.111206,0.069999,0.127883,0.412276,0.198039,0.289383,0.200385,0.149573,0.374727,0.434148,...,0.4579,0.0,0.452789,0.202694,0.550137,0.398473,0.303811,0.141733,1.0,1.0
4,0.085876,0.129139,0.071849,0.473626,0.23143,0.393466,0.184434,0.181821,0.411244,0.489082,...,0.728601,0.0,0.495581,0.230799,0.440702,0.413301,0.3492,0.176454,0.0,1.0
5,0.068125,0.121434,0.008678,0.453002,0.150118,0.340528,0.201158,0.231745,0.390683,0.506201,...,0.594668,0.0,0.464628,0.239995,0.566704,0.374245,0.257372,0.222676,0.0,1.0


In [33]:
def get_train_test(df):
    train_df = df[df["Split"]==1].drop("Split", axis=1)
    test_df = df[df["Split"]==0].drop("Split", axis=1)
    y_train = train_df.pop("Label")
    y_test = test_df.pop("Label")
    return (train_df.values, y_train.values.astype("int32")), (test_df.values, y_test.values.astype("int32"))

train_data_methy, test_data_methy = get_train_test(methy_df)

In [46]:
def automl_predict(train_data, test_data, modality_type, time_budget = int(1*60)):
    automl = AutoML()

    automl_settings = {
    "time_budget": time_budget,  # in seconds
    "metric": 'accuracy',
    "task": 'classification',
    "log_file_name": f"automl_logs/{modality_type}.log",
    }
    automl.fit(X_train=train_data[0], y_train=train_data[1],
        **automl_settings)
    predictions = automl.predict(test_data[0])
    accuracy = np.where(predictions == test_data[1], 1, 0).sum()/len(predictions)
    estimator = automl.model.estimator
    return estimator, accuracy


In [47]:
train_data_methy, test_data_methy = get_train_test(methy_df)
train_data_mirna, test_data_mirna = get_train_test(mirna_df)
train_data_mrna, test_data_mrna = get_train_test(mrna_df)

In [50]:
methy_estimator, methy_accuracy = automl_predict(train_data_methy, test_data_methy, "methy")
print(methy_accuracy)
print(methy_estimator)

[flaml.automl.logger: 12-15 21:39:43] {1679} INFO - task = classification
[flaml.automl.logger: 12-15 21:39:43] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 12-15 21:39:43] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-15 21:39:43] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-15 21:39:43] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-15 21:39:43] {2344} INFO - Estimated sufficient time budget=1896s. Estimated necessary time budget=44s.
[flaml.automl.logger: 12-15 21:39:43] {2391} INFO -  at 0.2s,	estimator lgbm's best error=0.3510,	best estimator lgbm's best error=0.3510
[flaml.automl.logger: 12-15 21:39:43] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-15 21:39:43] {2391} INFO -  at 0.3s,	estimator lgbm's best error=0.3510,	best estimator lgbm's best error=0.3510
[flaml.automl.logger: 12-15 21:39:



In [51]:
mirna_estimator, mirna_accuracy = automl_predict(train_data_mirna, test_data_mirna, "mirna")
print(mirna_accuracy)
print(mirna_estimator)

[flaml.automl.logger: 12-15 21:40:43] {1679} INFO - task = classification
[flaml.automl.logger: 12-15 21:40:43] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 12-15 21:40:43] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-15 21:40:43] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-15 21:40:43] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-15 21:40:43] {2344} INFO - Estimated sufficient time budget=1108s. Estimated necessary time budget=26s.
[flaml.automl.logger: 12-15 21:40:43] {2391} INFO -  at 0.1s,	estimator lgbm's best error=0.4612,	best estimator lgbm's best error=0.4612
[flaml.automl.logger: 12-15 21:40:43] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-15 21:40:43] {2391} INFO -  at 0.2s,	estimator lgbm's best error=0.4367,	best estimator lgbm's best error=0.4367
[flaml.automl.logger: 12-15 21:40:



In [52]:
mirna_estimator, mirna_accuracy = automl_predict(train_data_mirna, test_data_mirna, "mirna")
print(mirna_accuracy)
print(mirna_estimator)

[flaml.automl.logger: 12-15 21:41:43] {1679} INFO - task = classification
[flaml.automl.logger: 12-15 21:41:43] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 12-15 21:41:43] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-15 21:41:43] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-15 21:41:43] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-15 21:41:43] {2344} INFO - Estimated sufficient time budget=1000s. Estimated necessary time budget=23s.
[flaml.automl.logger: 12-15 21:41:43] {2391} INFO -  at 0.1s,	estimator lgbm's best error=0.4612,	best estimator lgbm's best error=0.4612
[flaml.automl.logger: 12-15 21:41:43] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-15 21:41:44] {2391} INFO -  at 0.2s,	estimator lgbm's best error=0.4367,	best estimator lgbm's best error=0.4367
[flaml.automl.logger: 12-15 21:41:



[flaml.automl.logger: 12-15 21:42:44] {2627} INFO - retrain xgb_limitdepth for 0.3s
[flaml.automl.logger: 12-15 21:42:44] {2630} INFO - retrained model: XGBClassifier(base_score=None, booster=None, callbacks=[],
              colsample_bylevel=0.8811171114303163, colsample_bynode=None,
              colsample_bytree=0.8499027725496043, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1.0, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=0.5338087998905253, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=14,
              n_jobs=-1, num_parallel_tree=None, random_state=None, ...)
[flaml.automl.logger: 12-15 21:42:44] {1930

In [73]:
def automl_predict(train_data, test_data, modality_type, time_budget = int(1)):
    automl = AutoML()

    automl_settings = {
    "time_budget": time_budget,  # in seconds
    "metric": 'accuracy',
    "task": 'classification',
    "log_file_name": f"automl_logs/{modality_type}.log",
    "seed":1000
    }
    automl.fit(X_train=train_data[0], y_train=train_data[1],
        **automl_settings)
    predictions = automl.predict(test_data[0])
    accuracy = np.where(predictions == test_data[1], 1, 0).sum()/len(predictions)
    print(f"Accuracy for Modality: {modality_type} is {accuracy}")
    return predictions

In [74]:
predictions_arr = []
methy_predictions = automl_predict(train_data_methy, test_data_methy, "methy")
predictions_arr.append(methy_predictions)
mirna_predictions = automl_predict(train_data_mirna, test_data_mirna, "mirna")
predictions_arr.append(mirna_predictions)
mrna_predictions = automl_predict(train_data_mrna, test_data_mrna, "mrna")
predictions_arr.append(mrna_predictions)


[flaml.automl.logger: 12-15 22:00:46] {1679} INFO - task = classification
[flaml.automl.logger: 12-15 22:00:46] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-15 22:00:46] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-15 22:00:46] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-15 22:00:46] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-15 22:00:47] {2344} INFO - Estimated sufficient time budget=935s. Estimated necessary time budget=22s.
[flaml.automl.logger: 12-15 22:00:47] {2391} INFO -  at 0.1s,	estimator lgbm's best error=0.2593,	best estimator lgbm's best error=0.2593
[flaml.automl.logger: 12-15 22:00:47] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-15 22:00:47] {2391} INFO -  at 0.1s,	estimator lgbm's best error=0.2593,	best estimator lgbm's best error=0.2593
[flaml.automl.logger: 12-15 22

In [75]:
predictions_arr = np.array(predictions_arr)
print(predictions_arr.shape)

(3, 106)


In [76]:
def ensemble_predictions(predictions_arr):
    predictions = np.mean(predictions_arr, axis=0)
    predictions = np.where(predictions > 0.5, 1, 0)
    return predictions

In [77]:
ensemble_predictions = ensemble_predictions(predictions_arr)
print(ensemble_predictions.shape)

(106,)


In [78]:
print(np.where(ensemble_predictions == test_data_methy[1], 1, 0).sum()/len(ensemble_predictions))

0.7358490566037735
