In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from flaml import AutoML
from lightgbm import LGBMClassifier
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef, recall_score, precision_score, f1_score, accuracy_score

In [3]:
# Directory holding the three ROSMAP input tables, relative to the working directory.
BASE_PATH = 'data/ROSMAP/'
methy_path = os.path.join(BASE_PATH, 'methy.csv')
mirna_path = os.path.join(BASE_PATH, 'mirna.csv')
mrna_path = os.path.join(BASE_PATH, 'mrna.csv')

# Fail fast before any loading happens, and name every file that is missing
# so the reader does not have to probe the three paths by hand.
missing_paths = [
    path for path in (methy_path, mirna_path, mrna_path)
    if not os.path.exists(path)
]
if missing_paths:
    # FileNotFoundError is a subclass of Exception, so any handler written
    # against the previous generic Exception keeps working.
    raise FileNotFoundError('Input file(s) not found: ' + ', '.join(missing_paths))

In [4]:
# Read the three tables in one pass; the first CSV column becomes each frame's row index.
methy_df, mirna_df, mrna_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (methy_path, mirna_path, mrna_path)
)
# Quick sanity check: show the (rows, columns) of every table side by side.
print(*(frame.shape for frame in (methy_df, mirna_df, mrna_df)))

(351, 202) (351, 202) (351, 202)


In [5]:
# First pass: remove the 'Label' column from each table ("wol" = without label).
methy_df_wol, mirna_df_wol, mrna_df_wol = (
    frame.drop(columns='Label') for frame in (methy_df, mirna_df, mrna_df)
)
# Second pass: remove the 'Split' column as well ("wos" = without split),
# leaving only the feature columns of each table.
methy_df_wos, mirna_df_wos, mrna_df_wos = (
    frame.drop(columns='Split')
    for frame in (methy_df_wol, mirna_df_wol, mrna_df_wol)
)

# Place the three feature tables side by side (aligned on the row index), then
# re-attach a single copy of 'Label' and 'Split', both taken from the methy table.
combined_df = pd.concat(
    [methy_df_wos, mirna_df_wos, mrna_df_wos], axis=1
).assign(
    Label=methy_df['Label'],
    Split=methy_df['Split'],
)

In [6]:
def get_train_test(df, label_col='Label', split_col='Split', train_value=1, test_value=0):
    """Partition a labelled table into train/test features and targets.

    Rows are assigned to a partition by comparing ``df[split_col]`` with
    ``train_value`` / ``test_value``. Rows matching neither value appear in
    neither partition. The input frame itself is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns plus ``label_col`` and ``split_col``.
    label_col : str, default 'Label'
        Name of the target column.
    split_col : str, default 'Split'
        Name of the column holding the partition marker.
    train_value, test_value : scalar, default 1 and 0
        Values of ``split_col`` that mark training and test rows.

    Returns
    -------
    tuple
        ``(train_df, test_df, y_train, y_test)`` where the two frames hold
        only feature columns and the two series hold the matching targets.

    Raises
    ------
    KeyError
        If ``label_col`` or ``split_col`` is not a column of ``df``.
    """
    def _partition(marker):
        # Purpose: extract (features, target) for the rows tagged with `marker`.
        # `drop` returns a new frame, so the `pop` below never touches `df`.
        features = df.loc[df[split_col] == marker].drop(columns=split_col)
        target = features.pop(label_col)
        return features, target

    train_df, y_train = _partition(train_value)
    test_df, y_test = _partition(test_value)
    return train_df, test_df, y_train, y_test

In [7]:
# Split the combined table into train/test feature frames and their label series
# using the 'Split' marker column (see get_train_test).
X_train, X_test, y_train, y_test = get_train_test(combined_df)

In [8]:
# Learn the per-feature scaling statistics from the training split only, then
# apply that same fitted transform to both splits so no test-set information
# enters the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Disabled: a LightGBM classifier with hand-pinned hyperparameters.
# NOTE(review): the values look like the result of an automated search rather
# than manual tuning — confirm provenance before re-enabling.
# lgbm = LGBMClassifier(learning_rate=0.2182359708539136, max_bin=127,
#                min_child_samples=17, n_estimators=1, n_jobs=-1, num_leaves=4,
#                reg_alpha=0.8016945370256343, reg_lambda=0.01772710330912467,
#                verbose=-1)

In [10]:
# Disabled: fit the pinned `lgbm` classifier (its definition in the previous cell
# is also commented out) and print its test-set metrics.
# As written this cell executes nothing, so any saved output shown beneath it
# was presumably captured before the lines were commented out — TODO confirm.
# lgbm.fit(X_train_scaled, y_train)
# y_pred = lgbm.predict(X_test_scaled)
# print('-----------------LGBM-----------------')
# print('cohen_kappa_score:', cohen_kappa_score(y_test, y_pred))
# print('matthews_corrcoef:', matthews_corrcoef(y_test, y_pred))
# print('recall_score:', recall_score(y_test, y_pred))
# print('precision_score:', precision_score(y_test, y_pred))
# print('f1_score:', f1_score(y_test, y_pred))
# print('accuracy_score:', accuracy_score(y_test, y_pred))


-----------------LGBM-----------------
cohen_kappa_score: 0.3970849626733025
matthews_corrcoef: 0.3982174688057041
recall_score: 0.6727272727272727
precision_score: 0.7254901960784313
f1_score: 0.6981132075471698
accuracy_score: 0.6981132075471698


In [11]:
# Run a FLAML hyperparameter search restricted to LightGBM on the scaled
# training split.
automl = AutoML()

# Search goal and constraints, forwarded to AutoML.fit as keyword arguments.
automl_settings = {
    "time_budget": 1*60*60,  # wall-clock search budget in seconds (1 hour)
    "metric": 'accuracy',
    "estimator_list": ['lgbm'],
    "task": 'classification',
    "log_file_name": "test.log",
    # Fix the search's random seed so repeated runs start from the same state.
    # The time budget can still stop runs after a different number of trials,
    # so results are more repeatable but not guaranteed bit-identical.
    "seed": 42,
}

automl.fit(X_train=X_train_scaled, y_train=y_train,
           **automl_settings)

# Class-probability predictions of the best model on the held-out test split.
print(automl.predict_proba(X_test_scaled))
# The underlying fitted estimator of the best configuration found.
print(automl.model.estimator)

[flaml.automl.logger: 01-17 11:33:57] {1679} INFO - task = classification
[flaml.automl.logger: 01-17 11:33:57] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 01-17 11:33:57] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 01-17 11:33:57] {1900} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 01-17 11:33:57] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-17 11:33:58] {2344} INFO - Estimated sufficient time budget=6204s. Estimated necessary time budget=6s.
[flaml.automl.logger: 01-17 11:33:58] {2391} INFO -  at 0.6s,	estimator lgbm's best error=0.3918,	best estimator lgbm's best error=0.3918
[flaml.automl.logger: 01-17 11:33:58] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-17 11:33:59] {2391} INFO -  at 1.2s,	estimator lgbm's best error=0.3918,	best estimator lgbm's best error=0.3918
[flaml.automl.logger: 01-17 11:33:59] {2218} INFO - iteration 2, current learner lgbm
[flaml

In [12]:
# Hard class predictions of the tuned model on the held-out test split.
y_pred = automl.predict(X_test_scaled)

# Report each metric in a fixed order; the label strings are printed verbatim.
metric_table = (
    ('Accuracy: ', accuracy_score),
    ('F1 score: ', f1_score),
    ('Recall: ', recall_score),
    ('Precision: ', precision_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy:  0.7358490566037735
F1 score:  0.7358490566037736
Recall:  0.7090909090909091
Precision:  0.7647058823529411
