In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, precision_recall_curve, accuracy_score, auc
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import ADASYN
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
pd.set_option("display.max_rows", None)

In [38]:
metadata = pd.read_csv("metadata.csv",header=0,index_col=0)
microbiome = pd.read_csv("microbiome.csv",header=0,index_col=0)
metabolome = pd.read_csv("serum_lipo.csv",header=0,index_col=0)

In [None]:
microbiome_train, microbiome_test, diseases_train, diseases_test = train_test_split(microbiome, metadata[['PATGROUPFINAL_C']], train_size=0.8)
train_idx = microbiome_train.index
test_idx  = microbiome_test.index
metabolome_train = metabolome.loc[train_idx]
metabolome_test  = metabolome.loc[test_idx]
microbiome_and_metabolome_train = pd.merge(metabolome_train, microbiome_train, left_index=True, right_index=True, how='inner')
microbiome_and_metabolome_test = pd.merge(metabolome_test, microbiome_test, left_index=True, right_index=True, how='inner')
metadata['Disease_status'] = metadata['PATGROUPFINAL_C'].apply(lambda x:0 if x == '8' else 1)
metadata['T2D'] = metadata['PATGROUPFINAL_C'].apply(lambda x:1 if x in diseases_to_categories['Type_2_Diabetes'] else 0)
metadata['CAD'] = metadata['PATGROUPFINAL_C'].apply(lambda x:1 if x in diseases_to_categories['Coronary_Artery_Disease'] else 0)
metadata['HF'] = metadata['PATGROUPFINAL_C'].apply(lambda x:1 if x in diseases_to_categories['Heart_Failure'] else 0)
metadata['SO'] = metadata['PATGROUPFINAL_C'].apply(lambda x:1 if x in diseases_to_categories['Severe_Obesity'] else 0)
metadata['MS'] = metadata['PATGROUPFINAL_C'].apply(lambda x:1 if x in diseases_to_categories['Metabolic_Syndrome'] else 0)
metadata_train = metadata.loc[train_idx]
metadata_test = metadata.loc[test_idx]
labels_train = metadata_train['Disease_status']
labels_test_all_diseases = metadata_test['Disease_status']
labels_train_T2D = metadata_train['T2D']
labels_test_T2D = metadata_test['T2D']
labels_train_CAD = metadata_train['CAD']
labels_test_CAD = metadata_test['CAD']
labels_train_HF = metadata_train['HF']
labels_test_HF = metadata_test['HF']
labels_train_SO = metadata_train['SO']
labels_test_SO = metadata_test['SO']
labels_train_MS = metadata_train['MS']
labels_test_MS = metadata_test['MS']

In [None]:
def fix_imbalance(train_data, train_labels):
    sick_label = True
    healthy_label = False
    scale_factor = 1.75
    # Target: 90% healthy, 10% sick
    n_total = int(len(train_labels)*scale_factor)
    n_healthy_target = int(0.9 * n_total)
    n_sick_target = n_total - n_healthy_target
    print("Before under/over sampling:", Counter(train_labels))

    # Step 1: NearMiss v1 undersamples sick to its target count
    nm = NearMiss(version=1, n_neighbors=3, sampling_strategy={sick_label: n_sick_target})
    # X_nm, y_nm = nm.fit_resample(train_data, train_labels)
    # nm_indices = nm.sample_indices_
    # Step 2: ADASYN oversamples healthy to its target count
    adasyn = ADASYN(sampling_strategy={healthy_label: n_healthy_target}, n_neighbors=5)
    # train_data_balance, train_labels_balance  = adasyn.fit_resample(X_nm, y_nm)
    # adasyn_indices = adasyn.sample_indices_
    pipeline = Pipeline([('nearmiss', nm),('adasyn', adasyn)])

    train_data_balance, train_labels_balance = pipeline.fit_resample(train_data, train_labels)
    print("After under/over sampling:", Counter(train_labels_balance))
    return train_data_balance, train_labels_balance, train_labels

In [None]:
microbiome_train_balance, microbiome_labels_train_balance= fix_imbalance(microbiome_train, labels_train)
# microbiome_metabolome_train_balance, labels_train_balance = fix_imbalance(microbiome_and_metabolome_train, labels_train)
# microbiome_test_balance, microbiome_labels_test_balance_all_diseases = fix_imbalance(microbiome_test, labels_test_all_diseases)
# microbiome_metabolome_test_balance, labels_test_balance_all_diseases = fix_imbalance(microbiome_and_metabolome_test, labels_test_all_diseases)

Before under/over sampling: Counter({1: 964, 0: 142})


AttributeError: 'ADASYN' object has no attribute 'sample_indices_'

In [22]:
def run_random_forest(train_data, train_labels, test_data, test_labels, plot_title):
    model = RandomForestClassifier()
    model.fit(train_data,train_labels)
    importances = model.feature_importances_
    gini_df = pd.DataFrame({'Feature': train_data.columns,'Gini Importance': importances}).sort_values(by='Gini Importance', ascending=False)
    labels_probs = model.predict_proba(test_data)[:, 1]
    labels_pred = model.predict(test_data)
    correctness = labels_pred == test_labels
    confidence_df = pd.DataFrame({'Sample': test_data.index,'Confidence score': labels_probs, 'Correct':correctness}).sort_values(by='Confidence score', ascending=False)
    precision, recall, thresholds = precision_recall_curve(test_labels, labels_probs)
    aupr = auc(recall, precision)
    # aupr = average_precision_score(test_labels, labels_probs)
    baseline = sum(test_labels) / len(test_labels)
    plt.figure(figsize=(8, 5))
    plt.plot(recall, precision, label=f'AUPR = {aupr:.4f}')
    plt.hlines(baseline, 0, 1, colors='r', linestyles='dashed', label=f'Baseline = {baseline:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve '+plot_title)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return confidence_df, gini_df

In [None]:
microbiome_only_res = run_random_forest(microbiome_train_balance, microbiome_train_labels_balance, microbiome_test_balance, microbiome_labels_test_balance, 'any disease, microbiome data only')
microbiome_metabolome_res = run_random_forest(microbiome_metabolome_train_balance, train_labels_balance, microbiome_metabolome_test_balance, labels_test_balance, 'any disease, microbiome and metabolome data')

In [66]:
def run_feature_selection(train_data, train_labels, test_data):
    model = lgb.LGBMClassifier()
    model.fit(train_data,train_labels)
    selector = SelectFromModel(model, prefit=True)
    train_data_sel = selector.transform(train_data)
    test_data_sel = selector.transform(test_data)
    selected_features = train_data.columns[selector.get_support()]
    print(selected_features)
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': train_data.columns,'Importance': importances}).sort_values(by='Importance', ascending=False)
    return importance_df
    # labels_probs = model.predict_proba(test_data)[:, 1]
    # labels_pred = model.predict(test_data)
    # correctness = labels_pred == test_labels
    # confidence_df = pd.DataFrame({'Sample': test_data.index,'Confidence score': labels_probs, 'Correct':correctness}).sort_values(by='Confidence score', ascending=False)
    # precision, recall, thresholds = precision_recall_curve(test_labels, labels_probs)
    # aupr = auc(recall, precision)
    # # aupr = average_precision_score(test_labels, labels_probs)
    # baseline = sum(test_labels) / len(test_labels)
    # plt.figure(figsize=(8, 5))
    # plt.plot(recall, precision, label=f'AUPR = {aupr:.4f}')
    # plt.hlines(baseline, 0, 1, colors='r', linestyles='dashed', label=f'Baseline = {baseline:.4f}')
    # plt.xlabel('Recall')
    # plt.ylabel('Precision')
    # plt.title('Precision-Recall Curve '+plot_title)
    # plt.legend()
    # plt.tight_layout()
    # plt.show()
    # return confidence_df, gini_df

In [None]:
def balance_and_run_lgbm_per_disease(train_labels, test_labels):
    microbiome_train_balance, microbiome_train_labels_balance = fix_imbalance(microbiome_train, train_labels)
    microbiome_metabolome_train_balance, train_labels_balance = fix_imbalance(microbiome_and_metabolome_train, train_labels)
    microbiome_test_balance, microbiome_labels_test_balance = fix_imbalance(microbiome_test, test_labels)
    microbiome_metabolome_test_balance, labels_test_balance = fix_imbalance(microbiome_and_metabolome_test, test_labels)
    return run_feature_selection(microbiome_metabolome_train_balance, train_labels_balance, microbiome_metabolome_test_balance)

In [62]:
microbiome_and_metabolome_train.shape

(1106, 769)

In [65]:
run_feature_selection(microbiome_and_metabolome_train, labels_train_T2D, microbiome_and_metabolome_test)

[LightGBM] [Info] Number of positive: 346, number of negative: 760
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 191492
[LightGBM] [Info] Number of data points in the train set: 1106, number of used features: 769
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.312839 -> initscore=-0.786880
[LightGBM] [Info] Start training from score -0.786880
Index(['H1A1', 'H2TG', 'H3FC', 'H3PL', 'H3TG', 'H4A2', 'H4FC', 'H4PL', 'H4TG',
       'HDTG',
       ...
       'Prevotella sp003447235', 'Duodenibacillus intestinigallinarum',
       'Sutterella wadsworthensis', 'Acidaminococcus intestini',
       'Cryptobacteroides sp000433355',
       'Phascolarctobacterium_A succinatutens_A', 'Megasphaera elsdenii',
       'Prevotella sp002297965', 'Dialister sp002320515',
       'Allisonella histaminiformans'],
      dtype='object', length=306)




Unnamed: 0,Feature,Importance
465,Intestinibacter sp900540355,57
722,Acidaminococcus fermentans,40
394,RUG115 sp900066395,34
656,Klebsiella pneumoniae,32
653,Escherichia marmotae,25
271,Agathobaculum butyriciproducens,25
680,Faecalibacillus faecis,25
541,Phocaeicola sp000436795,25
369,Copromonas sp900066055,20
392,COE1 sp001916965,19


In [None]:
balance_and_run_rf_for_disease(labels_train_CAD,labels_test_CAD)

In [36]:
x = 0.8
larger_count = (confidence["Confidence score"] >= x).sum()
smaller_count = (confidence["Confidence score"] < x).sum()
print("larger: ",larger_count)
print("smaller: ",smaller_count)
large_confidence_and_correct = ((confidence["Confidence score"] > x) & (confidence["Correct"] == True)).sum()
large_confidence_and_correct

larger:  9
smaller:  484


np.int64(7)

In [40]:
diseases_to_categories = {'Metabolic_Syndrome':['1'],'Type_2_Diabetes':['3'],'Coronary_Artery_Disease':['4','5','6'],'Severe_Obesity':['2a','2b'],'Heart_Failure':['6','7'],'Healthy':['8']}