In [1]:
import csv
import collections
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample

In [2]:
class MultilabelPredictor():
    """ Tabular Predictor for predicting multiple columns in table.
        Creates multiple TabularPredictor objects which you can also use individually.
        You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

        Parameters
        ----------
        labels : List[str]
            The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
        path : str, default = None
            Path to directory where models and intermediate outputs should be saved.
            If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
            Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
            Otherwise files from first `fit()` will be overwritten by second `fit()`.
            Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
        problem_types : List[str], default = None
            The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
        eval_metrics : List[str], default = None
            The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
        consider_labels_correlation : bool, default = True
            Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
            If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
            Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
        kwargs :
            Arguments passed into the initialization of each TabularPredictor.

    """

    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=False, **kwargs):
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {labels[i] : eval_metrics[i] for i in range(len(labels))}
        problem_type = None
        eval_metric = None
        for i in range(len(labels)):
            label = labels[i]
            path_i = self.path + "Predictor_" + label
            if problem_types is not None:
                problem_type = problem_types[i]
            if eval_metrics is not None:
                eval_metric = eval_metrics[i]
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, final = False, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

            Parameters
            ----------
            train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                See documentation for `TabularPredictor.fit()`.
            kwargs :
                Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        if tuning_data is not None:
            tuning_data_og = tuning_data.copy()
        else:
            tuning_data_og = None
        save_metrics = len(self.eval_metrics) == 0
        start = time.time()
        for i in range(len(self.labels)):
            label = self.labels[i]
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                labels_to_drop = [self.labels[j] for j in range(i+1, len(self.labels))]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...{i / len(self.labels) * 100}%")
            print(f"{(time.time() - start) / 60} minutes")
            if (final):
                predictor.fit(train_data=train_data[train_data[label] > float('-inf')]
                              , tuning_data = tuning_data
                              ,presets = 'best_quality'
                              ,num_bag_folds = 5,num_bag_sets = 2
                              , **kwargs)
            else:
                predictor.fit(train_data=train_data[train_data[label] > float('-inf')]
                              , tuning_data = tuning_data
                              ,presets = 'medium_quality'
                              #,presets = 'best_quality'
                              #,num_bag_folds = 5,num_bag_sets = 2
                              , **kwargs)
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
            kwargs :
                Arguments passed into the predict() call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
            kwargs :
                Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to evalate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
            kwargs :
                Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            
            eval_dict[label] = predictor.evaluate(data[data[label] > float('-inf')], **kwargs)
            if self.consider_labels_correlation:
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=self.path+self.multi_predictor_file, object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        if path[-1] != os.path.sep:
            path = path + os.path.sep
        return load_pkl.load(path=path+cls.multi_predictor_file)

    def get_predictor(self, label):
        """ Returns TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        data = self._get_data(data)
        if as_proba:
            predproba_dict = {}
        for i,label in enumerate(self.labels):
            print(f"Predicting with TabularPredictor for label: {label} ...{i / len(self.labels) * 100}%")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if not as_proba:
            return data[self.labels]
        else:
            return predproba_dict

In [3]:
#1 Load my ccl's ssGSEA signature
myCCLSignature = []
for name in ['sample.c2.cp.biocarta.gct',
             'sample.c2.cp.kegg.gct',
             'sample.c2.cp.pid.gct',
             'sample.c2.cp.reactome.gct',
             'sample.c2.cp.wiki.gct',
             'sample.c6.gct',
             'sample.hallmark.gct']:    
    with open(name, mode ='r')as file:
        csvFile = csv.reader(file)
        CCLSignature = list(csvFile)[2:]
        print(len(CCLSignature))
    for i, row in enumerate(CCLSignature):
        temp = CCLSignature[i][0].split('\t')
        if i > 0:
            CCLSignature[i] = [temp[0]] + [float(d) for d in temp[2:]]
        else:
            CCLSignature[i] = [temp[0]] + temp[2:]
    if not myCCLSignature:
        myCCLSignature += CCLSignature
    else:
        myCCLSignature += CCLSignature[1:]

293
187
197
1616
665
280
53


In [4]:
#2 Load CCLE ssGSEA signature
CCLECCLSignature = []
for name in ['ccle.c2.cp.biocarta.gct',
             'ccle.c2.cp.kegg.gct',
             'ccle.c2.cp.pid.gct',
             'ccle.c2.cp.reactome.gct',
             'ccle.c2.cp.wiki.gct',
             'ccle.c6.gct',
             'ccle.hallmark.gct']:
    with open(name, mode ='r')as file:
        csvFile = csv.reader(file)
        CCLSignature = list(csvFile)[2:]
        print(len(CCLSignature))
    for i, row in enumerate(CCLSignature):
        temp = CCLSignature[i][0].split('\t')
        CCLSignature[i] = [temp[0]] + temp[2:]
    if not CCLECCLSignature:
        CCLECCLSignature += CCLSignature
    else:
        CCLECCLSignature += CCLSignature[1:]

293
187
197
1616
665
280
53


In [5]:
############new 

In [6]:
#2.1 load model
modelMetaData = pd.read_csv('Model.csv', header=0)
modelMetaData = modelMetaData[['ModelID','Age','Sex']]
sex_mapping = {'Male': 1, 'Female': 0}
modelMetaData['Sex'] = modelMetaData['Sex'].map(sex_mapping)

In [7]:
metaDataDict = {}
for index, row in modelMetaData.iterrows():
    key = row['ModelID'] 
    value = [row['Age'], row['Sex']] 
    if key not in metaDataDict:
        metaDataDict[key] = value
    else:
        raise Exception("duplicated Id")

In [8]:
#2.2 load Mutation
mutationData = pd.read_csv('OmicsSomaticMutations.csv')
mutationData = mutationData[['ModelID','HugoSymbol']]

  mutationData = pd.read_csv('OmicsSomaticMutations.csv')


In [9]:
candidateKeyList = [
    'MGMT', 'IDH1', 'IDH2', 'EGFR',
    'TTN', 'MAPRE3', 'TP53', 'PIK3C2B', 'CIC', 'LRP2', 'LRP1', 'NRXN2', 'TEAD2', 'MYH3', 'NOTCH1', 'TFE3', 'PIK3R1', 'FRMD4A', 'PRCC', 'CHD3', 'BAG6', 'GLYR1', 'ADAM23', 'MSH6', 'ATRX',
    'MUC16', 'PTEN', 'NF1', 'OBSN', 'FLG', 'RYR2', 'MUC17',
    'BRAF', 'CDKN2A', 'CDKN2B', 'TERT', 'MYC'
]
candidateGeneMutationCount = {key: 0 for key in candidateKeyList}

In [10]:
mutationDataDict = {}
for index, row in mutationData.iterrows():
    key = row['ModelID']
    if key not in mutationDataDict:
        mutationDataDict[key] = candidateGeneMutationCount.copy()
    gene = row['HugoSymbol']
    if gene in candidateGeneMutationCount:
        mutationDataDict[key][row['HugoSymbol']] += 1

In [11]:
#############new end

In [12]:
#3 Load CTRP cclName to AUC map
cclToAUCdict = collections.defaultdict(list)
with open('CTRP_CCL_AUC.gct', mode ='r') as file:
    csvFile = csv.reader(file)
    CTRPCCLAUC = list(csvFile)
    CTRPCCLAUC = [''.join(sub).split('\t') for sub in CTRPCCLAUC]
    cclNames = CTRPCCLAUC[3][4:]

for i,cclName in enumerate(cclNames):
    cclToAUCdict[cclName] = [float( '-inf' if sub[4+i] == 'NaN' else sub[4+i]) for sub in CTRPCCLAUC[7:]]

In [13]:
#4 Load ccleID to ctrpName map
CCLEidToCTRPNameDict = collections.defaultdict(str)
CCLEidToDiseaseName = collections.defaultdict(str)
with open('sample_info.csv', mode ='r') as file:
    csvFile = csv.reader(file)
    mapInfos = list(csvFile)
    for mapInfo in mapInfos[1:]:
        CCLEidToCTRPNameDict[mapInfo[0]] = mapInfo[2]  
        CCLEidToDiseaseName[mapInfo[0]] = mapInfo[12]

In [14]:
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [15]:
# prediction data 
predictData = pd.DataFrame(data = myCCLSignature).transpose()
new_header = predictData.iloc[0] 
predictData = predictData[1:] 
predictData.columns = new_header 
predictData = predictData.apply(pd.to_numeric, errors='ignore')
predictData = predictData.set_index(['Name'])
predictData = normalize(predictData)
predictData = predictData.astype('float16')
predictData.head()

Unnamed: 0_level_0,BIOCARTA_GRANULOCYTES_PATHWAY,BIOCARTA_LYM_PATHWAY,BIOCARTA_BLYMPHOCYTE_PATHWAY,BIOCARTA_CARM_ER_PATHWAY,BIOCARTA_LAIR_PATHWAY,BIOCARTA_VDR_PATHWAY,BIOCARTA_MTA3_PATHWAY,BIOCARTA_GABA_PATHWAY,BIOCARTA_EGFR_SMRTE_PATHWAY,BIOCARTA_MONOCYTE_PATHWAY,...,HALLMARK_COAGULATION,HALLMARK_IL2_STAT5_SIGNALING,HALLMARK_BILE_ACID_METABOLISM,HALLMARK_PEROXISOME,HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_SPERMATOGENESIS,HALLMARK_KRAS_SIGNALING,HALLMARK_KRAS_SIGNALING_UP,HALLMARK_KRAS_SIGNALING_DN,HALLMARK_PANCREAS_BETA_CELLS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
THP1_d3_B.TPM,0.991211,0.985352,0.984375,0.876465,0.993652,0.595703,0.475098,0.492432,0.594238,0.986328,...,0.675781,0.810059,0.680176,0.774902,0.928711,0.793457,1.0,0.880859,0.337402,0.67041
2D_1.TPM,0.160278,0.161499,0.059387,0.379395,0.045349,1.0,0.089844,0.554199,0.363525,0.187866,...,0.025284,0.065186,0.221802,0.194458,0.019119,0.555664,0.0,0.037323,0.343262,0.248169
TRId7CWa.quant.TPM,0.438232,0.443604,0.540527,0.0,0.506836,0.303955,0.583496,0.429932,1.0,0.700195,...,0.927246,0.736328,0.484619,0.130737,0.436523,0.053558,0.354004,0.771484,0.96875,0.720703
H7.TPM,0.553711,0.5625,0.427979,1.0,0.724609,0.403809,0.853027,0.493896,0.51123,0.531738,...,0.822754,0.816406,0.113708,0.248291,0.477051,0.189209,0.544434,0.70752,0.644043,0.258057
THP1_d3_A.TPM,0.992676,0.992676,0.973633,0.795898,1.0,0.54834,0.615723,0.447266,0.474854,0.989258,...,0.691406,0.82666,0.710938,0.87793,0.928223,0.791504,0.947754,0.933105,0.476562,0.819824


In [16]:
predictData.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Index: 26 entries, THP1_d3_B.TPM to HCd7CWa.quant.TPM
Columns: 3284 entries, BIOCARTA_GRANULOCYTES_PATHWAY to HALLMARK_PANCREAS_BETA_CELLS
dtypes: float16(3284)
memory usage: 168.5 KB


In [17]:
# Prepare train set
trainData = pd.DataFrame(data = CCLECCLSignature).transpose()
new_header = trainData.iloc[0] 
trainData = trainData[1:] 
trainData.columns = new_header 

In [18]:
# filter valid ID
validSet = set()
for name in trainData['Name']:
    if CCLEidToCTRPNameDict[name] in cclToAUCdict:
        validSet.add(name)
trainData = trainData.loc[trainData['Name'].isin(validSet)]
trainData = trainData.reset_index(drop = True)
trainData = trainData.set_index(['Name'])
trainData = trainData.apply(pd.to_numeric)
trainData = trainData.astype('float16')
trainData.head()

Unnamed: 0_level_0,BIOCARTA_GRANULOCYTES_PATHWAY,BIOCARTA_LYM_PATHWAY,BIOCARTA_BLYMPHOCYTE_PATHWAY,BIOCARTA_CARM_ER_PATHWAY,BIOCARTA_LAIR_PATHWAY,BIOCARTA_VDR_PATHWAY,BIOCARTA_MTA3_PATHWAY,BIOCARTA_GABA_PATHWAY,BIOCARTA_EGFR_SMRTE_PATHWAY,BIOCARTA_MONOCYTE_PATHWAY,...,HALLMARK_COAGULATION,HALLMARK_IL2_STAT5_SIGNALING,HALLMARK_BILE_ACID_METABOLISM,HALLMARK_PEROXISOME,HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_SPERMATOGENESIS,HALLMARK_KRAS_SIGNALING,HALLMARK_KRAS_SIGNALING_UP,HALLMARK_KRAS_SIGNALING_DN,HALLMARK_PANCREAS_BETA_CELLS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-001113,-4026.0,-2440.0,-4668.0,4244.0,-2634.0,6680.0,5124.0,233.5,3240.0,-1373.0,...,-63.46875,2013.0,90.1875,4124.0,-741.5,-557.0,2440.0,-862.0,-3302.0,-1723.0
ACH-000242,-3938.0,-2622.0,-4240.0,4712.0,-3190.0,7320.0,5044.0,-1760.0,3166.0,-1523.0,...,503.75,2278.0,1100.0,4736.0,245.5,-1528.0,3270.0,-351.25,-3622.0,-2434.0
ACH-000327,-3782.0,-2314.0,-4324.0,5312.0,-2978.0,5864.0,4988.0,754.0,2148.0,-1586.0,...,-9.976562,2174.0,1878.0,5460.0,-374.5,-1132.0,2942.0,-475.25,-3416.0,-998.5
ACH-000461,-3736.0,-1105.0,-4948.0,4512.0,-2696.0,6556.0,4376.0,-550.0,3432.0,-1386.0,...,694.5,2594.0,-367.0,4192.0,313.0,-1276.0,3834.0,-95.9375,-3930.0,-1972.0
ACH-000792,-2076.0,1704.0,-1812.0,4140.0,448.75,6204.0,4704.0,-1424.0,2440.0,2108.0,...,1771.0,3038.0,-129.875,4040.0,950.0,-1170.0,3728.0,-368.75,-4096.0,-1965.0


In [19]:
trainData.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, ACH-001113 to ACH-000052
Columns: 3284 entries, BIOCARTA_GRANULOCYTES_PATHWAY to HALLMARK_PANCREAS_BETA_CELLS
dtypes: float16(3284)
memory usage: 4.0 MB


In [20]:
if not all(trainData.columns == predictData.columns):
    raise Exception("Column do not match!")

In [21]:
###new
trainData['Age'] = trainData.index.map(lambda x: metaDataDict[x][0])
trainData['Sex'] = trainData.index.map(lambda x: metaDataDict[x][1])

In [22]:
#counts matter
for geneName in candidateKeyList:
    trainData[geneName] = trainData.index.map(lambda x: 1 if x in mutationDataDict else 0)

In [23]:
trainData = normalize(trainData)

In [24]:
trainData = trainData.astype('float16')
trainData.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, ACH-001113 to ACH-000052
Columns: 3323 entries, BIOCARTA_GRANULOCYTES_PATHWAY to MYC
dtypes: float16(3323)
memory usage: 4.1 MB


In [25]:
### new end

In [26]:
#labels data
labelsDataOriginal = pd.DataFrame(columns = [sub[1] for sub in CTRPCCLAUC[7:]])
for name in trainData.index:
    labelsDataOriginal.loc[len(labelsDataOriginal.index)] = cclToAUCdict[CCLEidToCTRPNameDict[name]]
labelsDataOriginal = labelsDataOriginal.set_index(trainData.index)
labelsDataOriginal = labelsDataOriginal.astype('float16')

In [27]:
labelsDataOriginal.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, ACH-001113 to ACH-000052
Columns: 481 entries, zebularine to GSK-J4
dtypes: float16(481)
memory usage: 639.1 KB


In [28]:
#add high priority at the begining. e.g dasatinib
labels = list(labelsDataOriginal.columns)

In [29]:
#constants
problem_types = ['regression'] 
eval_metrics = ['mean_squared_error']
time_limit = 60 * 60 * 24
tops = [100,300,600]

In [33]:
for label in labels:
    try:
        if len(os.listdir(label)) > 1:
             continue
    except:
        print('Working on' + label)
    
    labelData = labelsDataOriginal[[label]]    
    trainDataSet = pd.concat([trainData, labelData], axis = 1)
    
    #first time training 
    save_path = label + '/' + 'GiloML_predictDrugAUC_Full_Feature_Medium_Quality_Model_' + label 
    multi_predictor = MultilabelPredictor(labels=[label], problem_types=problem_types, eval_metrics=eval_metrics, path=save_path)
    multi_predictor.fit(trainDataSet, time_limit=time_limit)
    #multi_predictor = MultilabelPredictor.load(save_path)
    #result = multi_predictor.predict(predictData)
    #result.to_csv(label + '/' + 'GiloML_predictDrugAUC_Full_Feature_Medium_Quality_Result_' + label+ '.csv')
    
    #get feature importance
    predictor = multi_predictor.get_predictor(label)
    feature_importance = predictor.feature_importance(trainDataSet[trainDataSet[label] > float('-inf')], num_shuffle_sets = 3)
    feature_importance.to_csv(label + '/' + "GlioML_feature_importance_" + label + ".csv")
    
    #only consider feature has positive contribution
    feature_importance = feature_importance[feature_importance['importance'] > 0]
    
#     #Evaluation
#     evaaluateResult = []
#     for top in tops:
#         reducedfeatures = list(feature_importance[:top].index)
#         trainDataR =  trainData[reducedfeatures]
#         trainDataSetR = pd.concat([trainDataR,labelData],axis = 1)
#         save_path_R = label + '/' + 'GlioML_ReducedFeature_' + str(top) + '_' + label + '_FeatureEvaluationModel'
#         train, evaldata = train_test_split(trainDataSetR,test_size = 0.2,random_state=1112)
#         predictor_single = MultilabelPredictor(labels=[label], problem_types=problem_types, eval_metrics=eval_metrics, path=save_path_R)
#         predictor_single.fit(train, time_limit=time_limit)
#         #predictor_single = MultilabelPredictor.load(save_path_R)
#         evaluationData = predictor_single.evaluate(evaldata)
#         evaaluateResult.append(evaluationData[label]['mean_squared_error'])
#         if(top == 100):
#             save_path_final = label + '/' + 'GiloML_predictDrugAUC_Top_100_Features_Best_Quality_Full_Data_Model_' + label
#             predictor_single = MultilabelPredictor(labels = [label], problem_types = problem_types, eval_metrics = eval_metrics, path = save_path_final)
#             predictor_single.fit(trainDataSetR, final = True, time_limit = time_limit)
#             #result_final = predictor_single.predict(predictData[reducedfeatures])
#             #result_final.to_csv(save_path_final + "_Result.csv")
    
#     maxPerformanceFeatureCount = tops[evaaluateResult.index(max(evaaluateResult))]
#     evaaluateResult = [['Component Name','TOP100','TOP300','TOP600'], [label] + evaaluateResult]
#     with open(label + '/' + 'GlioML_ReducedFeature_evalScore_' + label + '.csv', 'w', newline='') as csvfile:
#         writer = csv.writer(csvfile)
#         writer.writerows(evaaluateResult)
    
#     if maxPerformanceFeatureCount > 100: 
#         reducedfeatures = list(feature_importance[:maxPerformanceFeatureCount].index)
#         trainDataR =  trainData[reducedfeatures]
#         trainDataSetR = pd.concat([trainDataR,labelData],axis = 1)
#         save_path_final = label + '/' + 'GiloML_predictDrugAUC_Top_' + str(maxPerformanceFeatureCount) +'_Features_Best_Quality_Full_Data_Model_' + label
#         predictor_single = MultilabelPredictor(labels = [label], problem_types = problem_types, eval_metrics = eval_metrics, path = save_path_final)
#         predictor_single.fit(trainDataSetR, final = True, time_limit = time_limit)
#         #result_final = predictor_single.predict(predictData[reducedfeatures])
#         #result_final.to_csv(save_path_final + "_Result.csv")


Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 86400s
AutoGluon will save models to "erismodegib/GiloML_predictDrugAUC_Full_Feature_Medium_Quality_Model_erismodegib\Predictor_erismodegib\"
AutoGluon Version:  0.8.0
Python Version:     3.9.0
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22000
Disk Space Avail:   583.14 GB / 785.12 GB (74.3%)
Train Data Rows:    603
Train Data Columns: 3323
Label Column: erismodegib
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...


Fitting TabularPredictor for label: erismodegib ...0.0%
8.347431818644206e-05 minutes


	Available Memory:                    58674.15 MB
	Train Data (Original)  Memory Usage: 4.01 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Unused Original Features (Count: 41): ['WP_INSULIN_SIGNALING_IN_ADIPOCYTES_DIABETIC_CONDITION', 'WP_CELLS_AND_MOLECULES_INVOLVED_IN_LOCAL_ACUTE_INFLAMMATORY_RESPONSE', 'WP_PATHOGENIC_ESCHERICHIA_COLI_INFECTION', 'WP_ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY', 'WP_ENDOCHONDRAL_OSSIFICATION', 'IDH1', 'IDH2', 'EGFR', 'TTN', 'MAPRE3', 'TP53', 'PIK3C

MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('erismodegib/GiloML_predictDrugAUC_Full_Feature_Medium_Quality_Model_erismodegib\')


	2413.25s	= Expected runtime (804.42s per shuffle set)


MemoryError: Unable to allocate 2.95 GiB for an array with shape (120600, 3282) and data type float64