In [1]:
import csv
import collections
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
from autogluon.tabular import FeatureMetadata
import os.path
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample

In [2]:
class MultilabelPredictor():
    """ Tabular Predictor for predicting multiple columns in table.
        Creates multiple TabularPredictor objects which you can also use individually.
        You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

        Parameters
        ----------
        labels : List[str]
            The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
        path : str, default = None
            Path to directory where models and intermediate outputs should be saved.
            If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
            Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
            Otherwise files from first `fit()` will be overwritten by second `fit()`.
            Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
        problem_types : List[str], default = None
            The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
        eval_metrics : List[str], default = None
            The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
        consider_labels_correlation : bool, default = True
            Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
            If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
            Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
        kwargs :
            Arguments passed into the initialization of each TabularPredictor.

    """

    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=False, **kwargs):
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {labels[i] : eval_metrics[i] for i in range(len(labels))}
        problem_type = None
        eval_metric = None
        for i in range(len(labels)):
            label = labels[i]
            path_i = self.path + "Predictor_" + label
            if problem_types is not None:
                problem_type = problem_types[i]
            if eval_metrics is not None:
                eval_metric = eval_metrics[i]
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, final = False, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

            Parameters
            ----------
            train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                See documentation for `TabularPredictor.fit()`.
            kwargs :
                Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        if tuning_data is not None:
            tuning_data_og = tuning_data.copy()
        else:
            tuning_data_og = None
        save_metrics = len(self.eval_metrics) == 0
        start = time.time()
        for i in range(len(self.labels)):
            label = self.labels[i]
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                labels_to_drop = [self.labels[j] for j in range(i+1, len(self.labels))]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...{i / len(self.labels) * 100}%")
            print(f"{(time.time() - start) / 60} minutes")
            if (final):
                predictor.fit(train_data=train_data[train_data[label] > float('-inf')]
                              , tuning_data = tuning_data
                              ,presets = 'best_quality'
                              ,num_bag_folds = 5,num_bag_sets = 2
                              ,feature_generator=None
                              , **kwargs)
            else:
                predictor.fit(train_data=train_data[train_data[label] > float('-inf')]
                              , tuning_data = tuning_data
                              ,presets = 'medium_quality'
                              ,feature_generator=None
                              #,presets = 'best_quality'
                              #,num_bag_folds = 5,num_bag_sets = 2
                              , **kwargs)
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
            kwargs :
                Arguments passed into the predict() call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
            kwargs :
                Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to evalate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
            kwargs :
                Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            
            eval_dict[label] = predictor.evaluate(data[data[label] > float('-inf')], **kwargs)
            if self.consider_labels_correlation:
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=self.path+self.multi_predictor_file, object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        if path[-1] != os.path.sep:
            path = path + os.path.sep
        return load_pkl.load(path=path+cls.multi_predictor_file)

    def get_predictor(self, label):
        """ Returns TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        data = self._get_data(data)
        if as_proba:
            predproba_dict = {}
        for i,label in enumerate(self.labels):
            print(f"Predicting with TabularPredictor for label: {label} ...{i / len(self.labels) * 100}%")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if not as_proba:
            return data[self.labels]
        else:
            return predproba_dict

In [3]:
#1 Load my ccl's ssGSEA signature
myCCLSignature = []
for name in ['sample.c2.cp.biocarta.gct',
             'sample.c2.cp.kegg.gct',
             'sample.c2.cp.pid.gct',
             'sample.c2.cp.reactome.gct',
             'sample.c2.cp.wiki.gct',
             'sample.c6.gct',
             'sample.hallmark.gct']:    
    with open(name, mode ='r')as file:
        csvFile = csv.reader(file)
        CCLSignature = list(csvFile)[2:]
        print(len(CCLSignature))
    for i, row in enumerate(CCLSignature):
        temp = CCLSignature[i][0].split('\t')
        if i > 0:
            CCLSignature[i] = [temp[0]] + [float(d) for d in temp[2:]]
        else:
            CCLSignature[i] = [temp[0]] + temp[2:]
    if not myCCLSignature:
        myCCLSignature += CCLSignature
    else:
        myCCLSignature += CCLSignature[1:]

293
187
197
1616
665
280
53


In [4]:
#2 Load CCLE ssGSEA signature
CCLECCLSignature = []
for name in ['ccle.c2.cp.biocarta.gct',
             'ccle.c2.cp.kegg.gct',
             'ccle.c2.cp.pid.gct',
             'ccle.c2.cp.reactome.gct',
             'ccle.c2.cp.wiki.gct',
             'ccle.c6.gct',
             'ccle.hallmark.gct']:
    with open(name, mode ='r')as file:
        csvFile = csv.reader(file)
        CCLSignature = list(csvFile)[2:]
        print(len(CCLSignature))
    for i, row in enumerate(CCLSignature):
        temp = CCLSignature[i][0].split('\t')
        CCLSignature[i] = [temp[0]] + temp[2:]
    if not CCLECCLSignature:
        CCLECCLSignature += CCLSignature
    else:
        CCLECCLSignature += CCLSignature[1:]

293
187
197
1616
665
280
53


In [5]:
############new 

In [6]:
#2.1 load model
modelMetaData = pd.read_csv('Model.csv', header=0)
modelMetaData = modelMetaData[['ModelID','Age','Sex']]
sex_mapping = {'Male': 1, 'Female': 0}
modelMetaData['Sex'] = modelMetaData['Sex'].map(sex_mapping)

In [7]:
metaDataDict = {}
for index, row in modelMetaData.iterrows():
    key = row['ModelID'] 
    value = [row['Age'], row['Sex']] 
    if key not in metaDataDict:
        metaDataDict[key] = value
    else:
        raise Exception("duplicated Id")

In [8]:
#2.2 load Mutation
mutationData = pd.read_csv('OmicsSomaticMutations.csv')
mutationData = mutationData[['ModelID','HugoSymbol']]

  mutationData = pd.read_csv('OmicsSomaticMutations.csv')


In [9]:
candidateKeyList = [
    'MGMT', 'IDH1', 'IDH2', 'EGFR',
    'TTN', 'MAPRE3', 'TP53', 'PIK3C2B', 'CIC', 'LRP2', 'LRP1', 'NRXN2', 'TEAD2', 'MYH3', 'NOTCH1', 'TFE3', 'PIK3R1', 'FRMD4A', 'PRCC', 'CHD3', 'BAG6', 'GLYR1', 'ADAM23', 'MSH6', 'ATRX',
    'MUC16', 'PTEN', 'NF1', 'OBSN', 'FLG', 'RYR2', 'MUC17',
    'BRAF', 'CDKN2A', 'CDKN2B', 'TERT', 'MYC'
]
candidateGeneMutationCount = {key: 0 for key in candidateKeyList}

In [10]:
mutationDataDict = {}
for index, row in mutationData.iterrows():
    key = row['ModelID']
    if key not in mutationDataDict:
        mutationDataDict[key] = candidateGeneMutationCount.copy()
    gene = row['HugoSymbol']
    if gene in candidateGeneMutationCount:
        mutationDataDict[key][row['HugoSymbol']] += 1

In [11]:
#############new end

In [12]:
#3 Load CTRP cclName to AUC map
cclToAUCdict = collections.defaultdict(list)
with open('CTRP_CCL_AUC.gct', mode ='r') as file:
    csvFile = csv.reader(file)
    CTRPCCLAUC = list(csvFile)
    CTRPCCLAUC = [''.join(sub).split('\t') for sub in CTRPCCLAUC]
    cclNames = CTRPCCLAUC[3][4:]

for i,cclName in enumerate(cclNames):
    cclToAUCdict[cclName] = [float( '-inf' if sub[4+i] == 'NaN' else sub[4+i]) for sub in CTRPCCLAUC[7:]]

In [13]:
#4 Load ccleID to ctrpName map
CCLEidToCTRPNameDict = collections.defaultdict(str)
CCLEidToDiseaseName = collections.defaultdict(str)
with open('sample_info.csv', mode ='r') as file:
    csvFile = csv.reader(file)
    mapInfos = list(csvFile)
    for mapInfo in mapInfos[1:]:
        CCLEidToCTRPNameDict[mapInfo[0]] = mapInfo[2]  
        CCLEidToDiseaseName[mapInfo[0]] = mapInfo[12]

In [14]:
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [15]:
# prediction data 
predictData = pd.DataFrame(data = myCCLSignature).transpose()
new_header = predictData.iloc[0] 
predictData = predictData[1:] 
predictData.columns = new_header 
predictData = predictData.apply(pd.to_numeric, errors='ignore')
predictData = predictData.set_index(['Name'])
predictData = normalize(predictData)
predictData = predictData.astype('float16')
predictData.head()

Unnamed: 0_level_0,BIOCARTA_GRANULOCYTES_PATHWAY,BIOCARTA_LYM_PATHWAY,BIOCARTA_BLYMPHOCYTE_PATHWAY,BIOCARTA_CARM_ER_PATHWAY,BIOCARTA_LAIR_PATHWAY,BIOCARTA_VDR_PATHWAY,BIOCARTA_MTA3_PATHWAY,BIOCARTA_GABA_PATHWAY,BIOCARTA_EGFR_SMRTE_PATHWAY,BIOCARTA_MONOCYTE_PATHWAY,...,HALLMARK_COAGULATION,HALLMARK_IL2_STAT5_SIGNALING,HALLMARK_BILE_ACID_METABOLISM,HALLMARK_PEROXISOME,HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_SPERMATOGENESIS,HALLMARK_KRAS_SIGNALING,HALLMARK_KRAS_SIGNALING_UP,HALLMARK_KRAS_SIGNALING_DN,HALLMARK_PANCREAS_BETA_CELLS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
THP1_d3_B.TPM,0.991211,0.985352,0.984375,0.876465,0.993652,0.595703,0.475098,0.492432,0.594238,0.986328,...,0.675781,0.810059,0.680176,0.774902,0.928711,0.793457,1.0,0.880859,0.337402,0.67041
2D_1.TPM,0.160278,0.161499,0.059387,0.379395,0.045349,1.0,0.089844,0.554199,0.363525,0.187866,...,0.025284,0.065186,0.221802,0.194458,0.019119,0.555664,0.0,0.037323,0.343262,0.248169
TRId7CWa.quant.TPM,0.438232,0.443604,0.540527,0.0,0.506836,0.303955,0.583496,0.429932,1.0,0.700195,...,0.927246,0.736328,0.484619,0.130737,0.436523,0.053558,0.354004,0.771484,0.96875,0.720703
H7.TPM,0.553711,0.5625,0.427979,1.0,0.724609,0.403809,0.853027,0.493896,0.51123,0.531738,...,0.822754,0.816406,0.113708,0.248291,0.477051,0.189209,0.544434,0.70752,0.644043,0.258057
THP1_d3_A.TPM,0.992676,0.992676,0.973633,0.795898,1.0,0.54834,0.615723,0.447266,0.474854,0.989258,...,0.691406,0.82666,0.710938,0.87793,0.928223,0.791504,0.947754,0.933105,0.476562,0.819824


In [16]:
predictData.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Index: 26 entries, THP1_d3_B.TPM to HCd7CWa.quant.TPM
Columns: 3284 entries, BIOCARTA_GRANULOCYTES_PATHWAY to HALLMARK_PANCREAS_BETA_CELLS
dtypes: float16(3284)
memory usage: 168.5 KB


In [17]:
# Prepare train set
trainData = pd.DataFrame(data = CCLECCLSignature).transpose()
new_header = trainData.iloc[0] 
trainData = trainData[1:] 
trainData.columns = new_header 

In [18]:
# filter valid ID
validSet = set()
for name in trainData['Name']:
    if CCLEidToCTRPNameDict[name] in cclToAUCdict:
        validSet.add(name)
trainData = trainData.loc[trainData['Name'].isin(validSet)]
trainData = trainData.reset_index(drop = True)
trainData = trainData.set_index(['Name'])
trainData = trainData.apply(pd.to_numeric)
trainData.head()

Unnamed: 0_level_0,BIOCARTA_GRANULOCYTES_PATHWAY,BIOCARTA_LYM_PATHWAY,BIOCARTA_BLYMPHOCYTE_PATHWAY,BIOCARTA_CARM_ER_PATHWAY,BIOCARTA_LAIR_PATHWAY,BIOCARTA_VDR_PATHWAY,BIOCARTA_MTA3_PATHWAY,BIOCARTA_GABA_PATHWAY,BIOCARTA_EGFR_SMRTE_PATHWAY,BIOCARTA_MONOCYTE_PATHWAY,...,HALLMARK_COAGULATION,HALLMARK_IL2_STAT5_SIGNALING,HALLMARK_BILE_ACID_METABOLISM,HALLMARK_PEROXISOME,HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_SPERMATOGENESIS,HALLMARK_KRAS_SIGNALING,HALLMARK_KRAS_SIGNALING_UP,HALLMARK_KRAS_SIGNALING_DN,HALLMARK_PANCREAS_BETA_CELLS
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-001113,-4026.645913,-2439.765461,-4667.433695,4245.632425,-2634.558718,6679.038788,5122.943551,233.525589,3240.420783,-1373.036419,...,-63.480372,2013.491371,90.175905,4123.227172,-741.313831,-557.117109,2439.206531,-861.816581,-3301.023113,-1722.977475
ACH-000242,-3937.716661,-2621.45277,-4241.404568,4712.036616,-3189.755692,7318.928946,5045.464864,-1759.62978,3165.594813,-1523.257437,...,503.868738,2278.095889,1099.536405,4735.463738,245.521668,-1527.711407,3270.470271,-351.167918,-3621.638189,-2433.575233
ACH-000327,-3781.416152,-2313.928569,-4325.047772,5313.421286,-2977.444387,5864.860635,4989.815628,754.09607,2148.453587,-1585.84287,...,-9.972973,2174.382638,1877.876002,5461.67688,-374.525043,-1131.652527,2941.386241,-475.140822,-3416.527063,-998.619342
ACH-000461,-3736.974891,-1104.574918,-4948.954189,4512.332186,-2696.549331,6554.031705,4377.850873,-550.037643,3431.112552,-1386.421102,...,694.296581,2594.017464,-367.00847,4193.268771,312.949334,-1275.923638,3834.275855,-95.907985,-3930.18384,-1971.890027
ACH-000792,-2075.019365,1703.994143,-1812.061451,4141.948015,448.719648,6202.852062,4702.84257,-1424.053292,2439.967871,2108.238933,...,1771.136924,3038.181715,-129.83929,4039.550791,949.878642,-1170.344231,3728.075095,-368.635379,-4096.710474,-1964.779113


In [19]:
if not all(trainData.columns == predictData.columns):
    raise Exception("Column do not match!")

In [20]:
###new
trainData['Age'] = trainData.index.map(lambda x: metaDataDict[x][0])
trainData['Sex'] = trainData.index.map(lambda x: metaDataDict[x][1])

In [21]:
#counts not matter
for geneName in candidateKeyList:
    trainData[geneName] = trainData.index.map(lambda x: 1 if x in mutationDataDict and mutationDataDict[x][geneName] > 0 else 0)

In [22]:
trainData = normalize(trainData)

In [23]:
trainData = trainData.astype('float16')

In [24]:
### new end

In [25]:
#labels data
labelsDataOriginal = pd.DataFrame(columns = [sub[1] for sub in CTRPCCLAUC[7:]])
for name in trainData.index:
    labelsDataOriginal.loc[len(labelsDataOriginal.index)] = cclToAUCdict[CCLEidToCTRPNameDict[name]]
labelsDataOriginal = labelsDataOriginal.set_index(trainData.index)
labelsDataOriginal = labelsDataOriginal.astype('float16')

In [26]:
#add high priority at the begining. e.g dasatinib
labels = list(labelsDataOriginal.columns)
labels = ['lovastatin','1S3R-RSL-3','daporinad','docetaxel','erlotinib','JW-55','selumetinib','trametinib'] + labels

In [27]:
#constants
problem_types = ['regression'] 
eval_metrics = ['mean_squared_error']
time_limit = 60 * 60 * 24
tops = [100,300,600]

In [28]:
trainData.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, ACH-001113 to ACH-000052
Columns: 3323 entries, BIOCARTA_GRANULOCYTES_PATHWAY to MYC
dtypes: float16(3323)
memory usage: 4.1 MB


In [29]:
skipped = []
valScoreReuslt = [['Component','WeightedEnsemble_L2', 'NeuralNetFastAI','LightGBMLarge','LightGBM','NeuralNetTorch','RandomForestMSE','ExtraTreesMSE','LightGBMXT','CatBoost','KNeighborsDist','KNeighborsUnif','XGBoost']]

In [30]:
for label in labels:
    if label == 'neuronal differentiation inducer III':
        continue  # Skip the 'neuronal differentiation inducer III' label and continue with the next iteration
#     try:
#         if len(os.listdir(label)) > 0:
#              continue
#     except:
#         print('Working on first time training of ' + label)
    
    labelData = labelsDataOriginal[[label]]    
    trainDataSet = pd.concat([trainData, labelData], axis = 1)
    
    #first time training 
    save_path = label + '/' + 'GiloML_predictDrugAUC_Full_Feature_Medium_Quality_Model_' + label 
    multi_predictor = MultilabelPredictor(labels=[label], problem_types=problem_types, eval_metrics=eval_metrics, path=save_path)
    #multi_predictor.fit(trainDataSet, time_limit=time_limit)
    multi_predictor = MultilabelPredictor.load(save_path) 
    try:
        predictor = multi_predictor.get_predictor(label)
        modleValScore = predictor.leaderboard()
    except:
        skipped.append(label)
        continue
    modelScoreDict = {}
    for index, row in modleValScore.iterrows():
        modelScoreDict[row['model']] = row['score_val']
    temp = [label]
    for model in valScoreReuslt[0][1:]:
        temp.append(modelScoreDict[model])
    valScoreReuslt += [temp]
    
    #result = multi_predictor.predict(predictData)
    #result.to_csv(label + '/' + 'GiloML_predictDrugAUC_Full_Feature_Medium_Quality_Result_' + label+ '.csv')
    
#     #get feature importance
#     predictor = multi_predictor.get_predictor(label)
#     feature_importance = predictor.feature_importance(trainDataSet[trainDataSet[label] > float('-inf')].astype('float16'), num_shuffle_sets = 3)
#     feature_importance.to_csv(label + '/' + "GlioML_feature_importance_" + label + ".csv")
    
#     del labelData, trainDataSet, feature_importance, multi_predictor
#     #only consider feature has positive contribution
#     feature_importance = feature_importance[feature_importance['importance'] > 0]
    
#     #Evaluation
#     evaaluateResult = []
#     for top in tops:
#         reducedfeatures = list(feature_importance[:top].index)
#         trainDataR =  trainData[reducedfeatures]
#         trainDataSetR = pd.concat([trainDataR,labelData],axis = 1)
#         save_path_R = label + '/' + 'GlioML_ReducedFeature_' + str(top) + '_' + label + '_FeatureEvaluationModel'
#         train, evaldata = train_test_split(trainDataSetR,test_size = 0.2,random_state=1112)
#         predictor_single = MultilabelPredictor(labels=[label], problem_types=problem_types, eval_metrics=eval_metrics, path=save_path_R)
#         predictor_single.fit(train, time_limit=time_limit)
#         #predictor_single = MultilabelPredictor.load(save_path_R)
#         evaluationData = predictor_single.evaluate(evaldata)
#         evaaluateResult.append(evaluationData[label]['mean_squared_error'])
#         if(top == 100):
#             save_path_final = label + '/' + 'GiloML_predictDrugAUC_Top_100_Features_Best_Quality_Full_Data_Model_' + label
#             predictor_single = MultilabelPredictor(labels = [label], problem_types = problem_types, eval_metrics = eval_metrics, path = save_path_final)
#             predictor_single.fit(trainDataSetR, final = True, time_limit = time_limit)
#             #result_final = predictor_single.predict(predictData[reducedfeatures])
#             #result_final.to_csv(save_path_final + "_Result.csv")
    
#     maxPerformanceFeatureCount = tops[evaaluateResult.index(max(evaaluateResult))]
#     evaaluateResult = [['Component Name','TOP100','TOP300','TOP600'], [label] + evaaluateResult]
#     with open(label + '/' + 'GlioML_ReducedFeature_evalScore_' + label + '.csv', 'w', newline='') as csvfile:
#         writer = csv.writer(csvfile)
#         writer.writerows(evaaluateResult)
    
#     if maxPerformanceFeatureCount > 100: 
#         reducedfeatures = list(feature_importance[:maxPerformanceFeatureCount].index)
#         trainDataR =  trainData[reducedfeatures]
#         trainDataSetR = pd.concat([trainDataR,labelData],axis = 1)
#         save_path_final = label + '/' + 'GiloML_predictDrugAUC_Top_' + str(maxPerformanceFeatureCount) +'_Features_Best_Quality_Full_Data_Model_' + label
#         predictor_single = MultilabelPredictor(labels = [label], problem_types = problem_types, eval_metrics = eval_metrics, path = save_path_final)
#         predictor_single.fit(trainDataSetR, final = True, time_limit = time_limit)
#         #result_final = predictor_single.predict(predictData[reducedfeatures])
#         #result_final.to_csv(save_path_final + "_Result.csv")




                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -2.509298       0.399004  261.396693                0.000999           0.222000            2       True         12
1              CatBoost  -2.540507       0.113001  138.389594                0.113001         138.389594            1       True          6
2         ExtraTreesMSE  -2.569937       0.049001    8.682175                0.049001           8.682175            1       True          7
3        NeuralNetTorch  -2.591213       0.210001   16.094831                0.210001          16.094831            1       True         10
4            LightGBMXT  -2.604052       0.015000    8.665190                0.015000           8.665190            1       True          3
5       RandomForestMSE  -2.678971       0.048000   49.272099                0.048000          49.272099            1       True          5
6              Light



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2 -10.145541       0.153007  32.573600                0.000000           0.092001            2       True         12
1        NeuralNetTorch -10.423170       0.088003   2.791130                0.088003           2.791130            1       True         10
2            LightGBMXT -10.866522       0.017000   2.191098                0.017000           2.191098            1       True          3
3         ExtraTreesMSE -10.999194       0.033003   3.382121                0.033003           3.382121            1       True          7
4       RandomForestMSE -11.023846       0.033004  16.877604                0.033004          16.877604            1       True          5
5              CatBoost -11.076791       0.052001  63.413526                0.052001          63.413526            1       True          6
6               XGBoost -11



                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.431670       0.087002   10.433913                0.001000           0.222003            2       True         12
1               XGBoost  -0.495153       0.010000    7.895186                0.010000           7.895186            1       True          9
2        KNeighborsUnif  -0.526800       0.051001    0.605008                0.051001           0.605008            1       True          1
3        KNeighborsDist  -0.526941       0.061002    0.608006                0.061002           0.608006            1       True          2
4        NeuralNetTorch  -0.557458       0.202000    7.226160                0.202000           7.226160            1       True         10
5         LightGBMLarge  -0.558285       0.014000    3.592611                0.014000           3.592611            1       True         11
6              Light



                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -5.782718       0.300003   44.834750                0.001000           0.221002            2       True         12
1        NeuralNetTorch  -6.133147       0.181001   17.997886                0.181001          17.997886            1       True         10
2            LightGBMXT  -6.365355       0.015000    3.628204                0.015000           3.628204            1       True          3
3       NeuralNetFastAI  -6.560314       0.021000    5.012711                0.021000           5.012711            1       True          8
4         ExtraTreesMSE  -6.726813       0.049001    5.140593                0.049001           5.140593            1       True          7
5       RandomForestMSE  -6.836134       0.048001   19.252557                0.048001          19.252557            1       True          5
6              CatBo



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -1.206328       0.064000  39.494820                0.000000           0.095000            2       True         12
1              CatBoost  -1.206328       0.064000  39.399821                0.064000          39.399821            1       True          6
2            LightGBMXT  -1.208622       0.015002   1.877018                0.015002           1.877018            1       True          3
3              LightGBM  -1.213405       0.012000   3.710302                0.012000           3.710302            1       True          4
4         LightGBMLarge  -1.215868       0.009999  15.008539                0.009999          15.008539            1       True         11
5       NeuralNetFastAI  -1.227280       0.016000   1.328050                0.016000           1.328050            1       True          8
6       RandomForestMSE  -1



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -1.713957       0.180006  48.484635                0.000000           0.093001            2       True         12
1        NeuralNetTorch  -1.762634       0.091002   2.961422                0.091002           2.961422            1       True         10
2       RandomForestMSE  -1.766092       0.030000  20.926117                0.030000          20.926117            1       True          5
3         LightGBMLarge  -1.770866       0.012001  22.846858                0.012001          22.846858            1       True         11
4               XGBoost  -1.806727       0.018002  22.346346                0.018002          22.346346            1       True          9
5              LightGBM  -1.811143       0.011001   3.917173                0.011001           3.917173            1       True          4
6         ExtraTreesMSE  -1



                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.544195       0.303056  107.930695                0.001000           0.221003            2       True         12
1              CatBoost  -0.548631       0.115048   90.226167                0.115048          90.226167            1       True          6
2            LightGBMXT  -0.553497       0.015000    5.231876                0.015000           5.231876            1       True          3
3              LightGBM  -0.553664       0.014001   12.576824                0.014001          12.576824            1       True          4
4         LightGBMLarge  -0.557429       0.015000   41.060748                0.015000          41.060748            1       True         11
5         ExtraTreesMSE  -0.558129       0.048001    9.419322                0.048001           9.419322            1       True          7
6       RandomForest



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.662496       0.033000   6.445130                0.000000           0.220001            2       True         12
1            LightGBMXT  -0.671377       0.013000   2.341070                0.013000           2.341070            1       True          3
2              LightGBM  -0.687250       0.014001   3.393537                0.014001           3.393537            1       True          4
3              CatBoost  -0.692999       0.074000  65.255523                0.074000          65.255523            1       True          6
4         LightGBMLarge  -0.695231       0.013000  10.004632                0.013000          10.004632            1       True         11
5        NeuralNetTorch  -0.704347       0.178000  10.226312                0.178000          10.226312            1       True         10
6       RandomForestMSE  -0



                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.902132       0.085024  122.658003                0.001000           0.224003            2       True         12
1              LightGBM  -0.907339       0.015998   12.193335                0.015998          12.193335            1       True          4
2            LightGBMXT  -0.916592       0.018001    5.705662                0.018001           5.705662            1       True          3
3       RandomForestMSE  -0.917220       0.046025   65.714716                0.046025          65.714716            1       True          5
4              CatBoost  -0.926992       0.179003   79.432176                0.179003          79.432176            1       True          6
5         LightGBMLarge  -0.928520       0.017018   44.538182                0.017018          44.538182            1       True         11
6       NeuralNetFas



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -1.529341       0.253527  53.852543                0.000000           0.226000            2       True         12
1            LightGBMXT  -1.553419       0.018000   6.772412                0.018000           6.772412            1       True          3
2        NeuralNetTorch  -1.575069       0.217527  30.355789                0.217527          30.355789            1       True         10
3              LightGBM  -1.598445       0.018000  16.498343                0.018000          16.498343            1       True          4
4              CatBoost  -1.609810       0.182001  95.974026                0.182001          95.974026            1       True          6
5         ExtraTreesMSE  -1.627831       0.048000  10.059652                0.048000          10.059652            1       True          7
6         LightGBMLarge  -1



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -1.503179       0.274999  87.240847                0.001000           0.224068            2       True         12
1       NeuralNetFastAI  -1.515538       0.030000   3.026053                0.030000           3.026053            1       True          8
2        NeuralNetTorch  -1.523907       0.205998  19.651502                0.205998          19.651502            1       True         10
3              LightGBM  -1.525863       0.017000  14.324477                0.017000          14.324477            1       True          4
4            LightGBMXT  -1.536998       0.017002   5.943661                0.017002           5.943661            1       True          3
5              CatBoost  -1.541860       0.179001  87.163982                0.179001          87.163982            1       True          6
6         LightGBMLarge  -1



                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -1.637469       0.250001  153.781924                0.001000           0.223001            2       True         12
1              CatBoost  -1.645103       0.184001   98.471835                0.184001          98.471835            1       True          6
2            LightGBMXT  -1.666960       0.017999    5.304045                0.017999           5.304045            1       True          3
3       NeuralNetFastAI  -1.667346       0.027000    2.798080                0.027000           2.798080            1       True          8
4         LightGBMLarge  -1.686371       0.019000   40.175310                0.019000          40.175310            1       True         11
5              LightGBM  -1.691936       0.019000   12.113698                0.019000          12.113698            1       True          4
6         ExtraTrees



                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.745806       0.296025  164.495542                0.000000           0.220003            2       True         12
1              LightGBM  -0.779884       0.017001   12.595275                0.017001          12.595275            1       True          4
2              CatBoost  -0.780311       0.181002  107.650745                0.181002         107.650745            1       True          6
3       RandomForestMSE  -0.792656       0.048001   61.693693                0.048001          61.693693            1       True          5
4         ExtraTreesMSE  -0.793507       0.048000    9.056642                0.048000           9.056642            1       True          7
5        KNeighborsUnif  -0.794104       0.080022    0.672025                0.080022           0.672025            1       True          1
6        KNeighborsD



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.850798       0.138024  49.396889                0.000000           0.223000            2       True         12
1         LightGBMLarge  -0.856643       0.018000  35.408751                0.018000          35.408751            1       True         11
2              LightGBM  -0.858260       0.018001  10.579348                0.018001          10.579348            1       True          4
3              CatBoost  -0.862169       0.123001  74.989357                0.123001          74.989357            1       True          6
4            LightGBMXT  -0.864297       0.018000   4.859056                0.018000           4.859056            1       True          3
5       NeuralNetFastAI  -0.867792       0.023005   2.535788                0.023005           2.535788            1       True          8
6         ExtraTreesMSE  -0



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.749555       0.224000  23.440138                0.000000           0.223001            2       True         12
1        NeuralNetTorch  -0.764719       0.200001  20.655554                0.200001          20.655554            1       True         10
2       NeuralNetFastAI  -0.764961       0.023999   2.561583                0.023999           2.561583            1       True          8
3            LightGBMXT  -0.774412       0.018000   5.068881                0.018000           5.068881            1       True          3
4              CatBoost  -0.775554       0.123001  76.314495                0.123001          76.314495            1       True          6
5         LightGBMLarge  -0.781090       0.024002  37.019292                0.024002          37.019292            1       True         11
6              LightGBM  -0



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -0.486832       0.257002  71.866255                0.000000           0.224001            2       True         12
1              LightGBM  -0.490033       0.020000  10.389867                0.020000          10.389867            1       True          4
2            LightGBMXT  -0.494882       0.020000   4.818611                0.020000           4.818611            1       True          3
3       NeuralNetFastAI  -0.500353       0.024000   2.521036                0.024000           2.521036            1       True          8
4         LightGBMLarge  -0.501343       0.020001  34.614185                0.020001          34.614185            1       True         11
5              CatBoost  -0.502891       0.124002  75.491446                0.124002          75.491446            1       True          6
6         ExtraTreesMSE  -0



                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -3.941422       0.342020  99.434999                0.001000           0.223001            2       True         12
1        NeuralNetTorch  -4.069571       0.207002  22.963875                0.207002          22.963875            1       True         10
2         ExtraTreesMSE  -4.194486       0.049001   9.306245                0.049001           9.306245            1       True          7
3            LightGBMXT  -4.196229       0.020008   7.650127                0.020008           7.650127            1       True          3
4       RandomForestMSE  -4.206760       0.048001  49.799441                0.048001          49.799441            1       True          5
5              LightGBM  -4.229255       0.021013  11.880084                0.021013          11.880084            1       True          4
6              CatBoost  -4

KeyError: 'WeightedEnsemble_L2'

In [32]:
pd.DataFrame(valScoreReuslt).to_csv('modelValScore.csv')

In [31]:
skipped

['zebularine']

In [33]:
labels

['lovastatin',
 '1S3R-RSL-3',
 'daporinad',
 'docetaxel',
 'erlotinib',
 'JW-55',
 'selumetinib',
 'trametinib',
 'zebularine',
 'azacitidine',
 'nelarabine',
 'myricetin',
 'BRD-K64610608',
 'ML334 diastereomer',
 'BRD-K09344309',
 'isonicotinohydroxamic acid',
 'QS-11',
 'brivanib',
 'BRD8958',
 'BRD-K34099515',
 'A-804598',
 'erismodegib',
 'abiraterone',
 'ifosfamide',
 'temozolomide',
 'BRD-A05715709',
 'BRD-K48477130',
 'CAY10594',
 'WP1130',
 'tamoxifen',
 'importazole',
 'ML006',
 'AM-580',
 'CD-1530',
 'silmitasertib',
 'PRL-3 inhibitor I',
 'NPC-26',
 'betulinic acid',
 'salermide',
 'BRD-M00053801',
 'AA-COCF3',
 'CI-976',
 'pifithrin-alpha',
 'purmorphamine',
 'BIBR-1532',
 'niclosamide',
 'bardoxolone methyl',
 'hyperforin',
 'BRD4132',
 'regorafenib',
 'sorafenib',
 'BRD-K51490254',
 'EX-527',
 'bendamustine',
 'PYR-41',
 'B02',
 'ML031',
 'tipifarnib-P1',
 'BRD9876',
 'parthenolide',
 'tandutinib',
 'CID-5951923',
 'DBeQ',
 'LE-135',
 'BRD-K24690302',
 'ML203',
 'dexamet