# Concat feature importances

Running xgboost multiple times results in different sets of feature importances (all uniquely saved in feature_importances), probably due to the randomness and greediness of the algorithm. However, some features will be important in multiple runs, and these are the features that I think are the most important.

So this notebook concatenates multiple feature_importance files into one file (per cancer type) that has information about the feature importances and the number of times a feature was found to be important.


In [3]:
# Combine the per-run feature importance files of each cancer type into one
# dataframe with columns: feature, importances (a list), count.
# (Yes, count is just the length of importances, but it is handy to have it
# as a redundant column.)
import os

import pandas as pd

cancer_types = ['BLCA', 'BRCA', 'COAD', 'ESCA', 'HNSC', 'KIRC', 'KIRP', 'LIHC', 'LUAD', 'LUSC', 'PRAD', 'THCA', 'UCEC']
# cancer_type = '' # for multiclass

importance_dir = 'feature_importances/'

for cancer_type in cancer_types:
    # Pick up every run file of this cancer type. The '_run_' filter also
    # keeps the '<type>_concatenated.csv' files written below out of the input.
    file_names = [name for name in os.listdir(importance_dir) if cancer_type+'_run_' in name]
    print("Reading in files " , file_names)

    # feature -> importances collected across runs. A plain dict keeps
    # first-seen order, so rows are created in the same order as when they
    # were appended to the dataframe one by one, without DataFrame.append
    # (removed in pandas 2.0) or chained assignment (breaks under
    # copy-on-write).
    importances_by_feature = {}
    for file_name in file_names:
        with open(importance_dir + file_name) as file:
            for line in file:
                if not line.strip():
                    # tolerate blank lines instead of crashing on the split
                    continue
                values = line.split(',')

                # Each field is '<label>:<value>'; the first character after
                # the colon is dropped before converting.
                # NOTE(review): presumably that character is a space
                # ('feature: 123') -- confirm against the code that writes
                # the run files.
                feature = int(values[0].split(':')[1][1:])
                importance = float(values[1].split(':')[1][1:])

                importances_by_feature.setdefault(feature, []).append(importance)

    # Build the dataframe in one go; passing columns= keeps the expected
    # header even when no run files were found.
    all_importances = pd.DataFrame(
        [
            {"feature": feature, "importances": importances, "count": len(importances)}
            for feature, importances in importances_by_feature.items()
        ],
        columns=['feature', 'importances', 'count'],
    )

    # features found in the most runs come first
    all_importances = all_importances.sort_values(by = 'count', ascending = False)
    print(all_importances)

    # save file
    all_importances.to_csv(importance_dir+cancer_type+'_concatenated.csv', index=False)

Reading in files  ['BLCA_run_3.csv', 'BLCA_run_6.csv', 'BLCA_run_0.csv', 'BLCA_run_4.csv', 'BLCA_run_5.csv', 'BLCA_run_7.csv', 'BLCA_run_1.csv', 'BLCA_run_2.csv']
    feature                          importances count
17   215686  [0.04211, 0.0147, 0.01773, 0.02576]     4
9     77064  [0.03419, 0.04288, 0.0522, 0.00713]     4
16   185000           [0.1398, 0.05291, 0.05652]     3
54   230575             [0.1133, 0.0929, 0.3074]     3
47   114530          [0.06221, 0.07588, 0.17003]     3
..      ...                                  ...   ...
56        1                            [0.00263]     1
53   206040                            [0.02492]     1
51   158344                            [0.00714]     1
50   154647                            [0.00331]     1
153  244282                            [0.08188]     1

[154 rows x 3 columns]
Reading in files  ['BRCA_run_0.csv', 'BRCA_run_1.csv', 'BRCA_run_3.csv', 'BRCA_run_5.csv', 'BRCA_run_7.csv', 'BRCA_run_6.csv', 'BRCA_run_4.csv', 'BRCA_ru