In [2]:
#Package Imports
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import sklearn
from sklearn.metrics import r2_score

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This notebook reads in all of the test-set predictions and averages them to find the final ensemble predictions

Inputs:
1.   RFR_encoding_dataset_path (String)
2.   KNR_encoding_dataset_path (String)
3.   MPNR_encoding_dataset_path (String)

Outputs:
csv of R-squared values   

In [1]:
#COLAB ONLY
parent_folder_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/'

len_sequence_of_base_variant = 451 # len(Sequence of Base Variant)
dependent_variable = 'Fluor Decay' # Desired Name of Dependent Variable
comparison_variant = 'jGCaMP7s' #Variant For Optimization Name
rfr_encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/KNR/KNR_Encoding_Datasets.csv' #these should change and go to RFR/MPNR
mpnr_encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/KNR/KNR_Encoding_Datasets.csv'
knr_encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/KNR/KNR_Encoding_Datasets.csv'

In [4]:
#isolate top RFR encoding datasets
rfr_top_encoding_data = pd.read_csv(rfr_encoding_dataset_path,index_col= 0)
rfr_top_encoding_data = rfr_top_encoding_data.sort_values(by = 'Test Set R Squared',
                                                          ascending = False)
rfr_names = [e[-10:] for e in rfr_top_encoding_data['Encoding Dataset'][0:5]]

#isolate top MPNR encoding datasets
mpnr_top_encoding_data = pd.read_csv(mpnr_encoding_dataset_path,index_col= 0)
mpnr_top_encoding_data = mpnr_top_encoding_data.sort_values(by = 'Test Set R Squared',
                                                          ascending = False)
mpnr_names = [e[-10:] for e in mpnr_top_encoding_data['Encoding Dataset'][0:5]]

#isolate top KNR encoding datasets
knr_top_encoding_data = pd.read_csv(knr_encoding_dataset_path,index_col= 0)
knr_top_encoding_data = knr_top_encoding_data.sort_values(by = 'Test Set R Squared',
                                                          ascending = False)
knr_names = [e[-10:] for e in knr_top_encoding_data['Encoding Dataset'][0:5]]

dict_of_names = {'RFR': rfr_names,
                'MPNR': mpnr_names,
                'KNR':knr_names}

###Possible tests:
assert list(knr_top_encoding_data.columns) == ['Encoding Dataset', 'Test Set R Squared']
assert np.shape(knr_names) == (5,)
assert len(dict_of_names.keys()) == 3
assert knr_top_encoding_data['Test Set R Squared'].iloc[0]>=knr_top_encoding_data['Test Set R Squared'].iloc[1]

In [22]:
ensemble_df = pd.DataFrame()
r2_list = []
column_list = []

for key in dict_of_names.keys():
    #initialize list of predictions
    predictions_list = []
    for predictions_file in dict_of_names[key]:
        #read in and sort data
        data = pd.read_csv(parent_folder_path+key+'/'+predictions_file+'_test_set_predictions.csv',index_col = 0)
        predictions_list.append(list(data['Predicted']))


    #match the predictions_notebook_with_prediction_outputs &
    #save to dataframe
    keys_df = pd.DataFrame({key+'_'+dict_of_names[key][0]: predictions_list[0],
                            key+'_'+dict_of_names[key][1]: predictions_list[1],
                            key+'_'+dict_of_names[key][2]: predictions_list[2],
                            key+'_'+dict_of_names[key][3]: predictions_list[3],
                            key+'_'+dict_of_names[key][4]: predictions_list[4]})

    #save to exterior dataframe for full ensemble
    ensemble_df = pd.concat([ensemble_df, keys_df], axis =1)

    #derive model specific metrics for export
    data_columns = list(keys_df.columns)
    keys_df[key+' Average Prediction'] = [np.mean(keys_df.iloc[e]) for e in range(len(keys_df))]
    keys_df['True'] = list(data['True'])

    for column in keys_df.columns[:-1]:
      x = keys_df[column]
      y = keys_df['True']
      column_list.append(column)
      r2_list.append(sklearn.metrics.r2_score(x, y))


#derive model specific metrics for export
x = [np.mean(ensemble_df.iloc[e]) for e in range(len(ensemble_df))]
y = list(data['True'])
column_list.append('Ensemble')
r2_list.append(sklearn.metrics.r2_score(x, y))

r2_df = pd.DataFrame({'Regressor + Encoding Dataset': column_list,
                      'R2 Performance Score': r2_list})

r2_df.to_csv(parent_folder_path+'Cross_Validation_Scores.csv')


#TESTS
assert len(r2_df) == 19
assert type(r2_df['Regressor + Encoding Dataset'].iloc[0]) == str
assert type(r2_df['R2 Performance Score'].iloc[0]) == np.float64

   Regressor + Encoding Dataset  R2 Performance Score
0                RFR_BASU050103              0.857457
1                RFR_HUTJ700101              0.828743
2                RFR_ISOY800107              0.800169
3                RFR_GEIM800104              0.839010
4                RFR_ZIMJ680103              0.847357
5        RFR Average Prediction              0.850776
6               MPNR_BASU050103              0.620961
7               MPNR_HUTJ700101              0.622274
8               MPNR_ISOY800107              0.625398
9               MPNR_GEIM800104              0.579831
10              MPNR_ZIMJ680103              0.466075
11      MPNR Average Prediction              0.617294
12               KNR_BASU050103              0.621664
13               KNR_HUTJ700101              0.678125
14               KNR_ISOY800107              0.606756
15               KNR_GEIM800104              0.587165
16               KNR_ZIMJ680103              0.554200
17       KNR Average Predict