In [1]:
import pandas as pd
import numpy as np

from scipy import stats


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This notebook reads in all of the novel library predictions and averages them to find the final ensemble predictions

Inputs:
1.   RFR_encoding_dataset_path (String)
2.   KNR_encoding_dataset_path (String)
3.   MPNR_encoding_dataset_path (String)

Outputs:
ensemble_df
*   18 columned Dataframe, with a number of rows equivalent to the length of the novel variant library
*   Saved as Emsemble_in_silico_predictions in the parent folder

In [3]:
#COLAB ONLY
parent_folder_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/'

len_sequence_of_base_variant = 451 # len(Sequence of Base Variant)
dependent_variable = 'Fluor Decay' # Desired Name of Dependent Variable
comparison_variant = 'jGCaMP7s' #Variant For Optimization Name
rfr_encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/KNR/KNR_Encoding_Datasets.csv'
mpnr_encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/KNR/KNR_Encoding_Datasets.csv'
knr_encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/ensembling_folder/KNR/KNR_Encoding_Datasets.csv'

In [9]:
#isolate top RFR encoding datasets
rfr_top_encoding_data = pd.read_csv(rfr_encoding_dataset_path,index_col= 0)
rfr_top_encoding_data = rfr_top_encoding_data.sort_values(by = 'Test Set R Squared',
                                                          ascending = False)
rfr_names = [e[-10:] for e in rfr_top_encoding_data['Encoding Dataset'][0:5]]

#isolate top MPNR encoding datasets
mpnr_top_encoding_data = pd.read_csv(mpnr_encoding_dataset_path,index_col= 0)
mpnr_top_encoding_data = mpnr_top_encoding_data.sort_values(by = 'Test Set R Squared',
                                                          ascending = False)
mpnr_names = [e[-10:] for e in mpnr_top_encoding_data['Encoding Dataset'][0:5]]

#isolate top KNR encoding datasets
knr_top_encoding_data = pd.read_csv(knr_encoding_dataset_path,index_col= 0)
knr_top_encoding_data = knr_top_encoding_data.sort_values(by = 'Test Set R Squared',
                                                          ascending = False)
knr_names = [e[-10:] for e in knr_top_encoding_data['Encoding Dataset'][0:5]]

dict_of_names = {'RFR': rfr_names,
                'MPNR': mpnr_names,
                'KNR':knr_names}

###Possible tests:
assert list(knr_top_encoding_data.columns) == ['Encoding Dataset', 'Test Set R Squared']
assert np.shape(knr_names) == (5,)
assert len(dict_of_names.keys()) == 3
assert knr_top_encoding_data['Test Set R Squared'].iloc[0]>=knr_top_encoding_data['Test Set R Squared'].iloc[1]

In [16]:
ensemble_df = pd.DataFrame()

for key in dict_of_names.keys():
    #initialize list of predictions
    predictions_list = []
    for predictions_file in dict_of_names[key]:
        #read in and sort data
        data = pd.read_csv(parent_folder_path+key+'/'+predictions_file+'_novel_library_predictions.csv',index_col = 0)
        data = data.sort_values(by = 'mutant_name')

        #one last duplicate values check
        index_cols = [str(e) for e in np.arange(0,len_sequence_of_base_variant)]
        short_data = data[index_cols]
        cleaned_data = data[short_data.duplicated() == False]
        cleaned_data = cleaned_data.sort_values(by = 'mutant_name', ascending = False)

        predictions_list.append(list(cleaned_data[dependent_variable +' Predicted']))


    #match the predictions_notebook_with_prediction_outputs &
    #save to dataframe
    keys_df = pd.DataFrame({key+'_'+dict_of_names[key][0]: predictions_list[0],
                            key+'_'+dict_of_names[key][1]: predictions_list[1],
                            key+'_'+dict_of_names[key][2]: predictions_list[2],
                            key+'_'+dict_of_names[key][3]: predictions_list[3],
                            key+'_'+dict_of_names[key][4]: predictions_list[4]})

    #save to exterior dataframe for full ensemble
    ensemble_df = pd.concat([ensemble_df, keys_df], axis =1)

    #derive model specific metrics for export
    data_columns = list(keys_df.columns)
    keys_df['Average Prediction'] = [np.mean(keys_df.iloc[e]) for e in range(len(keys_df))]
    keys_df['Mutation'] = list(cleaned_data['mutant_name'])
    comparison_list = list(keys_df[keys_df['Mutation'] == comparison_variant][data_columns].values[0])
    keys_df['P-Values'] = [stats.ttest_ind(comparison_list,list(keys_df[data_columns].iloc[e].values)).pvalue for e in range(len(keys_df))]

    #save dataframe for each model type
    keys_df = keys_df.sort_values(by = 'Average Prediction')
    keys_df.to_csv(parent_folder_path+key+'_in_silico_predictions_.csv')

#Derive ensemble metrics such as mean prediction and get pvalues
#compared to predictions for base construct
data_columns = list(ensemble_df.columns)
ensemble_df['Average Prediction'] = [np.mean(ensemble_df.iloc[e]) for e in range(len(ensemble_df))]
ensemble_df['Mutation'] = list(cleaned_data['mutant_name'])
comparison_list = list(ensemble_df[ensemble_df['Mutation'] == comparison_variant][data_columns].values[0])
ensemble_df['P-Values'] = [stats.ttest_ind(comparison_list,list(ensemble_df[data_columns].iloc[e].values)).pvalue for e in range(len(ensemble_df))]

#export data to csv for downstream usage
ensemble_df = ensemble_df.sort_values(by = 'Average Prediction')
ensemble_df.to_csv(parent_folder_path+'Ensemble_insilico_predictions_.csv')

ensemble_df

###Possible tests:
assert len(keys_df.columns) == 8
assert len(ensemble_df.columns) == 18
assert ensemble_df[ensemble_df.columns[0:15]].iloc[0].mean() == ensemble_df['Average Prediction'].iloc[0]


Unnamed: 0,RFR_BASU050103,RFR_HUTJ700101,RFR_ISOY800107,RFR_GEIM800104,RFR_ZIMJ680103,MPNR_BASU050103,MPNR_HUTJ700101,MPNR_ISOY800107,MPNR_GEIM800104,MPNR_ZIMJ680103,KNR_BASU050103,KNR_HUTJ700101,KNR_ISOY800107,KNR_GEIM800104,KNR_ZIMJ680103,Average Prediction,Mutation,P-Values
293,0.755370,0.914070,0.797874,0.977849,0.994023,0.696782,-10.593789,0.700212,-1.054570,0.081846,0.33,0.356322,0.278345,1.112857,0.248644,-0.226944,jGCaMP7s R381M,0.289091
285,0.755370,0.914070,0.797874,0.977849,0.994023,0.651767,-8.394299,0.489021,-0.943936,0.110084,0.33,0.312500,0.248644,0.457931,0.248644,-0.136697,jGCaMP7s R381Y,0.290127
286,0.755370,0.914070,0.797874,0.977849,0.994023,0.669774,-9.054778,0.339786,-1.015502,0.228306,0.33,0.312500,0.682000,1.112857,0.248644,-0.113815,jGCaMP7s R381W,0.308134
294,0.755370,0.914070,0.797874,0.977849,0.994023,0.602569,-8.002181,0.427370,-1.015502,0.044437,0.33,0.312500,0.248644,1.112857,0.248644,-0.083432,jGCaMP7s R381L,0.305702
299,0.755370,0.914070,0.797874,0.977849,0.994023,0.627539,-8.054110,0.593126,-0.985060,0.042466,0.33,0.312500,0.248644,1.112857,0.248644,-0.072281,jGCaMP7s R381F,0.310674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1420,0.965737,1.035065,0.968000,0.977849,1.504600,0.068117,-6.321201,-0.485907,-1.757471,31.159523,1.03,1.922500,0.682000,0.687143,1.300000,2.249064,jGCaMP7s A390E,0.657120
1421,0.965737,1.035065,0.968000,0.977849,1.497600,0.318286,-6.143229,-0.110614,-0.778847,31.147745,1.03,0.620546,0.682000,1.484286,1.300000,2.332962,jGCaMP7s A390D,0.630343
1409,0.965737,1.035065,0.968000,1.103515,1.589200,0.265121,-5.627086,-0.579696,-0.657028,31.294495,1.03,0.620546,0.682000,1.484286,1.300000,2.364944,jGCaMP7s A390R,0.621139
1415,0.965737,1.420000,1.189000,0.977849,1.467600,0.244716,-6.983726,0.353337,-1.116718,31.135967,1.03,2.435000,1.440000,0.687143,1.300000,2.436394,jGCaMP7s A390K,0.601687
