In [1]:
# MAIN PACKAGES
import os

import pandas as pd
import numpy as np
import sklearn
from tqdm import tqdm

# MODEL VALIDATION
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

#MPNR SPECIFIC
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.preprocessing import RobustScaler



In [4]:
# Colab only: attach Google Drive so the dataset paths under
# /content/drive/MyDrive/... used by this notebook can be read and written.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


This notebook trains and tests a model on each amino acid property encoding to find the best-performing property matrices. These matrices will be used to generate final predictions on the withheld test set and the novel mutant library.


Inputs:
1.   Combined Dataset Path (String)
2.   Desired Name of Dependent Variable (String)
3.   Length of the Base Variant Sequence, i.e. len(sequence) (int)
4.   Property Matrix Path (String)

Outputs:
`df_MPNR`
*   DataFrame with 2 columns: `Encoding Dataset` and `Test Set R Squared`
*   Saved as MPNR_Encoding_Datasets.csv in the MPNR/ subfolder of the run's parent folder


In [2]:
# ---- Run configuration ---------------------------------------------------

# Input files
mutation_datset_path = '/content/drive/MyDrive/ml_paper_ipnys/backend_data/combined_dataset.csv'
encoding_dataset_path = '/content/drive/MyDrive/ml_paper_ipnys/backend_data/full_property_matrix.csv'

# Dataset description
dependent_variable = 'Fluor Decay'  # name of the column used as the regression target
len_sequence_of_base_variant = 451  # number of sequence positions (columns "0".."450")

# Output location; results are written beneath this folder
parent_folder_path = '2023-07-22-ensemble-run/'  # should be created in wrapper

In [7]:
# Load the combined mutation dataset. The per-position sequence columns are
# named by their position as a string: "0", "1", ..., "<length - 1>".
data = pd.read_csv(mutation_datset_path)
position_cols = [str(position) for position in range(len_sequence_of_base_variant)]
encoded_df = data[position_cols]

# Hold out 20% of the variants as a test set; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    encoded_df,
    data[dependent_variable],
    test_size=0.20,
    random_state=42,
)

In [8]:
# Load the amino-acid property matrix. The first CSV column becomes the index;
# of the remaining columns, the first holds the amino acid codes and every
# later one is a single property scale (one candidate encoding).
encoding_data = pd.read_csv(filepath_or_buffer=encoding_dataset_path, index_col=0)

In [11]:
# ---------------------------------------------------------------------------
# Encoding sweep
#
# For every complete (NaN-free) amino-acid property scale: translate the
# sequences into that scale, rank the positions by mutual information with the
# target, train an MLPRegressor on the top-k positions for k = 1..24 and keep
# the best test-set R^2. The scores are collected in df_MPNR and saved to CSV.
# ---------------------------------------------------------------------------

SEED = 42                    # shared by feature scoring and every MLPRegressor
MAX_FEATURES_TO_TRY = 24     # feature counts 1..24 are evaluated per encoding
EXPECTED_N_ENCODINGS = 553   # sanity check: number of encodings expected to be scored


def _seeded_mutual_info(features, target):
    """Score features with mutual information using a fixed seed.

    mutual_info_regression perturbs continuous features with random noise, so
    without a seed the feature ranking can differ between runs.
    """
    return mutual_info_regression(features, target, random_state=SEED)


def _encode_sequences(sequence_df, codes, values):
    """Return a copy of `sequence_df` with amino acid codes replaced by floats.

    Parameters
    ----------
    sequence_df : DataFrame whose cells hold amino acid codes.
    codes : iterable of amino acid codes (the codebook keys).
    values : iterable of property values, aligned with `codes`.

    All substitutions happen in one `replace` call, so a value written for one
    code can never be re-matched as a different code, and the frame is only
    scanned once per encoding instead of once per amino acid.
    """
    codebook = {}
    for code, value in zip(codes, values):
        # First occurrence wins if a code is listed more than once.
        codebook.setdefault(code, float(value))
    return sequence_df.replace(codebook)


def _best_test_r2(x_train_enc, x_test_enc, y_train, y_test):
    """Return the best test-set R^2 over feature counts 1..MAX_FEATURES_TO_TRY.

    NOTE(review): the feature count is chosen on the same test split that the
    returned score is computed on, so the score is optimistically biased.
    Consider selecting the count on a validation split or by cross-validation.
    """
    # Fit the scaler on the training rows only: fitting on the full dataset
    # would leak test-set statistics into the preprocessing.
    scaler = RobustScaler().fit(x_train_enc)
    feature_names = x_train_enc.columns
    x_train_scaled = pd.DataFrame(scaler.transform(x_train_enc), columns=feature_names)
    x_test_scaled = pd.DataFrame(scaler.transform(x_test_enc), columns=feature_names)

    # Rank every position by mutual information with the target (k='all' keeps
    # all features; SelectKBest is used only to obtain the scores).
    selector = SelectKBest(score_func=_seeded_mutual_info, k='all')
    selector.fit(x_train_scaled, y_train)
    ranked_features = (
        pd.Series(selector.scores_, index=feature_names)
        .nlargest(MAX_FEATURES_TO_TRY)
        .index
        .tolist()
    )

    # Grid search over the number of top-ranked features.
    test_r2s = []
    for n_features in range(1, MAX_FEATURES_TO_TRY + 1):
        chosen = ranked_features[:n_features]
        model = MLPRegressor(random_state=SEED)
        model.fit(x_train_scaled[chosen], y_train)
        test_r2s.append(r2_score(y_test, model.predict(x_test_scaled[chosen])))

    # Refitting at the best feature count with the same seed and data would
    # reproduce this same score, so the maximum is returned directly.
    return max(test_r2s)


# The first column of the property matrix holds the amino acid codes; every
# later column is one candidate encoding.
amino_acid_codes = encoding_data[encoding_data.columns[0]]

# Collect one record per encoding and build the frame once at the end
# (growing a DataFrame with concat inside the loop is quadratic).
records = []

for AA_property_dataset in tqdm(encoding_data.columns[1:],
                                desc = 'Property Datasets Encoded:'):

    property_values = encoding_data[AA_property_dataset]

    # Some encoding datasets contain NaNs; skip them since they are incomplete.
    if property_values.isna().any():
        continue

    # `replace` returns new frames, so x_train / x_test are never modified.
    x_train_enc = _encode_sequences(x_train, amino_acid_codes, property_values)
    x_test_enc = _encode_sequences(x_test, amino_acid_codes, property_values)

    r2 = _best_test_r2(x_train_enc, x_test_enc, y_train, y_test)

    # Only the last 11 characters of the column name are stored
    # (presumably the property identifier -- TODO confirm).
    records.append({'Encoding Dataset': AA_property_dataset[-11:],
                    'Test Set R Squared': r2})

df_MPNR = pd.DataFrame(records, columns=['Encoding Dataset', 'Test Set R Squared'])
df_MPNR = df_MPNR.sort_values(by='Test Set R Squared', ascending = False)

# Save before validating so a failed sanity check does not discard a long run.
# The target folder is created if the wrapper has not made it yet.
output_path = parent_folder_path +'MPNR/MPNR_Encoding_Datasets.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_MPNR.to_csv(output_path)

assert len(df_MPNR) == EXPECTED_N_ENCODINGS, (
    f'expected {EXPECTED_N_ENCODINGS} scored encodings, got {len(df_MPNR)}'
)
assert df_MPNR['Test Set R Squared'].iloc[0] >= df_MPNR['Test Set R Squared'].iloc[1], (
    'df_MPNR is not sorted by descending test-set R squared'
)

Property Datasets Encoded:: 100%|██████████| 566/566 [3:35:06<00:00, 22.80s/it]


In [None]:
## FOR COLAB ONLY
# Write a second copy of the ranked encoding results straight to Google Drive.
colab_results_path = '/content/drive/MyDrive/ml_paper_ipnys/MPNR_Encoding_Datasets.csv'
df_MPNR.to_csv(colab_results_path)