In [15]:
# Running this cell imports the libraries needed to run the pipeline.


# MAIN PACKAGES
import pandas as pd    
import numpy as np
import sklearn
from tqdm import tqdm

# MODEL VALIDATION 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

#RFR SPECIFIC
from sklearn.ensemble import RandomForestRegressor


def dataset_encoding(encoding_data, AA_property_dataset, x_train, x_test, x_data):
    """Encode amino-acid codes in the feature tables with one numeric property.

    Every cell of ``x_train``, ``x_test`` and ``x_data`` that equals an entry of
    the 'Amino Acid Code' column is replaced by that amino acid's value in the
    ``AA_property_dataset`` column. Encoding only happens when the property has
    a value for every amino acid; otherwise unmodified copies are returned.

    Parameters
    ----------
    encoding_data : pandas.DataFrame
        Property matrix containing an 'Amino Acid Code' column and one column
        per property.
    AA_property_dataset : str
        Name of the property column of ``encoding_data`` to encode with.
    x_train, x_test, x_data : pandas.DataFrame
        Feature tables holding amino-acid codes. They are never modified.

    Returns
    -------
    tuple
        ``(is_complete, property_encoding_data, encoded_x_train,
        encoded_x_test, encoded_x_data)`` where

        * ``is_complete`` (bool) is True when the property has no missing
          values (and therefore the three tables were encoded),
        * ``property_encoding_data`` is a two-column DataFrame: the amino-acid
          codes and the property values, the latter labelled with the last
          10 characters of ``AA_property_dataset``,
        * the remaining items are the encoded (or, when ``is_complete`` is
          False, merely copied) feature tables.
    """
    # Short label for the property column: the last 10 characters of its name.
    AA_property_id = AA_property_dataset[-10:]

    property_encoding_data = pd.DataFrame(
        {'Amino Acid Code': encoding_data['Amino Acid Code'],
         AA_property_id: encoding_data[AA_property_dataset]})
    property_values = property_encoding_data[AA_property_id]

    # pd.isna copes with object-dtype columns (e.g. containing None), where
    # np.isnan would raise TypeError instead of reporting a missing value.
    is_complete = not bool(pd.isna(property_values).any())

    if not is_complete:
        return (is_complete, property_encoding_data,
                x_train.copy(), x_test.copy(), x_data.copy())

    # Build the code -> value lookup once. setdefault keeps the first value
    # when a code is listed twice, matching a row-by-row sequential replace.
    replacement_for = {}
    for amino, value in zip(property_encoding_data['Amino Acid Code'], property_values):
        replacement_for.setdefault(amino, float(value))

    def _encode(frame):
        # One pass over the frame instead of one full scan per amino acid.
        return frame.replace(replacement_for) if replacement_for else frame.copy()

    return (is_complete, property_encoding_data,
            _encode(x_train), _encode(x_test), _encode(x_data))

In [18]:
#parameters
# User-tunable inputs for the pipeline; the trailing "#@param" markers are
# form annotations and are kept as-is.
variant_dataset_path = 'combined_dataset.csv' #@param {type:"string"}
encoding_dataset_path = 'full_property_matrix.csv' #@param {type:"string"}
# TODO: this assignment had no value (a SyntaxError that stopped the whole cell
# from running). Fill in the novel-library CSV path before any step that reads it.
novel_library_path = '' #@param {type:"string"}
# Feature columns are named by position: '0', '1', ..., '450'.
independent_variable_columns = [str(i) for i in range(451)] #@param {type:"list"}
dependent_variable_columns = '1 AP ∆F/F0' #@param {type:"string"}
chosen_random_state = 42 #@param {type:"raw"}


In [19]:
# Load the inputs and hold out a test set
# Both CSVs use their first column as the row index.
data = pd.read_csv(variant_dataset_path, index_col=0)
encoding_data = pd.read_csv(encoding_dataset_path, index_col=0)

# X = the configured feature columns, Y = the configured target column
x_data = data[independent_variable_columns]
y_data = data[dependent_variable_columns]

# Seeded split: 20% of the rows go to the test set
split_kwargs = dict(test_size=0.20, random_state=chosen_random_state)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, **split_kwargs)


In [None]:
#RFR
# For every property column of encoding_data (all but the first column):
#   1. encode the train/test features with that property (dataset_encoding),
#   2. rank features by the importances of a random forest fit on all features,
#   3. for each candidate feature count, fit forests over a grid of
#      n_estimators and score them on the test split,
#   4. refit with the winning feature count / n_estimators and record metrics.
# The per-property metrics are then sorted, written to CSV and previewed.
#
# NOTE(review): the feature count and n_estimators are selected using the test
# split, so the reported test-set scores are optimistic - consider tuning on a
# separate validation split.

# Candidate forest sizes and feature counts explored for every property.
n_estimators = [10, 15, 20, 25, 30, 35, 40,
                45, 50, 55, 65, 75, 85, 100]
feature_counts = range(1, 21)

# One dict per successfully encoded property; turned into a DataFrame once at
# the end (DataFrame.append no longer exists in pandas >= 2.0).
RFR_results = []

for AA_property_dataset in tqdm(encoding_data.columns[1:], desc = 'Property Datasets Encoded:'):

    # dataset encoding, function definition found in set-up cell
    check, property_encoding_data, encoded_x_train, encoded_x_test, encoded_x_data = dataset_encoding(
        encoding_data, AA_property_dataset, x_train, x_test, x_data)

    # skip properties that have NaNs in the encoding dataset
    if not check:
        continue

    # rank features with a forest trained on every encoded column
    model = RandomForestRegressor(random_state = chosen_random_state)
    model.fit(encoded_x_train, y_train)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=encoded_x_train.columns)

    # n_est[i] / test_r2s[i] hold the result for feature_counts[i] features
    n_est = []
    test_r2s = []

    for n_features in feature_counts:
        # isolate the n_features greatest feature importances
        cols = list(feat_importances.nlargest(n_features).index)
        x_train_copy_ = encoded_x_train[cols]
        x_test_copy_ = encoded_x_test[cols]

        n_mse_list = []
        n_r2_list = []

        # score every forest size on this feature subset
        for estimators in n_estimators:
            clf_RF = RandomForestRegressor(n_estimators=estimators,
                                           random_state=chosen_random_state)
            clf_RF.fit(x_train_copy_, y_train)
            y_RF = clf_RF.predict(x_test_copy_)
            n_mse_list.append(mean_squared_error(y_test, y_RF))
            n_r2_list.append(r2_score(y_test, y_RF))

        # best forest size = lowest test MSE; the feature count itself is
        # scored by the mean test R^2 over the whole n_estimators grid
        best_est = n_estimators[n_mse_list.index(min(n_mse_list))]
        n_est.append(best_est)
        test_r2s.append(np.mean(n_r2_list))

    # Translate the winning list position back into a feature count. Using the
    # 0-based position directly would select one feature too few (and zero
    # features, which cannot be fit, when the 1-feature model wins).
    best_position = test_r2s.index(max(test_r2s))
    best_feats = feature_counts[best_position]
    est_best = n_est[best_position]

    # isolate important features in both train/test sets
    cols = list(feat_importances.nlargest(best_feats).index)
    encoded_x_train = encoded_x_train[cols]
    encoded_x_test = encoded_x_test[cols]

    # model initialization with best parameters, fit on the selected features
    rfr = RandomForestRegressor(n_estimators = est_best, random_state = chosen_random_state)
    rfr.fit(encoded_x_train, y_train)

    # predictions on test and train sets
    y_test_predicted = rfr.predict(encoded_x_test)
    y_train_predicted = rfr.predict(encoded_x_train)

    # output metrics (r_squared_validation is the R^2 on the training set)
    r_squared = np.round(r2_score(y_test, y_test_predicted),2)
    r_squared_validation = np.round(r2_score(y_train, y_train_predicted),2)
    mse_ = np.round(mean_squared_error(y_test, y_test_predicted),2)

    # Column keys (including the 'Sqaured' spelling) are kept unchanged because
    # they are the header of the CSV written below.
    RFR_results.append({'Dataset Name': property_encoding_data.columns[1],
                        'Test Set R Sqaured': r_squared,
                        'Train Set R Squared': r_squared_validation,
                        'MSE': mse_})

# Explicit columns keep the frame sortable even if no property was usable.
RFR_output_df = pd.DataFrame(RFR_results,
                             columns=['Dataset Name', 'Test Set R Sqaured',
                                      'Train Set R Squared', 'MSE'])
RFR_output_df = RFR_output_df.sort_values(by = 'Test Set R Sqaured', ascending = False)

RFR_encoded_data_path = 'Random_Forest_Regressor_encoding_dataset_performance.csv'

RFR_output_df.to_csv(RFR_encoded_data_path)

print('File Saved to: ' + RFR_encoded_data_path)

RFR_output_df.head(5)

Property Datasets Encoded::  19%|█▊        | 105/566 [47:21<3:24:29, 26.62s/it]