In [218]:
import tqdm
import multiprocessing
import pandas as pd
import numpy as np
import scipy.stats

from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [28]:
def load_representation(multi_col_representation_vector_file_path):
    """Load a multi-column representation CSV as one vector per identifier.

    The CSV is read with ``pd.read_csv``. It must contain a ``PDB_ID`` column,
    and every column after the first one is treated as a vector component.

    Parameters
    ----------
    multi_col_representation_vector_file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``PDB_ID`` (identifier copied from the CSV) and ``Vector``
        (a Python list of floats holding that row's components), one row per
        CSV row, in file order.

    Raises
    ------
    KeyError
        If the CSV has no ``PDB_ID`` column.
    """
    multi_col_representation_vector = pd.read_csv(multi_col_representation_vector_file_path)
    # Everything after the first column is a vector component.
    component_values = multi_col_representation_vector.iloc[:, 1:]

    # Collect all vectors first and build the frame once; enlarging a DataFrame
    # row by row with .loc re-allocates on every insert.
    vectors = [
        [float(item) for item in components]
        for components in tqdm.tqdm(
            component_values.itertuples(index=False, name=None),
            total=len(component_values),
        )
    ]
    pdb_ids = multi_col_representation_vector['PDB_ID'].tolist()

    return pd.DataFrame(
        {
            'PDB_ID': pd.Series(pdb_ids, dtype='object'),
            'Vector': pd.Series(vectors, dtype='object'),
        }
    )

In [6]:
# Input file with protein pairs and their affinity values; it is read as a
# headerless tab-separated table in the next cell.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing it.
ppi_affinity_file = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/generalized_representation_benchmark/DATA/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"

In [11]:
# Tab-separated file without a header row, so pandas assigns integer column labels.
ppi_affinity_df = pd.read_csv(ppi_affinity_file,sep="\t",header=None)

In [13]:
# Give the three positional columns meaningful names (mutates ppi_affinity_df in place).
ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']

In [14]:
# Inspect the labelled affinity table.
ppi_affinity_df

Unnamed: 0,Protein1,Protein2,Affinity
0,1A22_A_wt,1A22_B_CB308A_,-12.293700
1,1A22_A_wt,1A22_B_CB322A_,-12.293700
2,1A22_A_wt,1A22_B_DB319A_EB320A_KB321A_,-12.604200
3,1A22_A_wt,1A22_B_DB326A_,-11.680933
4,1A22_A_wt,1A22_B_DB332A_,-11.728800
...,...,...,...
2945,4CPA_A_wt,4CPA_I_VI38I_,-11.832000
2946,4CPA_A_wt,4CPA_I_VI38L_,-10.752500
2947,4CPA_A_wt,4CPA_I_wt,-12.031300
2948,4CPA_A_wt,4CPA_I_YI37F_,-12.031300


In [29]:
# Directory that the representation CSV file names are appended to.
# The backslash at the end of the first line is inside the string literal, so
# the two source lines are joined into a single path with no newline in it.
# NOTE(review): absolute, machine-specific path.
skempi_vectors_path = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/generalized_representation_benchmark\
/DATA/representation_vectors/skempi/"

In [30]:
# Load the SeqVec representation file into a (PDB_ID, Vector) frame.
seqvec_skempi_vectors_df = load_representation(skempi_vectors_path+"skempi_seqvec_representation_multi_col.csv")

100%|██████████| 2882/2882 [00:05<00:00, 500.56it/s]


In [37]:
# Inspect the loaded representation vectors.
# (The previous `seqvec_skempi_vectors_df[]` had an empty subscript, which is a
# SyntaxError and stops a fresh Run All at this cell.)
seqvec_skempi_vectors_df

Unnamed: 0,PDB_ID,Vector
0,1CSE_E_wt,"[0.0179721936583518, 0.2265600860118866, -0.27..."
1,1CSE_I_wt,"[-0.0347585491836071, -0.0211586467921733, -0...."
2,1CSE_I_LI45G_,"[-0.0367474555969238, 0.0036459327675402, 0.01..."
3,1CSE_I_LI45S_,"[-0.040347870439291, 0.0095908511430025, 0.003..."
4,1CSE_I_LI45P_,"[-0.0497552566230297, 0.0098012648522853, 0.01..."
...,...,...
2877,1PPF_I_EI10D_AI15V_TI17S_EI19D_RI21M_KI29T_GI3...,"[0.0380711629986763, -0.0606547407805919, -0.0..."
2878,1PPF_I_EI10D_KI13R_AI15V_EI19D_RI21M_GI32S_EI4...,"[0.047196764498949, -0.0862331539392471, -0.00..."
2879,1PPF_I_EI10D_AI15V_EI19D_RI21M_GI32S_EI43D_LI4...,"[0.0282607041299343, -0.0843535810708999, -0.0..."
2880,1PPF_I_EI10D_LI18F_RI21M_EI43D_,"[0.0237759444862604, -0.0592682249844074, -0.0..."


## Calculate the element-wise product of each protein pair's vectors, as described in https://academic.oup.com/bioinformatics/article/35/14/i305/5529260

In [52]:
# Build one combined feature vector per protein pair: the element-wise product
# of the two proteins' representation vectors.

# Index the vectors by PDB_ID once instead of scanning the whole frame twice per
# pair. setdefault keeps the FIRST vector seen for an ID, matching the previous
# `[...][0]` lookup.
vector_by_pdb_id = {}
for pdb_id, vector in zip(seqvec_skempi_vectors_df['PDB_ID'],
                          seqvec_skempi_vectors_df['Vector']):
    vector_by_pdb_id.setdefault(pdb_id, vector)

# Collect plain records and create the frame once: DataFrame.append was removed
# in pandas 2.0, and it copied the whole frame on every call.
pair_records = []
for protein1, protein2 in tqdm.tqdm(
    zip(ppi_affinity_df['Protein1'], ppi_affinity_df['Protein2']),
    total=len(ppi_affinity_df),
):
    # A pair whose protein has no vector raises KeyError here (fail loudly).
    pair_records.append({
        'Protein1': protein1,
        'Protein2': protein2,
        'Vector': np.multiply(vector_by_pdb_id[protein1], vector_by_pdb_id[protein2]),
    })

# Explicit column list keeps the expected schema even when there are no pairs.
multiplied_vectors = pd.DataFrame(pair_records, columns=['Protein1', 'Protein2', 'Vector'])
    

2950it [00:07, 377.75it/s]


In [53]:
# Inspect the per-pair element-wise product vectors.
multiplied_vectors

Unnamed: 0,Protein1,Protein2,Vector
0,1A22_A_wt,1A22_B_CB308A_,"[0.003311354625695531, -0.0014524045381846507,..."
1,1A22_A_wt,1A22_B_CB322A_,"[0.0033570338621778852, -0.0015080873739210482..."
2,1A22_A_wt,1A22_B_DB319A_EB320A_KB321A_,"[0.003373698050940003, -0.0013151702023098653,..."
3,1A22_A_wt,1A22_B_DB326A_,"[0.003245372387730002, -0.0013071223072096108,..."
4,1A22_A_wt,1A22_B_DB332A_,"[0.003325633954721621, -0.0014811012496847274,..."
...,...,...,...
2945,4CPA_A_wt,4CPA_I_VI38I_,"[0.013766766144766565, 0.009967618119868227, -..."
2946,4CPA_A_wt,4CPA_I_VI38L_,"[0.01426457111607936, 0.00941914513678287, -0...."
2947,4CPA_A_wt,4CPA_I_wt,"[0.014388073737252038, 0.009558309155660128, -..."
2948,4CPA_A_wt,4CPA_I_YI37F_,"[0.01331511961455699, 0.009569607259185531, -0..."


In [179]:
def _regression_metrics(X, y_true, model):
    '''Score an already fitted model on (X, y_true).

    Returns a tuple (mse, mae, corr) where corr is the value returned by
    scipy.stats.pearsonr; callers read corr[0] as the coefficient and corr[1]
    as the p-value.
    '''
    predictions = model.predict(X)
    mse = mean_squared_error(y_true, predictions)
    mae = mean_absolute_error(y_true, predictions)
    corr = scipy.stats.pearsonr(y_true, predictions)
    return mse, mae, corr


def calc_train_error(X_train, y_train, model):
    '''returns in-sample error (mse, mae, pearsonr result) for already fit model.'''
    return _regression_metrics(X_train, y_train, model)


def calc_validation_error(X_test, y_test, model):
    '''returns out-of-sample error (mse, mae, pearsonr result) for already fit model.'''
    return _regression_metrics(X_test, y_test, model)


def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''fits model and returns the metrics for in-sample error and out-of-sample error

    The model is fitted in place on (X_train, y_train). The return order is
    (train_mse, val_mse, train_mae, val_mae, train_corr, val_corr).
    '''
    model.fit(X_train, y_train)
    train_mse_error, train_mae_error, train_corr = calc_train_error(X_train, y_train, model)
    val_mse_error, val_mae_error, val_corr = calc_validation_error(X_test, y_test, model)
    return train_mse_error, val_mse_error, train_mae_error, val_mae_error, train_corr, val_corr

In [259]:
# NOTE(review): looks like a leftover scratch cell -- `l` is not used by any
# other cell visible here, and the recorded output does not come from this
# statement alone. Candidate for deletion; confirm first.
l = [1,2,3]


[100, 200, 300]

In [260]:
def report_results(
    train_mse_error_list,
    validation_mse_error_list,
    train_mae_error_list,
    validation_mae_error_list,
    train_corr_list,
    validation_corr_list,
    train_corr_pval_list,
    validation_corr_pval_list,
):
    """Turn per-fold metric lists into a summary frame and a per-fold frame.

    Every argument is a sequence with one number per cross-validation fold.

    Returns
    -------
    (result_df, result_detail_df)
        ``result_df`` has a single row (index 0). MSE/MAE columns hold the mean
        and standard deviation over folds multiplied by 100; correlation and
        p-value columns hold the plain mean. All values are rounded to 4
        decimals.
        ``result_detail_df`` has one row per fold with every input value
        multiplied by 100. This includes the correlations and p-values, so they
        are on a different scale from the summary frame.
    """
    # (mean column, std column, per-fold values) -- reported as value * 100.
    error_groups = (
        ("train_mse_error", "train_mse_std", train_mse_error_list),
        ("val_mse_error", "val_mse_std", validation_mse_error_list),
        ("train_mae_error", "train_mae_std", train_mae_error_list),
        ("val_mae_error", "val_mae_std", validation_mae_error_list),
    )
    # (column, per-fold values) -- reported as the unscaled mean.
    correlation_groups = (
        ("train_corr", train_corr_list),
        ("train_corr_pval", train_corr_pval_list),
        ("validation_corr", validation_corr_list),
        ("validation_corr_pval", validation_corr_pval_list),
    )

    summary = {}
    for mean_column, std_column, fold_values in error_groups:
        summary[mean_column] = round(np.mean(fold_values) * 100, 4)
        summary[std_column] = round(np.std(fold_values) * 100, 4)
    for column, fold_values in correlation_groups:
        summary[column] = round(np.mean(fold_values), 4)
    result_df = pd.DataFrame(summary, index=[0])

    # Dict insertion order fixes the column order of the detail frame.
    detail_groups = (
        ("train_mse_errors", train_mse_error_list),
        ("val_mse_errors", validation_mse_error_list),
        ("train_mae_errors", train_mae_error_list),
        ("val_mae_errors", validation_mae_error_list),
        ("train_corrs", train_corr_list),
        ("train_corr_pvals", train_corr_pval_list),
        ("validation_corr", validation_corr_list),
        ("validation_corr_pval", validation_corr_pval_list),
    )
    fold_count = len(train_mse_error_list)
    result_detail_df = pd.DataFrame(
        {column: list(np.multiply(fold_values, 100)) for column, fold_values in detail_groups},
        index=range(fold_count),
    )
    return result_df, result_detail_df


In [223]:
def predictAffinityWithModel(regressor_model, n_splits=10):
    """Cross-validate a regressor on the pair vectors and report its metrics.

    Reads two notebook-level frames: ``multiplied_vectors`` (features, column
    ``Vector``) and ``ppi_affinity_df`` (target, column ``Affinity``). Rows are
    matched by position.

    Parameters
    ----------
    regressor_model : estimator with ``fit`` and ``predict``
        Fitted in place on every fold (it is not cloned), so after the call the
        object holds the fit from the last fold.
    n_splits : int, default 10
        Number of folds for the shuffled ``KFold`` split (previously a
        hard-coded constant inside the function).

    Returns
    -------
    (result_df, result_detail_df)
        The summary and per-fold frames produced by ``report_results``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One row per pair, one column per vector component.
    data = np.asarray(multiplied_vectors["Vector"].tolist(), dtype=float)
    target = np.array(ppi_affinity_df["Affinity"])

    # Rescale the target to [0, 1] so the reported errors are on that scale.
    # NOTE(review): the scaler is fitted on ALL targets before splitting, so the
    # validation folds contribute to the min/max used for scaling.
    scaler = MinMaxScaler()
    target = scaler.fit_transform(target.reshape(-1, 1))[:, 0]

    train_mse_error_list = []
    validation_mse_error_list = []
    train_mae_error_list = []
    validation_mae_error_list = []
    train_corr_list = []
    validation_corr_list = []
    train_corr_pval_list = []
    validation_corr_pval_list = []

    for train_index, val_index in tqdm.tqdm(kf.split(data, target), total=n_splits):
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = target[train_index], target[val_index]

        # calc_metrics refits the same estimator object on this fold.
        (
            train_mse_error,
            val_mse_error,
            train_mae_error,
            val_mae_error,
            train_corr,
            val_corr,
        ) = calc_metrics(X_train, y_train, X_val, y_val, regressor_model)

        train_mse_error_list.append(train_mse_error)
        validation_mse_error_list.append(val_mse_error)

        train_mae_error_list.append(train_mae_error)
        validation_mae_error_list.append(val_mae_error)

        # Correlation results are indexable: [0] coefficient, [1] p-value.
        train_corr_list.append(train_corr[0])
        validation_corr_list.append(val_corr[0])

        train_corr_pval_list.append(train_corr[1])
        validation_corr_pval_list.append(val_corr[1])

    return report_results(
        train_mse_error_list,
        validation_mse_error_list,
        train_mae_error_list,
        validation_mae_error_list,
        train_corr_list,
        validation_corr_list,
        train_corr_pval_list,
        validation_corr_pval_list,
    )

In [262]:
# Baseline: Bayesian ridge regression with its default settings.
model = linear_model.BayesianRidge()
# Keep both frames so the summary and the per-fold details can be shown separately.
result_df, result_detail_df = predictAffinityWithModel(model)

100%|██████████| 10/10 [00:10<00:00,  1.06s/it]


In [263]:
# Summary over folds: MSE/MAE columns are scaled by 100, correlations are not.
result_df

Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.3286,0.0058,0.5331,0.0521,4.1118,0.036,5.2423,0.1832,0.928,0.0,0.8801,0.0


In [264]:
# Per-fold values; every column here (including correlations and p-values) is multiplied by 100.
result_detail_df

Unnamed: 0,train_mse_errors,val_mse_errors,train_mae_errors,val_mae_errors,train_corrs,train_corr_pvals,validation_corr,validation_corr_pval
0,0.333165,0.485027,4.132699,5.18361,92.750737,0.0,88.391539,1.0169390000000001e-96
1,0.324472,0.589129,4.078385,5.630844,92.868425,0.0,87.198465,6.81297e-91
2,0.334987,0.538995,4.160538,5.09217,92.649187,0.0,88.129747,2.1827250000000002e-95
3,0.327073,0.600896,4.105946,5.296863,92.778482,0.0,87.240475,4.3478319999999996e-91
4,0.319048,0.511827,4.058155,5.274798,93.02417,0.0,88.379917,1.167054e-96
5,0.329854,0.48296,4.115488,5.11427,92.832274,0.0,88.198844,9.785501e-96
6,0.318998,0.587326,4.055967,5.296176,92.981832,0.0,87.537286,1.738127e-92
7,0.332031,0.58666,4.119576,5.459542,92.750612,0.0,86.167846,2.5922269999999998e-86
8,0.335754,0.498666,4.163789,5.082051,92.621587,0.0,89.123447,1.279193e-100
9,0.33082,0.449098,4.127821,4.992459,92.771965,0.0,89.780868,2.278934e-104


In [265]:
# Random forest: 100 trees, max_depth left at its default, one job per CPU core.
model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=multiprocessing.cpu_count(),
    random_state=42,
)
# Keep both frames so the per-fold details can be shown in the next cell.
result_df, result_detail_df = predictAffinityWithModel(model)

100%|██████████| 10/10 [01:51<00:00, 11.12s/it]


In [266]:
# Per-fold values for the random forest run; every column is multiplied by 100.
result_detail_df

Unnamed: 0,train_mse_errors,val_mse_errors,train_mae_errors,val_mae_errors,train_corrs,train_corr_pvals,validation_corr,validation_corr_pval
0,0.067971,0.512242,1.866562,5.102455,98.824453,0.0,87.71258,2.496942e-93
1,0.068787,0.467739,1.87158,5.115102,98.809207,0.0,90.047569,5.797973e-106
2,0.069194,0.464029,1.873202,4.719238,98.79628,0.0,89.758201,3.098898e-104
3,0.068875,0.460714,1.886,4.812984,98.80024,0.0,90.862625,3.93908e-111
4,0.067144,0.452343,1.847921,5.028294,98.829882,0.0,90.306151,1.491504e-107
5,0.072835,0.397109,1.913236,4.638443,98.72621,0.0,90.613376,1.681595e-109
6,0.069203,0.519627,1.893756,5.041976,98.782777,0.0,89.308607,1.190835e-101
7,0.069757,0.528834,1.873558,5.334034,98.770443,0.0,88.023042,7.461082e-95
8,0.069177,0.455742,1.879444,4.983926,98.790709,0.0,90.189768,7.844587e-107
9,0.067444,0.522798,1.851184,5.237294,98.825737,0.0,88.434159,6.1299e-97


In [222]:
## Summary metrics for K = 10 folds

In [224]:
# Baseline: Bayesian ridge regression with its default settings.
model = linear_model.BayesianRidge()
# Last expression of the cell, so the returned (summary, per-fold) pair is displayed.
predictAffinityWithModel(model)

100%|██████████| 10/10 [00:09<00:00,  1.04it/s]


Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.3286,0.0058,0.5331,0.0521,4.1118,0.036,5.2423,0.1832,0.928,0.0,0.8801,0.0


In [248]:
# Random forest: 100 trees, max_depth left at its default, one job per CPU core.
model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=multiprocessing.cpu_count(),
    random_state=42,
)
# Last expression of the cell, so the returned (summary, per-fold) pair is displayed.
predictAffinityWithModel(model)

100%|██████████| 10/10 [01:51<00:00, 11.10s/it]


Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.069,0.0015,0.4781,0.0397,1.8756,0.0183,5.0014,0.2095,0.988,0.0,0.8953,0.0


In [246]:
# Smaller forest: 50 trees, depth capped at 15.
model = RandomForestRegressor(
    n_estimators=50,
    n_jobs=multiprocessing.cpu_count(),
    random_state=42,
    max_depth=15,
)
# Last expression of the cell, so the returned (summary, per-fold) pair is displayed.
predictAffinityWithModel(model)

100%|██████████| 10/10 [00:51<00:00,  5.19s/it]


Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.0851,0.0022,0.4954,0.0404,2.1359,0.0249,5.0913,0.2176,0.9846,0.0,0.8909,0.0


In [247]:
# Smaller forest: 30 trees, depth capped at 12.
model = RandomForestRegressor(
    n_estimators=30,
    n_jobs=multiprocessing.cpu_count(),
    random_state=42,
    max_depth=12,
)
# Last expression of the cell, so the returned (summary, per-fold) pair is displayed.
predictAffinityWithModel(model)

100%|██████████| 10/10 [00:29<00:00,  2.96s/it]


Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.123,0.005,0.51,0.0368,2.6172,0.0532,5.1914,0.2041,0.9769,0.0,0.8874,0.0


In [250]:
# Smaller forest: 30 trees, depth capped at 10.
model = RandomForestRegressor(
    n_estimators=30,
    n_jobs=multiprocessing.cpu_count(),
    random_state=42,
    max_depth=10,
)
# Last expression of the cell, so the returned (summary, per-fold) pair is displayed.
predictAffinityWithModel(model)

100%|██████████| 10/10 [00:29<00:00,  2.93s/it]


Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.1802,0.0083,0.533,0.0321,3.2117,0.0651,5.3573,0.2056,0.9651,0.0,0.882,0.0


In [254]:
# Smallest forest tried: 15 trees, depth capped at 10.
model = RandomForestRegressor(
    n_estimators=15,
    n_jobs=multiprocessing.cpu_count(),
    random_state=42,
    max_depth=10,
)
# Last expression of the cell, so the returned (summary, per-fold) pair is displayed.
predictAffinityWithModel(model)

100%|██████████| 10/10 [00:14<00:00,  1.41s/it]


Unnamed: 0,train_mse_error,train_mse_std,val_mse_error,val_mse_std,train_mae_error,train_mae_std,val_mae_error,val_mae_std,train_corr,train_corr_pval,validation_corr,validation_corr_pval
0,0.1914,0.0095,0.5519,0.0328,3.2654,0.069,5.4301,0.2036,0.9621,0.0,0.8768,0.0


In [245]:
# Shallowest tree in the most recently fitted forest. `model` is whatever was
# last passed to predictAffinityWithModel, which fits it in place.
# NOTE(review): this cell's execution count (245) is lower than the forest cells
# above it, so the recorded output was produced by an earlier `model`.
np.min([fitted_tree.tree_.max_depth for fitted_tree in model.estimators_])

24