# Single protein models by position results processing

Here I put together the validation and test results for the single protein models split by position.
They are saved in a Python-specific format (a joblib dump) and include several runs.
I save the output as a more universal, xz-compressed CSV file.

## Test set results

In [14]:
import joblib
import pandas as pd

# Source joblib dump and destination CSV. `infile` is reused by the
# feature-importance cell further down, so it must keep this name.
infile = "../../testing_results/single_protein_models/by_position/validation_and_test_20210609-200417.joblib.xz"
outfile = "../../testing_results/single_protein_models/by_position/test_results.csv.xz"

# Load the dump and export the table stored under its "data" key,
# without writing the DataFrame index.
result_dump = joblib.load(infile)
test_results = result_dump["data"]
test_results.to_csv(path_or_buf=outfile, index=False, compression="xz")

# Read the exported file back; as the last expression of the cell it is
# rendered as the cell output.
pd.read_csv(filepath_or_buffer=outfile)

Unnamed: 0,dms_id,y_pred,y_true,kind,aa1,aa2,position
0,E1_Ubiquitin,-0.108041,-0.257361,validation,G,N,35
1,E1_Ubiquitin,-0.094417,0.005115,validation,E,R,64
2,E1_Ubiquitin,-0.251826,-0.121822,validation,G,W,53
3,E1_Ubiquitin,-0.656793,-1.723066,validation,I,E,23
4,E1_Ubiquitin,-0.253355,-0.128302,validation,G,V,53
...,...,...,...,...,...,...,...
21508,kka2_1:2,-0.140835,0.665570,test,L,I,81
21509,kka2_1:2,-1.949568,-1.702102,test,Y,C,244
21510,kka2_1:2,-0.188886,0.556926,test,A,P,13
21511,kka2_1:2,0.566503,1.124670,test,E,D,17


## Feature importance

In [15]:
# Destination for the exported feature-importance table.
outfile = "../../testing_results/single_protein_models/by_position/feature_importance.csv.xz"

# Reload the dump from `infile` (defined in an earlier cell) and export the
# table stored under its "feature_importance" key, without the index.
result_dump = joblib.load(infile)
importance_table = result_dump["feature_importance"]
importance_table.to_csv(path_or_buf=outfile, index=False, compression="xz")

# Read the exported file back; as the last expression of the cell it is
# rendered as the cell output.
pd.read_csv(filepath_or_buffer=outfile)

Unnamed: 0,dms_id,feature_group,importance_1,importance_2,importance_3,importance_4,importance_5,importance_average,importance_sem
0,E1_Ubiquitin,tr_rosetta_centrality,0.009977,0.016575,0.014719,0.013602,0.024096,0.015794,0.002338
1,E1_Ubiquitin,hmm_pssm,0.098644,0.080886,0.064135,0.072657,0.083545,0.079973,0.005773
2,E1_Ubiquitin,netsurf_solvent_accessibility,-0.002092,-0.000845,-0.001125,-0.000058,-0.000065,-0.000837,0.000378
3,E1_Ubiquitin,netsurf_secondary_structure,0.004604,0.001429,0.006067,-0.000014,0.004773,0.003372,0.001140
4,E1_Ubiquitin,netsurf_disorder,0.000506,0.000297,0.000726,0.000362,-0.000248,0.000329,0.000162
...,...,...,...,...,...,...,...,...,...
76,kka2_1:2,netsurf_disorder,-0.000028,0.000302,-0.000739,0.000576,-0.001002,-0.000178,0.000301
77,kka2_1:2,netsurf_torsion_angles,-0.000084,-0.001656,-0.001090,-0.001775,-0.000490,-0.001019,0.000327
78,kka2_1:2,ev_couplings,0.638058,0.665601,0.678896,0.672368,0.634335,0.657852,0.009106
79,kka2_1:2,aa1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Hyperparameter search

In [2]:
import os

import joblib
import pandas as pd

# Gather every random-search dump (files ending in ".joblib.xz") from the
# search directory. os.listdir returns entries in arbitrary order, so the list
# is sorted to make the row order of the exported table reproducible.
infile_dir = "../../hyperparameter_search/single_protein_models/by_position/"
infiles = sorted(
    os.path.join(infile_dir, f)
    for f in os.listdir(infile_dir)
    if f.endswith(".joblib.xz")
)
outfile = "../../hyperparameter_search/single_protein_models/by_position/random_search_results.csv.xz"

# Collect one table per input file and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside a loop re-copies the accumulated frame on every iteration.
per_file_results = []
for infile in infiles:
    result_dump = joblib.load(infile)
    # The "data" entry is either a dict holding the table under "pearson"
    # or the table itself.
    if isinstance(result_dump["data"], dict):
        curr_results = result_dump["data"]["pearson"]
    elif isinstance(result_dump["data"], pd.DataFrame):
        curr_results = result_dump["data"]
    else:
        raise AssertionError(
            f"{infile}: unexpected type for 'data': {type(result_dump['data'])}"
        )
    # In the first runs I used a different format so I need to account for it
    # NOTE(review): this treats any entry of length 2 as the old format; a
    # parameter dict with exactly two keys would be misread - confirm.
    params_random_search = [
        el[1] if len(el) == 2 else el for el in result_dump["params"]
    ]
    # Build a one-row frame of hyperparameters for every result row. The number
    # after the first "_" in the row label selects the parameter set.
    param_rows = []
    for name in curr_results.index:
        param_index = int(name.split("_")[1])
        try:
            curr_params = pd.DataFrame(params_random_search[param_index], index=[name])
        except (IndexError, KeyError, TypeError, ValueError) as err:
            # Keep the diagnostics the cell used to print, and chain the
            # original error so the root cause stays visible.
            raise AssertionError(
                f"{infile}: cannot build parameters for {name!r} "
                f"(index {param_index}, {len(params_random_search)} parameter sets)"
            ) from err
        param_rows.append(curr_params)
    # pd.concat raises on an empty list, so fall back to an empty frame.
    hyperparam_df = pd.concat(param_rows) if param_rows else pd.DataFrame()
    curr_results = curr_results.join(hyperparam_df)
    per_file_results.append(curr_results)

result_df = pd.concat(per_file_results) if per_file_results else pd.DataFrame()

# Export without the index, then read the file back; as the last expression of
# the cell it is rendered as the cell output.
result_df.to_csv(outfile, compression="xz", index=False)
pd.read_csv(outfile)

Unnamed: 0,E1_Ubiquitin,PSD95pdz3,Pab1,Ubiquitin,WW_domain,beta-lactamase,gb1,hsp90,kka2_1:2,average,...,subsample,colsample_bytree,eta,tree_method,objective,nthread,gamma,lambda,alpha,num_rounds
0,0.454747,0.485550,0.735590,0.606341,0.531383,0.813383,0.182016,0.750821,0.608434,0.574252,...,0.6,1.00,0.01,hist,reg:squarederror,-1,0.1,4.0,0.0,600
1,0.454974,0.485788,0.730324,0.605262,0.529976,0.815506,0.185478,0.750944,0.608595,0.574094,...,0.6,1.00,0.01,hist,reg:squarederror,-1,0.1,4.0,0.0,900
2,0.454282,0.485847,0.732627,0.605124,0.530846,0.814364,0.183831,0.750309,0.609105,0.574037,...,0.6,1.00,0.01,hist,reg:squarederror,-1,0.1,4.0,0.0,700
3,0.454232,0.486344,0.731952,0.605079,0.529468,0.815134,0.184834,0.750619,0.608511,0.574019,...,0.6,1.00,0.01,hist,reg:squarederror,-1,0.1,4.0,0.0,800
4,0.453761,0.484815,0.729207,0.605157,0.530251,0.815754,0.187229,0.750701,0.607726,0.573845,...,0.6,1.00,0.01,hist,reg:squarederror,-1,0.1,4.0,0.0,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620,0.300212,0.182519,0.124282,0.249299,0.404887,0.378726,0.217103,0.402842,0.188660,0.272059,...,0.5,0.25,1.00,hist,reg:squarederror,-1,0.1,1.0,0.1,900
2621,0.303781,0.182806,0.125281,0.234794,0.405859,0.379679,0.216622,0.403661,0.189149,0.271292,...,0.5,0.25,1.00,hist,reg:squarederror,-1,0.1,1.0,0.1,600
2622,0.311040,0.190703,0.124780,0.216972,0.405086,0.378152,0.215219,0.403303,0.189727,0.270554,...,0.5,0.25,1.00,hist,reg:squarederror,-1,0.1,1.0,0.1,700
2623,0.287802,0.178163,0.121352,0.241401,0.406306,0.378989,0.217947,0.409601,0.187013,0.269841,...,0.5,0.25,1.00,hist,reg:squarederror,-1,0.1,1.0,0.1,1000
