# Single protein models naive results processing

Here I put together the validation and test results for the naive single-protein models.
They are stored in a Python-specific format (joblib dump) and include several runs.
I save the output as compressed CSV files, which is a more universal format.

## Test set results

In [21]:
import joblib
import pandas as pd

# Joblib dump to read, and the xz-compressed CSV it is exported to.
infile = "../../testing_results/single_protein_models/naive/validation_and_test_20210609-200353.joblib.xz"
outfile = "../../testing_results/single_protein_models/naive/test_results.csv.xz"

result_dump = joblib.load(infile)
predictions = result_dump["data"]
# The DataFrame index is not written to the CSV.
predictions.to_csv(outfile, index=False, compression="xz")
# Read the file back; as the last expression of the cell it is displayed.
pd.read_csv(outfile)

Unnamed: 0,dms_id,y_pred,y_true,kind,aa1,aa2,position
0,E1_Ubiquitin,-0.000725,0.007452,validation,S,S,28
1,E1_Ubiquitin,0.004436,0.129888,validation,D,M,32
2,E1_Ubiquitin,0.037263,-0.004257,validation,D,C,32
3,E1_Ubiquitin,-0.007274,0.108820,validation,F,V,4
4,E1_Ubiquitin,-0.016370,-0.030289,validation,K,P,63
...,...,...,...,...,...,...,...
21508,kka2_1:2,-1.975785,-3.059738,test,D,L,261
21509,kka2_1:2,-2.583013,-2.556188,test,D,P,220
21510,kka2_1:2,-2.667599,-1.928955,test,M,H,116
21511,kka2_1:2,-0.844057,-3.435789,test,L,F,241


## Feature importance

In [22]:
# Destination of the xz-compressed CSV with the feature importance table.
outfile = "../../testing_results/single_protein_models/naive/feature_importance.csv.xz"

# `infile` is not redefined here; it must still be bound from an earlier cell.
result_dump = joblib.load(infile)
feature_importance = result_dump["feature_importance"]
# The DataFrame index is not written to the CSV.
feature_importance.to_csv(outfile, index=False, compression="xz")
# Read the file back; as the last expression of the cell it is displayed.
pd.read_csv(outfile)

Unnamed: 0,dms_id,feature_group,importance_1,importance_2,importance_3,importance_4,importance_5,importance_average,importance_sem
0,E1_Ubiquitin,tr_rosetta_centrality,-0.000457,-0.000405,-0.000992,-0.000297,-0.000413,-0.000513,0.000123
1,E1_Ubiquitin,hmm_pssm,0.155750,0.148608,0.156153,0.137257,0.150584,0.149670,0.003428
2,E1_Ubiquitin,netsurf_solvent_accessibility,0.004296,0.000435,-0.000051,0.001069,0.003400,0.001830,0.000855
3,E1_Ubiquitin,netsurf_secondary_structure,0.038629,0.029689,0.018312,0.021845,0.032391,0.028173,0.003652
4,E1_Ubiquitin,netsurf_disorder,0.007518,0.006816,0.006532,0.006474,0.002823,0.006033,0.000824
...,...,...,...,...,...,...,...,...,...
76,kka2_1:2,netsurf_disorder,-0.001487,-0.001014,-0.001856,-0.001408,-0.001327,-0.001419,0.000136
77,kka2_1:2,netsurf_torsion_angles,-0.001470,-0.002118,-0.002493,-0.002663,-0.001674,-0.002084,0.000229
78,kka2_1:2,ev_couplings,0.780958,0.767768,0.765661,0.745891,0.813760,0.774808,0.011235
79,kka2_1:2,aa1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [19]:
# Display the feature importance rows whose dms_id is "beta-lactamase".
# `result_dump` must already be loaded by an earlier cell.
df = result_dump["feature_importance"]
df.loc[df["dms_id"] == "beta-lactamase"]

Unnamed: 0,dms_id,feature_group,importance_1,importance_2,importance_3,importance_4,importance_5,importance_average,importance_sem
45,beta-lactamase,tr_rosetta_centrality,0.005782,0.00684,0.007677,0.007613,0.00683,0.006948,0.000343
46,beta-lactamase,hmm_pssm,0.045918,0.046665,0.042446,0.043126,0.040476,0.043726,0.00114
47,beta-lactamase,netsurf_solvent_accessibility,0.042793,0.046314,0.044066,0.047841,0.04205,0.044613,0.001084
48,beta-lactamase,netsurf_secondary_structure,0.024129,0.027605,0.027464,0.027439,0.02758,0.026843,0.000679
49,beta-lactamase,netsurf_disorder,0.002238,0.002105,0.002008,0.002159,0.001899,0.002082,5.9e-05
50,beta-lactamase,netsurf_torsion_angles,0.007774,0.006072,0.00621,0.005035,0.006053,0.006229,0.00044
51,beta-lactamase,ev_couplings,0.912638,0.926008,0.906137,0.903694,0.918303,0.913356,0.004065
52,beta-lactamase,aa1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,beta-lactamase,aa2,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Hyperparameter search

In [None]:
import os

import joblib
import pandas as pd

# Collect the random-search results of every joblib dump in `infile_dir`,
# attach the hyperparameters of each run, and export everything as one
# xz-compressed CSV.
infile_dir = "../../hyperparameter_search/single_protein_models/naive/"
# Sorted because os.listdir returns entries in arbitrary order; sorting makes
# the row order of the exported file reproducible.
infiles = sorted(
    os.path.join(infile_dir, f)
    for f in os.listdir(infile_dir)
    if f.endswith(".joblib.xz")
)
outfile = "../../hyperparameter_search/single_protein_models/naive/random_search_results.csv.xz"

# Frames are gathered in lists and concatenated once: DataFrame.append was
# removed in pandas 2.0, and repeated appending copies all rows every time.
per_file_results = []
for infile in infiles:
    result_dump = joblib.load(infile)
    dump_data = result_dump["data"]
    # Dumps come in two layouts: a dict with the results under "pearson",
    # or the results DataFrame itself.
    if isinstance(dump_data, dict):
        curr_results = dump_data["pearson"]
    elif isinstance(dump_data, pd.DataFrame):
        curr_results = dump_data
    else:
        raise AssertionError(
            f"unexpected type for 'data' in {infile}: {type(dump_data)}"
        )
    # Only the second item of each "params" entry is used; it is passed to the
    # DataFrame constructor below (presumably a dict of hyperparameter values
    # -- TODO confirm).
    params_random_search = [el[1] for el in result_dump["params"]]
    param_rows = []
    for name in curr_results.index:
        # The integer after the first underscore of the index label selects
        # the matching entry of `params_random_search`.
        param_index = int(name.split("_")[1])
        try:
            param_rows.append(
                pd.DataFrame(params_random_search[param_index], index=[name])
            )
        except (IndexError, KeyError, TypeError, ValueError) as err:
            raise AssertionError(
                f"cannot build hyperparameter row for {name!r} in {infile}: "
                f"parameter index {param_index}, "
                f"{len(params_random_search)} parameter sets available"
            ) from err
    # pd.concat rejects an empty list, so fall back to an empty frame.
    hyperparam_df = pd.concat(param_rows) if param_rows else pd.DataFrame()
    # Join on the index so each result row gets its own hyperparameters.
    curr_results = curr_results.join(hyperparam_df)
    per_file_results.append(curr_results)

result_df = pd.concat(per_file_results) if per_file_results else pd.DataFrame()

# NOTE(review): index=False drops the index labels (the run names) from the
# exported file; only the result and hyperparameter columns are written.
result_df.to_csv(outfile, compression="xz", index=False)
# Read the file back; as the last expression of the cell it is displayed.
pd.read_csv(outfile)