# LOPO model XGBoost results processing

Here I put together the validation and testing results for the LOPO XGBoost model.
They are saved in a Python-specific format (joblib dump) and include several runs.
I save the output as a more universal compressed csv file.

## Test set results

In [2]:
import joblib
import pandas as pd

# Input: the joblib dump with the validation and test results.
# Output: the same predictions re-saved as an xz-compressed CSV.
infile = "../../testing_results/lopo_models/gradient_boosted_trees/validation_and_test_20210609-135319.joblib.xz"
outfile = "../../testing_results/lopo_models/gradient_boosted_trees/test_results.csv.xz"

# Load the dump and pull out the table stored under the "data" key.
result_dump = joblib.load(infile)
predictions = result_dump["data"]

# Write without the index, then read the file back so the notebook
# displays what actually ended up on disk.
predictions.to_csv(outfile, index=False, compression="xz")
pd.read_csv(outfile)

Unnamed: 0,dms_id,y_pred,y_true,kind,aa1,aa2,position
0,kka2_1:2,-2.716779,-2.444149,validation,C,Q,209
1,kka2_1:2,0.106698,0.565598,validation,S,L,68
2,kka2_1:2,-0.491488,-2.437260,validation,R,V,217
3,kka2_1:2,-1.771364,-2.669310,validation,A,T,215
4,kka2_1:2,-0.349191,-2.989750,validation,E,M,161
...,...,...,...,...,...,...,...
21508,beta-lactamase,-0.647458,-1.639610,test,D,G,99
21509,beta-lactamase,-0.139114,-0.946275,test,Q,F,203
21510,beta-lactamase,-0.702377,-4.449380,test,L,E,219
21511,beta-lactamase,-0.502695,-8.106990,test,L,K,192


## Feature importance

In [3]:
# Destination for the feature-importance table (xz-compressed CSV).
outfile = "../../testing_results/lopo_models/gradient_boosted_trees/feature_importance.csv.xz"

# `infile` is not set in this cell; it must already be defined when the
# cell runs. The dump is loaded again here rather than reusing an
# existing `result_dump`.
result_dump = joblib.load(infile)
importance_table = result_dump["feature_importance"]

# Write without the index, then read the file back for display.
importance_table.to_csv(outfile, index=False, compression="xz")
pd.read_csv(outfile)

Unnamed: 0,dms_id,feature_group,importance_1,importance_2,importance_3,importance_4,importance_5,importance_average,importance_sem
0,kka2_1:2,tr_rosetta_centrality,0.003617,0.006675,0.003244,0.008902,0.004475,0.005382,0.001062
1,kka2_1:2,hmm_pssm,0.002732,0.005828,0.012540,0.004579,0.002801,0.005696,0.001806
2,kka2_1:2,netsurf_solvent_accessibility,0.000796,0.006880,0.011525,0.011774,0.008138,0.007823,0.001996
3,kka2_1:2,netsurf_secondary_structure,-0.005466,-0.006483,-0.005044,-0.005616,-0.005780,-0.005678,0.000236
4,kka2_1:2,netsurf_disorder,-0.003173,-0.003217,-0.003555,-0.002441,-0.002726,-0.003023,0.000196
...,...,...,...,...,...,...,...,...,...
76,beta-lactamase,netsurf_disorder,0.005492,0.006324,0.005739,0.005739,0.004056,0.005470,0.000379
77,beta-lactamase,netsurf_torsion_angles,0.000021,0.000219,0.000061,0.000191,0.000253,0.000149,0.000046
78,beta-lactamase,ev_couplings,0.631717,0.610491,0.631811,0.635493,0.639626,0.629828,0.005047
79,beta-lactamase,aa1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Hyperparameter search

In [2]:
import os

import joblib
import pandas as pd

# Gather every random-search dump in `infile_dir`, attach the sampled
# hyperparameters to the Spearman scores of each run, and save the
# combined table as one xz-compressed CSV.
infile_dir = "../../hyperparameter_search/lopo_models/gradient_boosted_trees/"
# os.listdir returns entries in arbitrary order; sorting makes the row
# order of the output file reproducible between runs and machines.
infiles = sorted(
    os.path.join(infile_dir, f)
    for f in os.listdir(infile_dir)
    if f.endswith(".joblib.xz")
)
outfile = "../../hyperparameter_search/lopo_models/gradient_boosted_trees/random_search_results.csv.xz"

# Frames are collected in lists and concatenated once per level:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the accumulated data every iteration.
per_file_results = []
for infile in infiles:
    result_dump = joblib.load(infile)
    curr_results = result_dump["data"]["spearman"]
    # Each entry of result_dump["params"] is indexable; element 1 is what
    # gets expanded into the hyperparameter columns below.
    params_random_search = [el[1] for el in result_dump["params"]]

    param_rows = []
    for name in curr_results.index:
        # The index label carries the position of the parameter set after
        # its first underscore (i.e. "<prefix>_<number>[_...]").
        param_index = int(name.split("_")[1])
        try:
            curr_params = pd.DataFrame(params_random_search[param_index], index=[name])
        except (IndexError, TypeError, ValueError) as err:
            # Keep the AssertionError type, but report which file and run
            # were inconsistent and preserve the underlying cause.
            raise AssertionError(
                f"cannot build hyperparameter row for {name!r} in {infile}: "
                f"parameter index {param_index}, "
                f"{len(params_random_search)} parameter sets available"
            ) from err
        param_rows.append(curr_params)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    hyperparam_df = pd.concat(param_rows) if param_rows else pd.DataFrame()
    # Index-aligned join: one row of hyperparameters per scored run.
    per_file_results.append(curr_results.join(hyperparam_df))

result_df = pd.concat(per_file_results) if per_file_results else pd.DataFrame()

# Write without the index, then read the file back for display.
result_df.to_csv(outfile, compression="xz", index=False)
pd.read_csv(outfile)

Unnamed: 0,hsp90,PSD95pdz3,E1_Ubiquitin,beta-lactamase,gb1,Pab1,kka2_1:2,Ubiquitin,WW_domain,average,...,subsample,colsample_bytree,eta,tree_method,objective,nthread,gamma,lambda,alpha,num_rounds
0,0.446671,0.509882,0.524852,0.710415,0.410223,0.666514,0.597166,0.606456,0.680818,0.572555,...,0.2,0.4,0.005,hist,rank:pairwise,1,1.000,1.00,0.00,1000
1,0.439174,0.516854,0.532165,0.711841,0.403700,0.657219,0.592032,0.599352,0.668566,0.568989,...,0.8,0.5,0.010,hist,rank:pairwise,1,1.000,0.10,0.00,1000
2,0.431177,0.531016,0.529488,0.681991,0.408103,0.667641,0.570326,0.609718,0.687150,0.568512,...,0.2,0.2,0.005,hist,rank:pairwise,1,0.010,10.00,0.01,1000
3,0.433719,0.559512,0.530610,0.689068,0.325894,0.699174,0.571171,0.626643,0.670138,0.567325,...,0.9,0.1,0.010,hist,rank:pairwise,1,0.000,10.00,0.01,1000
4,0.425825,0.523142,0.571924,0.675503,0.406193,0.639129,0.577956,0.553352,0.661853,0.559431,...,0.5,0.3,0.001,hist,rank:pairwise,1,0.001,0.01,10.00,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.469249,0.354774,0.268822,0.626981,0.416012,0.488603,0.549628,0.280238,0.589077,0.449265,...,0.2,0.4,0.010,hist,rank:pairwise,-1,0.000,1.00,0.00,100000
1456,0.423149,0.239277,0.210721,0.559754,0.444297,0.333062,0.507990,0.343596,0.602358,0.407134,...,0.2,0.4,0.050,hist,rank:pairwise,-1,0.000,1.00,0.00,100000
1457,0.395757,0.189152,0.097193,0.496353,0.002065,0.249397,0.395562,0.285503,0.434443,0.282825,...,0.2,0.4,0.100,hist,rank:pairwise,-1,0.000,1.00,0.00,100000
1458,0.046678,-0.020918,-0.023273,-0.080115,-0.230361,0.223813,0.079928,-0.007970,0.329305,0.035232,...,0.2,0.4,0.500,hist,rank:pairwise,-1,0.000,1.00,0.00,100000
