In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.stats import pearsonr
import seaborn as sns

**FIGURE COEFFICIENTS**

In [None]:
# load the lasso LOOCV coefficient sheet; its unnamed first column is renamed to "Feature"
coef_raw = pd.read_excel('coef_lasso_loocv.xlsx').rename(columns = {"Unnamed: 0" : "Feature"})

# rank the features by absolute value of the 'coefficient' column, largest first
coef_ranked = coef_raw.assign(abs = coef_raw['coefficient'].abs()).sort_values(['abs'], ascending = False)

# keep the features whose coefficient is non-zero, in ranked order
# (the mask is taken from the sorted frame itself, so its index matches the frame
# being filtered; a mask from the unsorted frame forces pandas to realign it and
# triggers "Boolean Series key will be reindexed to match DataFrame index")
non_zero = list(coef_ranked.loc[coef_ranked['coefficient'] != 0, 'Feature'])
print(non_zero)

# transpose data and get non-zero coefficients for the figure:
# the remaining columns become rows, one column per non-zero feature
coef = coef_raw.drop(columns = ['coefficient', 'std']).set_index('Feature')
coef = coef.transpose()[non_zero].rename(columns = {'gender_F' : 'sex_F'})
coef

In [None]:
# read the 'bestmodel' lasso coefficient sheet and reshape it like `coef`:
# name the feature column, drop the summary columns, put features on the
# columns and keep only the features listed in `non_zero` (in that order)
coef_bm = (
    pd.read_excel('coef_lasso_loocv_nonzero_bestmodel.xlsx')
    .rename(columns = {"Unnamed: 0" : "Feature"})
    .drop(columns = ['coefficient', 'std'])
    .set_index('Feature')
    .transpose()[non_zero]
    .rename(columns = {'gender_F' : 'sex_F'})
)
coef_bm

In [None]:
# make figure: clipped violins with a shifted strip plot on top,
# `coef` in the left panel and `coef_bm` in the right panel
sns.reset_defaults()
sns.set(rc={'figure.figsize':(16,8)})
sns.set_style("whitegrid")

fig, (ax1, ax2) = plt.subplots(1, 2)

# (axes, data, title, properties applied to the axes returned by stripplot);
# the right panel additionally removes its y ticks
coef_panels = [
    (ax1, coef, "A. Coefficients lasso", {'ylabel': None}),
    (ax2, coef_bm, "B. Coefficients lasso winning model", {'ylabel': None, 'yticks': []}),
]

for panel_ax, panel_data, panel_title, strip_props in coef_panels:
    sns.violinplot(data = panel_data, orient = 'h', scale = 'width', ax = panel_ax, color = 'skyblue', bw = 0.1, saturation = 0.7)

    # remember the limits chosen by the violin plot; they are restored further down
    xlim = panel_ax.get_xlim()
    ylim = panel_ax.get_ylim()

    # clip every collection drawn so far to a rectangle covering half the height
    # of its own bounding box, starting at the box's y0
    for violin in panel_ax.collections:
        bbox = violin.get_paths()[0].get_extents()
        x0, y0, width, height = bbox.bounds
        violin.set_clip_path(plt.Rectangle((x0, y0), width, height / 2, transform=panel_ax.transData))

    # collections added after this count belong to the strip plot
    old_len_collections = len(panel_ax.collections)
    sns.stripplot(data = panel_data, orient = 'h', ax = panel_ax, size = 2.5, color = 'blue').set(**strip_props)

    # move the strip-plot points by 0.12 along y (data units)
    for dots in panel_ax.collections[old_len_collections:]:
        dots.set_offsets(dots.get_offsets() + np.array([0, 0.12]))

    panel_ax.set_xlim(xlim)
    panel_ax.set_ylim(ylim)
    panel_ax.set_xlabel("Coefficients", fontsize = 11)
    panel_ax.set_title(panel_title, fontsize = 15, loc = 'left', fontweight = 'bold')

plt.subplots_adjust(wspace = 0.02)

# plt.savefig("coef_violin_nonzero_bm.png", bbox_inches="tight")
plt.show()

In [None]:
# descriptive statistics of `coef`, one row per feature (features on the index)
calc_coef = coef.transpose()
table_coef = pd.DataFrame(index = calc_coef.index)

# each statistic is multiplied by 100 and stored as a string with 4 decimals;
# np.std is called with its default ddof (0)
summary_stats = {'mean': np.mean, 'SD': np.std, 'median': np.median, 'min': np.min, 'max': np.max}
for label, stat in summary_stats.items():
    table_coef[label] = calc_coef.apply(lambda row: format(100 * stat(row), '.4f'), axis = 1)

# '95%' holds a [lower, upper] pair of formatted strings:
#   'RANDOM' row          -> [5% quantile, 95% quantile]
#   rows with mean > 0    -> [5% quantile, maximum]
#   all remaining rows    -> [minimum, 95% quantile]
table_coef['95%'] = calc_coef.apply(
    lambda row: [format(100 * bound, '.4f') for bound in (
        (np.quantile(row, 0.05), np.quantile(row, 0.95)) if row.name == 'RANDOM'
        else (np.quantile(row, 0.05), row.max()) if row.mean() > 0
        else (row.min(), np.quantile(row, 0.95))
    )],
    axis = 1,
)

table_coef = table_coef[['mean', 'SD', 'median', '95%', 'min', 'max']]
# table_coef.to_excel("coef_descriptives.xlsx")
table_coef

In [None]:
# side-by-side check of two per-feature standard deviations:
# pandas' std with ddof = 0 ('std_1') and numpy's np.std ('std_2')
# NOTE: this rebinds table_coef and calc_coef, replacing whatever an earlier
# cell stored under those names
calc_coef = coef.transpose()
table_coef = coef.transpose().assign(
    std_1 = calc_coef.std(axis = 1, ddof = 0),
    std_2 = calc_coef.apply(lambda fold_values: np.std(fold_values), axis = 1),
)
table_coef

In [None]:
# descriptive statistics of `coef_bm`, one row per feature (features on the index)
calc_coef_bm = coef_bm.transpose()
table_coef_bm = coef_bm.transpose()

# scalar statistics: multiplied by 100 and stored as 4-decimal strings
bm_stats = [('mean', np.mean), ('SD', np.std), ('median', np.median), ('min', np.min), ('max', np.max)]
for column, reducer in bm_stats:
    table_coef_bm[column] = calc_coef_bm.apply(lambda values: format(100 * reducer(values), '.4f'), axis = 1)

# '95%' is a [lower, upper] pair of formatted strings:
#   'RANDOM' row          -> [5% quantile, 95% quantile]
#   rows with mean > 0    -> [5% quantile, maximum]
#   all remaining rows    -> [minimum, 95% quantile]
table_coef_bm['95%'] = calc_coef_bm.apply(
    lambda values: [format(100 * edge, '.4f') for edge in (
        (np.quantile(values, 0.05), np.quantile(values, 0.95)) if values.name == 'RANDOM'
        else (np.quantile(values, 0.05), values.max()) if values.mean() > 0
        else (values.min(), np.quantile(values, 0.95))
    )],
    axis = 1,
)

# keep only the statistics, in presentation order
table_coef_bm = table_coef_bm[['mean', 'SD', 'median', '95%', 'min', 'max']]
# table_coef_bm.to_excel("coef_bm_descriptives.xlsx")
table_coef_bm

**FIGURE PERMUTATION IMPORTANCE**

In [None]:
# load the RF LOOCV permutation-importance sheet, keep the 25 rows with the
# largest 'importance' value, drop the summary columns and put features on the columns
perm_imp = (
    pd.read_excel('RF_LOOCV_perm_importance.xlsx')
    .rename(columns = {"Unnamed: 0" : "Feature"})
    .sort_values(['importance'], ascending = False)
    .set_index('Feature')
    .drop(columns = ['importance', 'std'])
    .head(25)
    .transpose()
)
perm_imp

In [None]:
# load the 'bestmodel' permutation-importance sheet, drop the summary columns
# and put features on the columns (row order of the sheet is kept as-is)
perm_imp_bm = (
    pd.read_excel('RF_LOOCV_perm_importance_bestmodel_nonzero.xlsx')
    .rename(columns = {"Unnamed: 0" : "Feature"})
    .set_index('Feature')
    .drop(columns = ['importance', 'std'])
    .transpose()
)
perm_imp_bm

In [None]:
# make figure: clipped violins with a shifted strip plot on top,
# `perm_imp` in the left panel and `perm_imp_bm` in the right panel
sns.reset_defaults()
sns.set(rc={'figure.figsize':(16,12.5)})
sns.set_style("whitegrid")

fig, (ax1, ax2) = plt.subplots(1, 2)

# (axes, data, title, properties applied to the axes returned by stripplot);
# the right panel additionally removes its y ticks
perm_panels = [
    (ax1, perm_imp, "A. Importance RF", {'ylabel': None}),
    (ax2, perm_imp_bm, "B. Importance RF winning model", {'ylabel': None, 'yticks': []}),
]

for panel_ax, panel_data, panel_title, strip_props in perm_panels:
    sns.violinplot(data = panel_data, orient = 'h', scale = 'width', ax = panel_ax, color = 'skyblue', bw = 0.1, saturation = 0.7)

    # remember the limits chosen by the violin plot; they are restored further down
    xlim = panel_ax.get_xlim()
    ylim = panel_ax.get_ylim()

    # clip every collection drawn so far to a rectangle covering half the height
    # of its own bounding box, starting at the box's y0
    for violin in panel_ax.collections:
        bbox = violin.get_paths()[0].get_extents()
        x0, y0, width, height = bbox.bounds
        violin.set_clip_path(plt.Rectangle((x0, y0), width, height / 2, transform=panel_ax.transData))

    # collections added after this count belong to the strip plot
    old_len_collections = len(panel_ax.collections)
    sns.stripplot(data = panel_data, orient = 'h', ax = panel_ax, size = 2.5, color = 'blue').set(**strip_props)

    # move the strip-plot points by 0.12 along y (data units)
    for dots in panel_ax.collections[old_len_collections:]:
        dots.set_offsets(dots.get_offsets() + np.array([0, 0.12]))

    panel_ax.set_xlim(xlim)
    panel_ax.set_ylim(ylim)
    panel_ax.set_xlabel("Permutation importance", fontsize = 11)
    panel_ax.set_title(panel_title, fontsize = 15, loc = 'left', fontweight = 'bold')

plt.subplots_adjust(wspace = 0.02)

plt.savefig("perm_imp_violin_nonzero_bm.png", bbox_inches="tight")
plt.show()

In [None]:
# descriptive statistics of `perm_imp`, one row per feature (features on the index)
calc_perm = perm_imp.transpose()
table_perm = perm_imp.transpose()

# scalar statistics: multiplied by 100 and stored as 4-decimal strings
perm_stats = [('mean', np.mean), ('SD', np.std), ('median', np.median), ('min', np.min), ('max', np.max)]
for column, reducer in perm_stats:
    table_perm[column] = calc_perm.apply(lambda values: format(100 * reducer(values), '.4f'), axis = 1)

# '95%' is a [lower, upper] pair of formatted strings:
#   every row except 'RANDOM' -> [5% quantile, maximum]
#   'RANDOM' row              -> [minimum, 95% quantile]
table_perm['95%'] = calc_perm.apply(
    lambda values: [format(100 * edge, '.4f') for edge in (
        (np.quantile(values, 0.05), values.max()) if 'RANDOM' != values.name
        else (values.min(), np.quantile(values, 0.95))
    )],
    axis = 1,
)

# keep only the statistics, in presentation order
table_perm = table_perm[['mean', 'SD', 'median', '95%', 'min', 'max']]
# table_perm.to_excel("perm_imp_descriptives.xlsx")
table_perm

In [None]:
# descriptive statistics of `perm_imp_bm`, one row per feature (features on the index)
calc_perm_bm = perm_imp_bm.transpose()
table_perm_bm = perm_imp_bm.transpose()

# scalar statistics: multiplied by 100 and stored as 4-decimal strings
perm_bm_stats = [('mean', np.mean), ('SD', np.std), ('median', np.median), ('min', np.min), ('max', np.max)]
for column, reducer in perm_bm_stats:
    table_perm_bm[column] = calc_perm_bm.apply(lambda values: format(100 * reducer(values), '.4f'), axis = 1)

# '95%' is a [lower, upper] pair of formatted strings:
#   every row except 'RANDOM' -> [5% quantile, maximum]
#   'RANDOM' row              -> [minimum, 95% quantile]
table_perm_bm['95%'] = calc_perm_bm.apply(
    lambda values: [format(100 * edge, '.4f') for edge in (
        (np.quantile(values, 0.05), values.max()) if 'RANDOM' != values.name
        else (values.min(), np.quantile(values, 0.95))
    )],
    axis = 1,
)

# keep only the statistics, in presentation order
table_perm_bm = table_perm_bm[['mean', 'SD', 'median', '95%', 'min', 'max']]
# table_perm_bm.to_excel("perm_imp_bestmodel_descriptives.xlsx")
table_perm_bm

**FIGURE PAPER TOP 7**

In [None]:
# labels of the first 11 columns of perm_imp (column order is kept, nothing is sorted here)
list(perm_imp.mean().head(11).index)

In [None]:
# select seven features per model plus the 'RANDOM' column.
# 'RANDOM' is removed from each ranking before the seven are picked, so appending
# it afterwards can never yield a duplicated 'RANDOM' column (previously this
# happened whenever 'RANDOM' was itself among the seven picked columns).

# permutation importance: the first seven columns of perm_imp in their existing
# order (no sorting is done here)
perm_ranking = perm_imp.mean().drop(labels = 'RANDOM', errors = 'ignore')
perm_top7 = list(perm_ranking.head(7).index)
perm_top7.append('RANDOM')
importance = perm_imp[perm_top7]

# lasso coefficients: the seven largest absolute column means of coef_bm
coefbm_ranking = coef_bm.mean().abs().sort_values(ascending = False).drop(labels = 'RANDOM', errors = 'ignore')
coefbm_top7 = list(coefbm_ranking.head(7).index)
coefbm_top7.append('RANDOM')
coefficient = coef_bm[coefbm_top7]

# make figure: clipped violins with a shifted strip plot on top,
# coefficients in the upper panel and permutation importance in the lower panel
sns.reset_defaults()
sns.set(rc={'figure.figsize':(10,10)})
sns.set_style("whitegrid")

fig, (ax1, ax2) = plt.subplots(2, 1)

# (axes, data, x-axis label, title)
paper_panels = [
    (ax1, coefficient, "Coefficient", "A. Coefficients lasso (winning model)"),
    (ax2, importance, "Permutation importance", "B. Importance random forest"),
]

for panel_ax, panel_data, panel_xlabel, panel_title in paper_panels:
    panel_ax.tick_params(axis='y', labelsize=13)

    sns.violinplot(data = panel_data, orient = 'h', scale = 'width', ax = panel_ax, color = 'skyblue', bw = 0.1, saturation = 0.7)

    # remember the limits chosen by the violin plot; they are restored further down
    xlim = panel_ax.get_xlim()
    ylim = panel_ax.get_ylim()

    # clip every collection drawn so far to a rectangle covering half the height
    # of its own bounding box, starting at the box's y0
    for violin in panel_ax.collections:
        bbox = violin.get_paths()[0].get_extents()
        x0, y0, width, height = bbox.bounds
        violin.set_clip_path(plt.Rectangle((x0, y0), width, height / 2, transform=panel_ax.transData))

    # collections added after this count belong to the strip plot
    old_len_collections = len(panel_ax.collections)
    sns.stripplot(data = panel_data, orient = 'h', ax = panel_ax, size = 2.5, color = 'blue').set(ylabel = None)

    # move the strip-plot points by 0.12 along y (data units)
    for dots in panel_ax.collections[old_len_collections:]:
        dots.set_offsets(dots.get_offsets() + np.array([0, 0.12]))

    panel_ax.set_xlim(xlim)
    panel_ax.set_ylim(ylim)
    panel_ax.set_xlabel(panel_xlabel, fontsize = 14)
    panel_ax.set_title(panel_title, fontsize = 16, loc = 'left', fontweight = 'bold')

plt.subplots_adjust(hspace = 0.3)

plt.savefig("importance_coefficients.png", bbox_inches="tight")
plt.show()