In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Model with dap-seq features
# Import T/F data
# Tab-separated table; the first row is the header and the first column
# becomes the row index.
at_df = pd.read_csv(
    "../../data/DAP_model_data/Model_input_data.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

# Split on the `deg` column: rows with deg == 1 go to the "T" frame and rows
# with deg == 0 to the "F" frame. Both get a fresh 0..n-1 index, so the
# original row labels are dropped from the two subsets.
deg_is_one = at_df["deg"] == 1
deg_is_zero = at_df["deg"] == 0
at_df_T = at_df.loc[deg_is_one].reset_index(drop=True)
at_df_F = at_df.loc[deg_is_zero].reset_index(drop=True)

In [None]:
import random
from pycaret.classification import *
from sklearn.utils import shuffle

In [None]:
# Generate balance data and test set with multiple random_state value
# The seeds are drawn from a dedicated fixed-seed generator so that a fresh
# kernel reproduces the same runs (a bare random.sample() is unseeded), without
# touching the global `random` state.
N_RUNS = 100
seed_rng = random.Random(0)
balance_index = seed_rng.sample(range(100, 1000), N_RUNS)
test_set_index = seed_rng.sample(range(100, 1000), N_RUNS)

# Row labels of the input table; loop-invariant, so assigned once.
gene_names = at_df.index

# Loop the random_state
AUC_list = []
for balance_seed, session_seed in zip(balance_index, test_set_index):
    # Down-sample the deg == 0 rows to the number of deg == 1 rows.
    at_df_F_use = at_df_F.sample(len(at_df_T),
                                 random_state=balance_seed).reset_index(drop=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    at_df_balance = pd.concat([at_df_T, at_df_F_use]).reset_index(drop=True)
    at_df_label = at_df_balance['deg']

    # Keep only columns whose absolute correlation with `deg` exceeds 0.1.
    # The correlation matrix is computed once per run; `deg` correlates 1.0
    # with itself, so the target column is always retained.
    deg_corr = at_df_balance.corr()['deg']
    at_df_balance_cor = at_df_balance[deg_corr[deg_corr.abs() > 0.1].index]

    TF_classify_balance_cor = setup(data=at_df_balance_cor,
                                    target='deg',
                                    train_size=0.7,
                                    silent=True,
                                    fold=10,
                                    normalize=True,
                                    normalize_method='zscore',
                                    feature_selection=True,
                                    pca=False,
                                    session_id=session_seed,
                                    html=True)

    # Random forest, then score it with predict_model(); pull() is read right
    # after so that `results` holds the metrics table of that prediction step.
    best_model = create_model('rf')
    prediction = predict_model(best_model)
    results = pull()
    AUC_list.append(results['AUC'].iloc[0])

In [None]:
# Model without dap features
# Import T/F data
# Tab-separated table; the first row is the header and the first column
# becomes the row index.
at_df_no_dap = pd.read_csv(
    "../../data/DAP_model_data/no_dap.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

# Split on the `deg` column (1 -> "T" frame, 0 -> "F" frame); both subsets
# are re-indexed from 0, dropping the original row labels.
deg_is_one = at_df_no_dap["deg"] == 1
deg_is_zero = at_df_no_dap["deg"] == 0
at_df_no_dap_T = at_df_no_dap.loc[deg_is_one].reset_index(drop=True)
at_df_no_dap_F = at_df_no_dap.loc[deg_is_zero].reset_index(drop=True)

In [None]:
# Generate balance data and test set with multiple random_state value
# The seeds are drawn from a dedicated fixed-seed generator so that a fresh
# kernel reproduces the same runs (a bare random.sample() is unseeded), without
# touching the global `random` state.
N_RUNS = 100
seed_rng = random.Random(0)
balance_index = seed_rng.sample(range(100, 1000), N_RUNS)
test_set_index = seed_rng.sample(range(100, 1000), N_RUNS)

# Row labels, taken from this cell's own table so the cell does not depend on
# the full-model table having been loaded; loop-invariant, so assigned once.
gene_names = at_df_no_dap.index

# Loop the random_state
AUC_no_dap_list = []
for balance_seed, session_seed in zip(balance_index, test_set_index):
    # Down-sample the deg == 0 rows to the number of deg == 1 rows.
    at_df_no_dap_F_use = at_df_no_dap_F.sample(
        len(at_df_no_dap_T), random_state=balance_seed).reset_index(drop=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    at_df_no_dap_balance = pd.concat(
        [at_df_no_dap_T, at_df_no_dap_F_use]).reset_index(drop=True)
    at_df_no_dap_label = at_df_no_dap_balance['deg']

    # Keep only columns whose absolute correlation with `deg` exceeds 0.1.
    # The correlation matrix is computed once per run; `deg` correlates 1.0
    # with itself, so the target column is always retained.
    deg_corr = at_df_no_dap_balance.corr()['deg']
    at_df_no_dap_balance_cor = at_df_no_dap_balance[
        deg_corr[deg_corr.abs() > 0.1].index]

    TF_classify_balance_cor = setup(data=at_df_no_dap_balance_cor,
                                    target='deg',
                                    train_size=0.7,
                                    silent=True,
                                    fold=10,
                                    normalize=True,
                                    normalize_method='zscore',
                                    feature_selection=True,
                                    pca=False,
                                    session_id=session_seed,
                                    html=True)

    # Use the same random-forest estimator as the other runs in this notebook.
    # This cell previously called compare_models(sort='AUC'), which picks the
    # best of many estimators per run and so is not comparable with the
    # fixed-'rf' AUCs it is plotted against.
    # NOTE(review): confirm the compare_models call was not intentional.
    best_model = create_model('rf')
    prediction = predict_model(best_model)
    # pull() is read right after predict_model() so that `results` holds the
    # metrics table of that prediction step.
    results = pull()
    AUC_no_dap_list.append(results['AUC'].iloc[0])

In [None]:
# Model without signal features
# Import T/F data
# Tab-separated table; the first row is the header and the first column
# becomes the row index.
at_df_no_signal = pd.read_csv(
    "../../data/DAP_model_data/no_signal.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

# Split on the `deg` column (1 -> "T" frame, 0 -> "F" frame); both subsets
# are re-indexed from 0, dropping the original row labels.
deg_is_one = at_df_no_signal["deg"] == 1
deg_is_zero = at_df_no_signal["deg"] == 0
at_df_no_signal_T = at_df_no_signal.loc[deg_is_one].reset_index(drop=True)
at_df_no_signal_F = at_df_no_signal.loc[deg_is_zero].reset_index(drop=True)

In [None]:
# Generate balance data and test set with multiple random_state value
# The seeds are drawn from a dedicated fixed-seed generator so that a fresh
# kernel reproduces the same runs (a bare random.sample() is unseeded), without
# touching the global `random` state.
N_RUNS = 100
seed_rng = random.Random(0)
balance_index = seed_rng.sample(range(100, 1000), N_RUNS)
test_set_index = seed_rng.sample(range(100, 1000), N_RUNS)

# Row labels, taken from this cell's own table so the cell does not depend on
# the full-model table having been loaded; loop-invariant, so assigned once.
gene_names = at_df_no_signal.index

# Loop the random_state
AUC_no_signal_list = []
for balance_seed, session_seed in zip(balance_index, test_set_index):
    # Down-sample the deg == 0 rows to the number of deg == 1 rows.
    at_df_no_signal_F_use = at_df_no_signal_F.sample(
        len(at_df_no_signal_T), random_state=balance_seed).reset_index(drop=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    at_df_no_signal_balance = pd.concat(
        [at_df_no_signal_T, at_df_no_signal_F_use]).reset_index(drop=True)
    at_df_no_signal_label = at_df_no_signal_balance['deg']

    # Keep only columns whose absolute correlation with `deg` exceeds 0.1.
    # The correlation matrix is computed once per run; `deg` correlates 1.0
    # with itself, so the target column is always retained.
    deg_corr = at_df_no_signal_balance.corr()['deg']
    at_df_no_signal_balance_cor = at_df_no_signal_balance[
        deg_corr[deg_corr.abs() > 0.1].index]

    TF_classify_balance_cor = setup(data=at_df_no_signal_balance_cor,
                                    target='deg',
                                    train_size=0.7,
                                    silent=True,
                                    fold=10,
                                    normalize=True,
                                    normalize_method='zscore',
                                    feature_selection=True,
                                    pca=False,
                                    session_id=session_seed,
                                    html=True)

    # Random forest, then score it with predict_model(); pull() is read right
    # after so that `results` holds the metrics table of that prediction step.
    best_model = create_model('rf')
    prediction = predict_model(best_model)
    results = pull()
    AUC_no_signal_list.append(results['AUC'].iloc[0])

In [None]:
# Model without fpkm features
# Import T/F data
# Tab-separated table; the first row is the header and the first column
# becomes the row index.
at_df_no_fpkm = pd.read_csv(
    "../../data/DAP_model_data/no_fpkm.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

# Split on the `deg` column (1 -> "T" frame, 0 -> "F" frame); both subsets
# are re-indexed from 0, dropping the original row labels.
deg_is_one = at_df_no_fpkm["deg"] == 1
deg_is_zero = at_df_no_fpkm["deg"] == 0
at_df_no_fpkm_T = at_df_no_fpkm.loc[deg_is_one].reset_index(drop=True)
at_df_no_fpkm_F = at_df_no_fpkm.loc[deg_is_zero].reset_index(drop=True)

In [None]:
# Generate balance data and test set with multiple random_state value
# The seeds are drawn from a dedicated fixed-seed generator so that a fresh
# kernel reproduces the same runs (a bare random.sample() is unseeded), without
# touching the global `random` state.
N_RUNS = 100
seed_rng = random.Random(0)
balance_index = seed_rng.sample(range(100, 1000), N_RUNS)
test_set_index = seed_rng.sample(range(100, 1000), N_RUNS)

# Row labels, taken from this cell's own table so the cell does not depend on
# the full-model table having been loaded; loop-invariant, so assigned once.
gene_names = at_df_no_fpkm.index

# Loop the random_state
AUC_no_fpkm_list = []
for balance_seed, session_seed in zip(balance_index, test_set_index):
    # Down-sample the deg == 0 rows to the number of deg == 1 rows.
    at_df_no_fpkm_F_use = at_df_no_fpkm_F.sample(
        len(at_df_no_fpkm_T), random_state=balance_seed).reset_index(drop=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    at_df_no_fpkm_balance = pd.concat(
        [at_df_no_fpkm_T, at_df_no_fpkm_F_use]).reset_index(drop=True)
    at_df_no_fpkm_label = at_df_no_fpkm_balance['deg']

    # Keep only columns whose absolute correlation with `deg` exceeds 0.1.
    # The correlation matrix is computed once per run; `deg` correlates 1.0
    # with itself, so the target column is always retained.
    deg_corr = at_df_no_fpkm_balance.corr()['deg']
    at_df_no_fpkm_balance_cor = at_df_no_fpkm_balance[
        deg_corr[deg_corr.abs() > 0.1].index]

    TF_classify_balance_cor = setup(data=at_df_no_fpkm_balance_cor,
                                    target='deg',
                                    train_size=0.7,
                                    silent=True,
                                    fold=10,
                                    normalize=True,
                                    normalize_method='zscore',
                                    feature_selection=True,
                                    pca=False,
                                    session_id=session_seed,
                                    html=True)

    # Random forest, then score it with predict_model(); pull() is read right
    # after so that `results` holds the metrics table of that prediction step.
    best_model = create_model('rf')
    prediction = predict_model(best_model)
    results = pull()
    AUC_no_fpkm_list.append(results['AUC'].iloc[0])

In [None]:
# Model without distance features
# Import T/F data
# Tab-separated table; the first row is the header and the first column
# becomes the row index.
at_df_no_distance = pd.read_csv(
    "../../data/DAP_model_data/no_distance.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

# Split on the `deg` column (1 -> "T" frame, 0 -> "F" frame); both subsets
# are re-indexed from 0, dropping the original row labels.
deg_is_one = at_df_no_distance["deg"] == 1
deg_is_zero = at_df_no_distance["deg"] == 0
at_df_no_distance_T = at_df_no_distance.loc[deg_is_one].reset_index(drop=True)
at_df_no_distance_F = at_df_no_distance.loc[deg_is_zero].reset_index(drop=True)

In [None]:
# Generate balance data and test set with multiple random_state value
# The seeds are drawn from a dedicated fixed-seed generator so that a fresh
# kernel reproduces the same runs (a bare random.sample() is unseeded), without
# touching the global `random` state.
N_RUNS = 100
seed_rng = random.Random(0)
balance_index = seed_rng.sample(range(100, 1000), N_RUNS)
test_set_index = seed_rng.sample(range(100, 1000), N_RUNS)

# Row labels, taken from this cell's own table so the cell does not depend on
# the full-model table having been loaded; loop-invariant, so assigned once.
gene_names = at_df_no_distance.index

# Loop the random_state
AUC_no_distance_list = []
for balance_seed, session_seed in zip(balance_index, test_set_index):
    # Down-sample the deg == 0 rows to the number of deg == 1 rows.
    at_df_no_distance_F_use = at_df_no_distance_F.sample(
        len(at_df_no_distance_T), random_state=balance_seed).reset_index(drop=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    at_df_no_distance_balance = pd.concat(
        [at_df_no_distance_T, at_df_no_distance_F_use]).reset_index(drop=True)
    at_df_no_distance_label = at_df_no_distance_balance['deg']

    # Keep only columns whose absolute correlation with `deg` exceeds 0.1.
    # The correlation matrix is computed once per run; `deg` correlates 1.0
    # with itself, so the target column is always retained.
    deg_corr = at_df_no_distance_balance.corr()['deg']
    at_df_no_distance_balance_cor = at_df_no_distance_balance[
        deg_corr[deg_corr.abs() > 0.1].index]

    TF_classify_balance_cor = setup(data=at_df_no_distance_balance_cor,
                                    target='deg',
                                    train_size=0.7,
                                    silent=True,
                                    fold=10,
                                    normalize=True,
                                    normalize_method='zscore',
                                    feature_selection=True,
                                    pca=False,
                                    session_id=session_seed,
                                    html=True)

    # Random forest, then score it with predict_model(); pull() is read right
    # after so that `results` holds the metrics table of that prediction step.
    best_model = create_model('rf')
    prediction = predict_model(best_model)
    results = pull()
    AUC_no_distance_list.append(results['AUC'].iloc[0])

In [None]:
# Compare AUC of models with features deleted
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Per-run AUCs of every model variant, paired with its display label, in the
# order the groups are stacked in the long-format table below.
auc_by_model = [
    ('Model', AUC_list),
    ('Delete GLK signal feature', AUC_no_signal_list),
    ('Delete gene FPKM feature', AUC_no_fpkm_list),
    ('Delete distance to peak feature', AUC_no_distance_list),
    ('Delete DAP feature', AUC_no_dap_list),
]

# Flatten to long format. Each label is repeated once per AUC value actually
# collected, so the two columns stay aligned even if a run list does not hold
# exactly 100 entries.
AUC_value = [auc for label, aucs in auc_by_model for auc in aucs]
model = [label for label, aucs in auc_by_model for auc in aucs]

plot_data = {'AUC': AUC_value, 'model': model}
plot_df = pd.DataFrame(plot_data)

sns.set_theme(style="ticks")

f, ax = plt.subplots()
# NOTE(review): a log x-scale is unusual for a bounded metric such as AUC and
# looks like a leftover from a plotting template; kept to preserve the
# existing figure -- confirm whether a linear axis is intended.
ax.set_xscale("log")

# Draw both layers on the explicitly created axes instead of relying on the
# implicit "current axes" state.
sns.boxplot(x="AUC", y="model", data=plot_df,
            width=.6, palette="vlag", ax=ax)

# Overlay the individual runs as points on top of the boxes.
sns.stripplot(x="AUC", y="model", data=plot_df,
              size=4, color=".3", linewidth=0, ax=ax)

ax.xaxis.grid(True)
ax.set(ylabel="")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig("Model comparison.pdf", dpi=1000)

In [None]:
# Average AUC of models
# Group the long-format AUC table by its `model` label column; the bare
# expression on the last line is the cell's displayed output (per-group mean).
Class_group = plot_df.groupby(by='model')
Class_group.mean()