In [1]:
import os
import pickle
import time

import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from src.mymetrics import MyMetrics
from src.ppi import PPI


def get_project_rootpath():
    """
    获取项目根目录。此函数的能力体现在，不论当前module被import到任何位置，都可以正确获取项目根目录
    :return:
    """
    path = os.path.realpath(os.curdir)
    while True:
        for subpath in os.listdir(path):
            # PyCharm项目中，'.idea'是必然存在的，且名称唯一
            if '.idea' in subpath:
                return path
        path = os.path.dirname(path)


os.chdir(get_project_rootpath())
EMB_ROOT_PATH = 'data/emb/'
PPI_PATH = 'data/network/PPI-Network.txt'
GENECOUNT_PATH = 'data/genecount.xls'
DEEPWALK_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'deepwalk')
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded')

ppi = PPI(ppi_network_path=PPI_PATH, gene_count_path=GENECOUNT_PATH, k=6)

In [2]:
def dataloader(file_name, positive_gene: set, risklevel: dict):
    data = file_name.strip('.emb').split('_')
    param = {i[0]: i[1:] for i in data[1:4]}

    # 训练特征和label
    file_path = os.path.join(ENCODED_EMB_ROOT_PATH, file_name)
    with open(file_path, 'r') as f:
        data = [line.strip().split() for line in f.readlines()[1:]]
    X = [line[1:] for line in data]
    target = [1 if int(line[0]) in positive_gene else 0 for line in data]
    X = np.asarray(X, dtype=float)
    target = np.asarray(target, dtype=int)

    # 权重
    class_weight = compute_class_weight(class_weight='balanced', classes=[0, 1], y=target)
    sample_weight = [risklevel[int(line[0])] * class_weight[1]
                     if int(line[0]) in positive_gene else class_weight[0] for line in data]
    return param, X, target, sample_weight

In [3]:
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded')
datasets = os.listdir(ENCODED_EMB_ROOT_PATH)
models_encoded = {}
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}
step = 1

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators': 1550,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 5,
    'gamma': 0.5,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'seed': 420
}

classifier = xgboost.XGBClassifier(**xgb_params)

for dataset in datasets:
    start_time = time.time()
    params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    X_train, X_test, y_train, y_test, weight_train, weights_test = train_test_split(X, y,
                                                                                    weight,
                                                                                    test_size=0.3,
                                                                                    random_state=420)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_score = classifier.predict_proba(X_test)[:, 1]

    metric = MyMetrics(y_test, y_pred)
    acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score)
    result_acc[(params['d'], params['l'], params['n'])] = acc
    result_f1[(params['d'], params['l'], params['n'])] = f1
    result_auc[(params['d'], params['l'], params['n'])] = auc
    result_aupr[(params['d'], params['l'], params['n'])] = aupr
    # save the trained model
    model_name = f"{params['d']}_{params['l']}_{params['n']}_deepwalk"
    models_encoded[model_name] = {'clf': classifier, 'f1': f1, 'aur': auc, 'acc': acc, 'aupr': aupr}

    end_time = time.time()
    time_taken = end_time - start_time
    print(f"Time taken for iteration {step}: {time_taken} seconds")
    print(f"params: dim:{params['d']}, length:{params['l']}, num:{params['n']}")
    print(f"f1:{f1:.4f}, auc:{auc:.4f}, acc:{acc:.4f}, aupr:{aupr:.4f}\n")
    step += 1

# save all models in a file
with open("model/deepwalk_encoded_all_xgb_models.pkl", "wb") as f:
    pickle.dump(models_encoded, f)

Time taken for iteration 1: 70.66596388816833 seconds
params: dim:128, length:20, num:80
f1:0.7161, auc:0.7689, acc:0.6936, aupr:0.7677

Time taken for iteration 2: 69.92443704605103 seconds
params: dim:256, length:20, num:20
f1:0.7059, auc:0.7592, acc:0.6849, aupr:0.7611

Time taken for iteration 3: 68.70206809043884 seconds
params: dim:128, length:80, num:40
f1:0.7225, auc:0.7774, acc:0.7022, aupr:0.7763

Time taken for iteration 4: 68.87228107452393 seconds
params: dim:512, length:40, num:40
f1:0.7237, auc:0.7754, acc:0.7033, aupr:0.7719

Time taken for iteration 5: 67.66861009597778 seconds
params: dim:64, length:40, num:80
f1:0.7295, auc:0.7873, acc:0.7104, aupr:0.7890

Time taken for iteration 6: 70.11348295211792 seconds
params: dim:128, length:80, num:20
f1:0.7245, auc:0.7793, acc:0.7052, aupr:0.7772

Time taken for iteration 7: 67.70546007156372 seconds
params: dim:256, length:80, num:80
f1:0.7251, auc:0.7779, acc:0.7016, aupr:0.7792

Time taken for iteration 8: 67.37105798721

In [4]:
def observe_param_effect(result_dict, param):
    def get_param_values(p):
        values = set()
        for key in result_dict.keys():
            values.add(key[p])
        return list(values)

    def get_avg_param(k):
        k_items = list(filter(lambda x: x[0][0] == k, result_dict.items()))
        k_values = [x[1] for x in k_items]

        avg_k = sum(k_values) / len(k_values)
        return avg_k

    param_map = {
        'dim': 0,
        'length': 1,
        'num': 2
    }
    return {int(k): get_avg_param(k) for k in get_param_values(param_map[param])}


observe_param_effect(result_aupr, 'dim')

{128: 0.7705202330927562,
 512: 0.7745656052637147,
 64: 0.789101162643912,
 256: 0.770303549949294}

In [5]:
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
%matplotlib notebook

def log_tick_formatter(val, pos=None):
    return f"{int(2 ** val)}"


def log_divide_tick_formatter(val, pos=None):
    return f"{int(10 * (2 ** val))}"


def log2_divide_by_10(l):
    divide_list = list(map(lambda x: x / 10, l))
    log_list = list(map(lambda x: np.log2(x), divide_list))
    return log_list


def plot_3d_params_impact(result_params: dict):
    x = []
    y = []
    z = []
    c = []
    for k, v in result_params.items():
        x.append(int(k[0]))
        y.append(int(k[1]))
        z.append(int(k[2]))
        c.append(v)

    x = list(map(lambda x: np.log2(x), x))
    y = log2_divide_by_10(y)
    z = log2_divide_by_10(z)

    # Creating figure
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')

    # Add x, y, z gridlines
    ax.grid(visible=True, color='grey',
            linestyle='-.', linewidth=0.3,
            alpha=0.2)

    # Creating color map
    my_cmap = plt.get_cmap('hot')

    # Creating plot
    sctt = ax.scatter3D(x, y, z, alpha=0.8, c=c, cmap=my_cmap)

    plt.title("simple 3D scatter plot")

    ax.set_xlabel('dim', fontweight='bold')
    ax.set_ylabel('length', fontweight='bold')
    ax.set_zlabel('num', fontweight='bold')

    ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
    ax.yaxis.set_major_locator(ticker.MaxNLocator(3))
    ax.zaxis.set_major_locator(ticker.MaxNLocator(3))

    # Set x, y, z axis tick formatters
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: log_tick_formatter(x)))
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: log_divide_tick_formatter(x)))
    ax.zaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: log_divide_tick_formatter(x)))

    fig.colorbar(sctt, ax=ax, shrink=0.5, aspect=5)
    plt.show()


plot_3d_params_impact(result_acc)

<IPython.core.display.Javascript object>

In [6]:
plot_3d_params_impact(result_auc)

<IPython.core.display.Javascript object>

In [10]:
plot_3d_params_impact(result_f1)

<IPython.core.display.Javascript object>

In [9]:
plot_3d_params_impact(result_aupr)

<IPython.core.display.Javascript object>

In [14]:
import dill
with open('log/xgboost_result.pkl','wb') as f:
    dill.dump(result_acc,f)
    dill.dump(result_aupr,f)
    dill.dump(result_f1,f)
    dill.dump(result_auc,f)

In [6]:
# datasets = os.listdir(DEEPWALK_EMB_ROOT_PATH)
#
# models = {}
# result_acc = {}
# result_f1 = {}
# result_auc = {}
# result_aupr = {}
# step = 1
#
# xgb_params = {
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
#     'n_estimators': 1550,
#     'learning_rate': 0.01,
#     'max_depth': 6,
#     'min_child_weight': 5,
#     'gamma': 0.5,
#     'subsample': 0.8,
#     'colsample_bytree': 0.9,
#     'reg_alpha': 0.1,
#     'reg_lambda': 1,
#     'seed': 420
# }
# classifier = xgboost.XGBClassifier(**xgb_params)
#
# for dataset in datasets:
#     start_time = time.time()
#     params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
#     X_train, X_test, y_train, y_test, weight_train, weights_test = train_test_split(X, y,
#                                                                                     weight,
#                                                                                     test_size=0.3,
#                                                                                     random_state=420)
#
#     classifier.fit(X_train, y_train)
#     y_pred = classifier.predict(X_test)
#     y_score = classifier.predict_proba(X_test)[:, 1]
#
#     metric = MyMetrics(y_test, y_pred)
#     acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score)
#     result_acc[(params['d'], params['l'], params['n'])] = acc
#     result_f1[(params['d'], params['l'], params['n'])] = f1
#     result_auc[(params['d'], params['l'], params['n'])] = auc
#     result_aupr[(params['d'], params['l'], params['n'])] = aupr
#     # save the trained model
#     model_name = f"{params['d']}_{params['l']}_{params['n']}_deepwalk"
#     models[model_name] = {'clf': classifier, 'f1': f1, 'aur': auc, 'acc': acc, 'aupr': aupr}
#     end_time = time.time()
#     time_taken = end_time - start_time
#
#     print(f"Time taken for iteration {step}: {time_taken} seconds")
#     print(f"iteration {step} params: dim:{params['d']}, length:{params['l']}, num:{params['n']}")
#     print(f"f1:{f1:.4f}, auc:{auc:.4f}, acc:{acc:.4f}, aupr:{aupr:.4f}\n")
#     step += 1
#
# # save all models in a file
# with open("model/deepwalk_all_xgb_models.pkl", "wb") as f:
#     pickle.dump(models, f)

Time taken for iteration 1: 83.20530414581299 seconds
iteration 1 params: dim:64, length:40, num:20
f1:0.7315, auc:0.7896, acc:0.7143, aupr:0.7903

Time taken for iteration 2: 154.36654114723206 seconds
iteration 2 params: dim:128, length:80, num:20
f1:0.7276, auc:0.7876, acc:0.7106, aupr:0.7884

Time taken for iteration 3: 146.87323784828186 seconds
iteration 3 params: dim:128, length:20, num:20
f1:0.7132, auc:0.7698, acc:0.6967, aupr:0.7777

Time taken for iteration 4: 78.93629693984985 seconds
iteration 4 params: dim:64, length:20, num:80
f1:0.7243, auc:0.7830, acc:0.7047, aupr:0.7865

Time taken for iteration 5: 145.19131898880005 seconds
iteration 5 params: dim:128, length:40, num:40
f1:0.7234, auc:0.7840, acc:0.7036, aupr:0.7872

Time taken for iteration 6: 81.8835711479187 seconds
iteration 6 params: dim:64, length:80, num:40
f1:0.7300, auc:0.7883, acc:0.7081, aupr:0.7913

Time taken for iteration 7: 148.77970385551453 seconds
iteration 7 params: dim:128, length:40, num:80
f1:0.