In [5]:
import os
import pickle
import time

import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from src.mymetrics import MyMetrics
from src.ppi import PPI


def get_project_rootpath():
    """
    Locate the project root directory.

    Starting from the current working directory, walk up the directory tree
    and return the first directory that contains an entry named exactly
    ``.idea`` (the folder PyCharm creates in a project root).

    :return: symlink-resolved absolute path of the project root
    :raises FileNotFoundError: if the filesystem root is reached without
        finding a ``.idea`` entry
    """
    start = os.path.realpath(os.curdir)
    path = start
    while True:
        # Exact-name match: a substring test would also accept unrelated
        # entries such as 'notes.idea.bak'.
        if '.idea' in os.listdir(path):
            return path
        parent = os.path.dirname(path)
        # dirname() of the filesystem root is the root itself; without this
        # guard the loop would spin forever when no '.idea' exists.
        if parent == path:
            raise FileNotFoundError(
                f"No '.idea' directory found in '{start}' or any of its parents")
        path = parent


# Work from the project root so that the relative paths below resolve.
os.chdir(get_project_rootpath())
# Data locations, all relative to the project root.
EMB_ROOT_PATH = 'data/emb/'
PPI_PATH = 'data/network/PPI-Network.txt'
GENECOUNT_PATH = 'data/genecount.xls'
DEEPWALK_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'deepwalk')
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded')

# Build the PPI helper from the network and gene-count files.
# NOTE(review): the meaning of k=6 is defined in src.ppi — confirm there.
ppi = PPI(ppi_network_path=PPI_PATH, gene_count_path=GENECOUNT_PATH, k=6)

In [6]:
def dataloader(file_name, positive_gene: set, risklevel: dict):
    """
    Load one embedding file and build features, labels and sample weights.

    The file is read from ``ENCODED_EMB_ROOT_PATH``. Its first line is skipped;
    every other non-blank line is ``<integer id> <feature> <feature> ...``
    separated by whitespace.

    :param file_name: file name whose underscore-separated tokens 2-4 each
        consist of a one-letter key followed by its value (e.g. ``d128``)
    :param positive_gene: ids labelled as the positive class
    :param risklevel: multiplier applied to the weight of each positive id
    :return: ``(param, X, target, sample_weight)`` - the parsed name tokens as
        a dict of strings, the float feature matrix, the 0/1 label vector and
        a list with one weight per row
    :raises ValueError: if the file contains no data rows
    """
    # str.strip('.emb') removes any of the characters '.', 'e', 'm', 'b' from
    # BOTH ends of the name; only the literal extension must be dropped.
    suffix = '.emb'
    stem = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
    param = {token[0]: token[1:] for token in stem.split('_')[1:4]}

    # Training features and labels.
    file_path = os.path.join(ENCODED_EMB_ROOT_PATH, file_name)
    with open(file_path, 'r') as f:
        next(f, None)  # skip the first line
        # Blank lines (e.g. a trailing newline) carry no id and are ignored.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        raise ValueError(f"No data rows found in '{file_path}'")

    gene_ids = [int(row[0]) for row in rows]
    X = np.asarray([row[1:] for row in rows], dtype=float)
    target = np.asarray([1 if gene in positive_gene else 0 for gene in gene_ids],
                        dtype=int)

    # Weights: balanced class weights, with positives additionally scaled by
    # their risk level. `classes` is passed as an ndarray because recent
    # scikit-learn releases reject a plain list here (to be confirmed against
    # the installed version).
    class_weight = compute_class_weight(class_weight='balanced',
                                        classes=np.array([0, 1]), y=target)
    sample_weight = [risklevel[gene] * class_weight[1]
                     if gene in positive_gene else class_weight[0]
                     for gene in gene_ids]
    return param, X, target, sample_weight

In [None]:
# dataloader() reads this module-level name at call time, so it is (re)assigned
# here before the datasets are listed.
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded')
datasets = os.listdir(ENCODED_EMB_ROOT_PATH)
models_encoded = {}
# Metric dictionaries keyed by the (dim, length, num) strings parsed from the file name.
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators': 1550,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 5,
    'gamma': 0.5,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'seed': 420
}

# Create the output directory up front: discovering that it is missing only
# after every model has been trained would throw the whole run away.
os.makedirs("model", exist_ok=True)

for step, dataset in enumerate(datasets, start=1):
    start_time = time.time()
    params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    X_train, X_test, y_train, y_test, weight_train, weights_test = train_test_split(
        X, y, weight, test_size=0.3, random_state=420)

    classifier = xgboost.XGBClassifier(**xgb_params)
    # NOTE(review): weight_train is computed and split above but is not passed
    # to fit(); confirm whether sample_weight=weight_train was intended.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_score = classifier.predict_proba(X_test)[:, 1]

    metric = MyMetrics(y_test, y_pred)
    acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score)
    combo = (params['d'], params['l'], params['n'])
    result_acc[combo] = acc
    result_f1[combo] = f1
    result_auc[combo] = auc
    result_aupr[combo] = aupr
    # save the trained model ('aur' is the key later cells and saved files use for AUC)
    model_name = f"{params['d']}_{params['l']}_{params['n']}_deepwalk"
    models_encoded[model_name] = {'clf': classifier, 'f1': f1, 'aur': auc, 'acc': acc, 'aupr': aupr}

    time_taken = time.time() - start_time
    print(f"Time taken for iteration {step}: {time_taken} seconds")
    print(f"params: dim:{params['d']}, length:{params['l']}, num:{params['n']}")
    print(f"f1:{f1:.4f}, auc:{auc:.4f}, acc:{acc:.4f}, aupr:{aupr:.4f}\n")

# save all models in a file
with open("model/deepwalk_encoded_all_xgb_models.pkl", "wb") as f:
    pickle.dump(models_encoded, f)

In [None]:
def observe_param_effect(result_dict, param):
    """
    Average a metric over every result that shares one parameter value.

    :param result_dict: mapping from ``(dim, length, num)`` keys to a numeric score
    :param param: which key component to group by: ``'dim'``, ``'length'`` or ``'num'``
    :return: dict mapping each distinct value of the chosen component
        (converted to ``int``) to the mean score of the matching entries
    :raises KeyError: if ``param`` is not one of the three supported names
    """
    param_map = {
        'dim': 0,
        'length': 1,
        'num': 2
    }
    index = param_map[param]

    # Group on the *selected* key component. Grouping on component 0 regardless
    # of `param` only works for 'dim' and breaks 'length' and 'num'.
    grouped = {}
    for key, score in result_dict.items():
        grouped.setdefault(key[index], []).append(score)

    return {int(value): sum(scores) / len(scores)
            for value, scores in sorted(grouped.items(), key=lambda kv: int(kv[0]))}


# Mean AUPR for each distinct embedding dimension found in the result keys.
observe_param_effect(result_aupr, 'dim')

In [None]:
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
%matplotlib notebook

def log_tick_formatter(val, pos=None):
    """Return the tick label for a log2-scaled position: ``2 ** val`` truncated to an integer.

    ``pos`` is accepted for compatibility with matplotlib formatter callbacks and is unused.
    """
    original_value = int(2 ** val)
    return str(original_value)


def log_divide_tick_formatter(val, pos=None):
    """Return the tick label for a ``log2(x / 10)`` position: ``10 * 2 ** val`` truncated to an integer.

    ``pos`` is accepted for compatibility with matplotlib formatter callbacks and is unused.
    """
    original_value = int(10 * (2 ** val))
    return str(original_value)


def log2_divide_by_10(l):
    """Return a list containing ``log2(x / 10)`` for every element ``x`` of ``l``."""
    return [np.log2(value / 10) for value in l]


def plot_3d_params_impact(result_params: dict):
    """
    Draw a 3D scatter plot of a metric over the (dim, length, num) grid.

    Each key of ``result_params`` supplies the three coordinates (converted to
    ``int``) and each value supplies the point colour. Coordinates are plotted
    on log2 axes (length and num are divided by 10 first), and the tick labels
    are converted back to the original values.

    :param result_params: mapping from ``(dim, length, num)`` keys to a score
    """
    dims, lengths, nums, scores = [], [], [], []
    for key, score in result_params.items():
        dims.append(int(key[0]))
        lengths.append(int(key[1]))
        nums.append(int(key[2]))
        scores.append(score)

    # Move every axis to log2 space.
    log_dims = [np.log2(dim) for dim in dims]
    log_lengths = log2_divide_by_10(lengths)
    log_nums = log2_divide_by_10(nums)

    # Figure with a single 3D axes.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')

    # Faint grid lines on all three axes.
    ax.grid(visible=True, color='grey', linestyle='-.', linewidth=0.3, alpha=0.2)

    # Scatter the points, coloured by score with the 'hot' colour map.
    sctt = ax.scatter3D(log_dims, log_lengths, log_nums,
                        alpha=0.8, c=scores, cmap=plt.get_cmap('hot'))

    plt.title("simple 3D scatter plot")

    ax.set_xlabel('dim', fontweight='bold')
    ax.set_ylabel('length', fontweight='bold')
    ax.set_zlabel('num', fontweight='bold')

    # Per axis: maximum number of tick intervals and the label formatter that
    # undoes the log transform applied above.
    axis_settings = (
        (ax.xaxis, 4, log_tick_formatter),
        (ax.yaxis, 3, log_divide_tick_formatter),
        (ax.zaxis, 3, log_divide_tick_formatter),
    )
    for axis, max_ticks, label_func in axis_settings:
        axis.set_major_locator(ticker.MaxNLocator(max_ticks))
        axis.set_major_formatter(
            ticker.FuncFormatter(lambda x, pos, label_func=label_func: label_func(x)))

    fig.colorbar(sctt, ax=ax, shrink=0.5, aspect=5)
    plt.show()


# Accuracy for every (dim, length, num) combination.
plot_3d_params_impact(result_acc)

In [None]:
# AUC for every (dim, length, num) combination.
plot_3d_params_impact(result_auc)

In [None]:
# F1 for every (dim, length, num) combination.
plot_3d_params_impact(result_f1)

In [None]:
# AUPR for every (dim, length, num) combination.
plot_3d_params_impact(result_aupr)

In [None]:
import dill

# Persist the four metric dictionaries as consecutive dill records in one file.
# Any reader must load them back in this same order: acc, aupr, f1, auc.
with open('log/result_deepwalk.pkl', 'wb') as f:
    dill.dump(result_acc, f)
    dill.dump(result_aupr, f)
    dill.dump(result_f1, f)
    dill.dump(result_auc, f)

In [None]:
# Reload the metric dictionaries; the load order mirrors the dump order
# (acc, aupr, f1, auc).
# NOTE(review): dill.load can execute arbitrary code — only open trusted files.
with open('log/result_deepwalk.pkl', 'rb') as f:
    result_acc = dill.load(f)
    result_aupr = dill.load(f)
    result_f1 = dill.load(f)
    result_auc = dill.load(f)
print(result_acc, result_aupr, result_f1, result_auc)

In [None]:
def find_max(data_dict):
    """
    Return the ``(key, value)`` pair holding the largest value of ``data_dict``.

    The first pair wins on ties. ``None`` is returned when the dictionary is
    empty or when no value is strictly greater than negative infinity.
    """
    best = None
    for item in data_dict.items():
        # Before anything has been selected, the bar to beat is negative infinity.
        threshold = float('-inf') if best is None else best[1]
        if item[1] > threshold:
            best = item
    return best


# Best-scoring (dim, length, num) key and its value, one line per metric.
print(find_max(result_acc))
print(find_max(result_f1))
print(find_max(result_auc))
print(find_max(result_aupr))

In [7]:
# Load the dictionary of trained models from the path the training cell writes to.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('model/deepwalk_encoded_all_xgb_models.pkl', 'rb') as f:
    models = pickle.load(f)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, average_precision_score

# cross_val_score accepts a single metric only; multi-metric evaluation needs
# cross_validate.
from sklearn.model_selection import cross_validate

# Mean cross-validated score per (dim, length, num) combination.
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

# Built-in scorer names: 'roc_auc' and 'average_precision' rank on the model's
# continuous scores, whereas make_scorer(roc_auc_score) would be fed hard 0/1
# predictions.
scoring = {
    'f1': 'f1',
    'auc': 'roc_auc',
    'accuracy': 'accuracy',
    'aupr': 'average_precision'
}

for step, dataset in enumerate(datasets, start=1):
    start_time = time.time()
    # Cross-validation does its own splitting, so neither a train/test split
    # nor the sample weights are needed here.
    params, X, y, _ = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    key_index = params['d'] + '_' + params['l'] + '_' + params['n'] + '_deepwalk'
    # Every entry of `models` is a dict; the estimator lives under 'clf'.
    classifier = models[key_index]['clf']
    cv_scores = cross_validate(classifier, X, y, cv=5, scoring=scoring)
    # cross_validate returns one array per metric under 'test_<name>';
    # reduce each to its mean over the folds.
    mean_scores = {name: float(np.mean(cv_scores[f'test_{name}'])) for name in scoring}

    combo = (params['d'], params['l'], params['n'])
    result_acc[combo] = mean_scores['accuracy']
    result_f1[combo] = mean_scores['f1']
    result_auc[combo] = mean_scores['auc']
    result_aupr[combo] = mean_scores['aupr']

    time_taken = time.time() - start_time
    print(f"Time taken for iteration {step}: {time_taken} seconds")
    print(f"params: dim:{params['d']}, length:{params['l']}, num:{params['n']}")
    print(
        f"f1:{mean_scores['f1']:.4f}, auc:{mean_scores['auc']:.4f}, acc:{mean_scores['accuracy']:.4f}, aupr:{mean_scores['aupr']:.4f}\n")
#%%
import os
import pickle
import time

import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from src.mymetrics import MyMetrics
from src.ppi import PPI


def get_project_rootpath():
    """
    Locate the project root directory.

    Starting from the current working directory, walk up the directory tree
    and return the first directory that contains an entry named exactly
    ``.idea`` (the folder PyCharm creates in a project root).

    :return: symlink-resolved absolute path of the project root
    :raises FileNotFoundError: if the filesystem root is reached without
        finding a ``.idea`` entry
    """
    start = os.path.realpath(os.curdir)
    path = start
    while True:
        # Exact-name match: a substring test would also accept unrelated
        # entries such as 'notes.idea.bak'.
        if '.idea' in os.listdir(path):
            return path
        parent = os.path.dirname(path)
        # dirname() of the filesystem root is the root itself; without this
        # guard the loop would spin forever when no '.idea' exists.
        if parent == path:
            raise FileNotFoundError(
                f"No '.idea' directory found in '{start}' or any of its parents")
        path = parent


# Work from the project root so that the relative paths below resolve.
os.chdir(get_project_rootpath())
# Data locations, all relative to the project root.
EMB_ROOT_PATH = 'data/emb/'
PPI_PATH = 'data/network/PPI-Network.txt'
GENECOUNT_PATH = 'data/genecount.xls'
DEEPWALK_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'deepwalk')
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded')

# Build the PPI helper from the network and gene-count files.
# NOTE(review): the meaning of k=6 is defined in src.ppi — confirm there.
ppi = PPI(ppi_network_path=PPI_PATH, gene_count_path=GENECOUNT_PATH, k=6)

In [None]:
def dataloader(file_name, positive_gene: set, risklevel: dict):
    """
    Load one embedding file and build features, labels and sample weights.

    The file is read from ``ENCODED_EMB_ROOT_PATH``. Its first line is skipped;
    every other non-blank line is ``<integer id> <feature> <feature> ...``
    separated by whitespace.

    :param file_name: file name whose underscore-separated tokens 2-4 each
        consist of a one-letter key followed by its value (e.g. ``d128``)
    :param positive_gene: ids labelled as the positive class
    :param risklevel: multiplier applied to the weight of each positive id
    :return: ``(param, X, target, sample_weight)`` - the parsed name tokens as
        a dict of strings, the float feature matrix, the 0/1 label vector and
        a list with one weight per row
    :raises ValueError: if the file contains no data rows
    """
    # str.strip('.emb') removes any of the characters '.', 'e', 'm', 'b' from
    # BOTH ends of the name; only the literal extension must be dropped.
    suffix = '.emb'
    stem = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
    param = {token[0]: token[1:] for token in stem.split('_')[1:4]}

    # Training features and labels.
    file_path = os.path.join(ENCODED_EMB_ROOT_PATH, file_name)
    with open(file_path, 'r') as f:
        next(f, None)  # skip the first line
        # Blank lines (e.g. a trailing newline) carry no id and are ignored.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        raise ValueError(f"No data rows found in '{file_path}'")

    gene_ids = [int(row[0]) for row in rows]
    X = np.asarray([row[1:] for row in rows], dtype=float)
    target = np.asarray([1 if gene in positive_gene else 0 for gene in gene_ids],
                        dtype=int)

    # Weights: balanced class weights, with positives additionally scaled by
    # their risk level. `classes` is passed as an ndarray because recent
    # scikit-learn releases reject a plain list here (to be confirmed against
    # the installed version).
    class_weight = compute_class_weight(class_weight='balanced',
                                        classes=np.array([0, 1]), y=target)
    sample_weight = [risklevel[gene] * class_weight[1]
                     if gene in positive_gene else class_weight[0]
                     for gene in gene_ids]
    return param, X, target, sample_weight

In [None]:
# dataloader() reads this module-level name at call time, so it is (re)assigned
# here before the datasets are listed.
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded')
datasets = os.listdir(ENCODED_EMB_ROOT_PATH)
models_encoded = {}
# Metric dictionaries keyed by the (dim, length, num) strings parsed from the file name.
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators': 1550,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 5,
    'gamma': 0.5,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'seed': 420
}

# Create the output directory up front: discovering that it is missing only
# after every model has been trained would throw the whole run away.
os.makedirs("model", exist_ok=True)

for step, dataset in enumerate(datasets, start=1):
    start_time = time.time()
    params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    X_train, X_test, y_train, y_test, weight_train, weights_test = train_test_split(
        X, y, weight, test_size=0.3, random_state=420)

    classifier = xgboost.XGBClassifier(**xgb_params)
    # NOTE(review): weight_train is computed and split above but is not passed
    # to fit(); confirm whether sample_weight=weight_train was intended.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_score = classifier.predict_proba(X_test)[:, 1]

    metric = MyMetrics(y_test, y_pred)
    acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score)
    combo = (params['d'], params['l'], params['n'])
    result_acc[combo] = acc
    result_f1[combo] = f1
    result_auc[combo] = auc
    result_aupr[combo] = aupr
    # save the trained model ('aur' is the key later cells and saved files use for AUC)
    model_name = f"{params['d']}_{params['l']}_{params['n']}_deepwalk"
    models_encoded[model_name] = {'clf': classifier, 'f1': f1, 'aur': auc, 'acc': acc, 'aupr': aupr}

    time_taken = time.time() - start_time
    print(f"Time taken for iteration {step}: {time_taken} seconds")
    print(f"params: dim:{params['d']}, length:{params['l']}, num:{params['n']}")
    print(f"f1:{f1:.4f}, auc:{auc:.4f}, acc:{acc:.4f}, aupr:{aupr:.4f}\n")

# save all models in a file
with open("model/deepwalk_encoded_all_xgb_models.pkl", "wb") as f:
    pickle.dump(models_encoded, f)

In [None]:
def observe_param_effect(result_dict, param):
    """
    Average a metric over every result that shares one parameter value.

    :param result_dict: mapping from ``(dim, length, num)`` keys to a numeric score
    :param param: which key component to group by: ``'dim'``, ``'length'`` or ``'num'``
    :return: dict mapping each distinct value of the chosen component
        (converted to ``int``) to the mean score of the matching entries
    :raises KeyError: if ``param`` is not one of the three supported names
    """
    param_map = {
        'dim': 0,
        'length': 1,
        'num': 2
    }
    index = param_map[param]

    # Group on the *selected* key component. Grouping on component 0 regardless
    # of `param` only works for 'dim' and breaks 'length' and 'num'.
    grouped = {}
    for key, score in result_dict.items():
        grouped.setdefault(key[index], []).append(score)

    return {int(value): sum(scores) / len(scores)
            for value, scores in sorted(grouped.items(), key=lambda kv: int(kv[0]))}


# Mean AUPR for each distinct embedding dimension found in the result keys.
observe_param_effect(result_aupr, 'dim')

In [None]:
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
%matplotlib notebook

def log_tick_formatter(val, pos=None):
    """Return the tick label for a log2-scaled position: ``2 ** val`` truncated to an integer.

    ``pos`` is accepted for compatibility with matplotlib formatter callbacks and is unused.
    """
    original_value = int(2 ** val)
    return str(original_value)


def log_divide_tick_formatter(val, pos=None):
    """Return the tick label for a ``log2(x / 10)`` position: ``10 * 2 ** val`` truncated to an integer.

    ``pos`` is accepted for compatibility with matplotlib formatter callbacks and is unused.
    """
    original_value = int(10 * (2 ** val))
    return str(original_value)


def log2_divide_by_10(l):
    """Return a list containing ``log2(x / 10)`` for every element ``x`` of ``l``."""
    return [np.log2(value / 10) for value in l]


def plot_3d_params_impact(result_params: dict):
    """
    Draw a 3D scatter plot of a metric over the (dim, length, num) grid.

    Each key of ``result_params`` supplies the three coordinates (converted to
    ``int``) and each value supplies the point colour. Coordinates are plotted
    on log2 axes (length and num are divided by 10 first), and the tick labels
    are converted back to the original values.

    :param result_params: mapping from ``(dim, length, num)`` keys to a score
    """
    dims, lengths, nums, scores = [], [], [], []
    for key, score in result_params.items():
        dims.append(int(key[0]))
        lengths.append(int(key[1]))
        nums.append(int(key[2]))
        scores.append(score)

    # Move every axis to log2 space.
    log_dims = [np.log2(dim) for dim in dims]
    log_lengths = log2_divide_by_10(lengths)
    log_nums = log2_divide_by_10(nums)

    # Figure with a single 3D axes.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')

    # Faint grid lines on all three axes.
    ax.grid(visible=True, color='grey', linestyle='-.', linewidth=0.3, alpha=0.2)

    # Scatter the points, coloured by score with the 'hot' colour map.
    sctt = ax.scatter3D(log_dims, log_lengths, log_nums,
                        alpha=0.8, c=scores, cmap=plt.get_cmap('hot'))

    plt.title("simple 3D scatter plot")

    ax.set_xlabel('dim', fontweight='bold')
    ax.set_ylabel('length', fontweight='bold')
    ax.set_zlabel('num', fontweight='bold')

    # Per axis: maximum number of tick intervals and the label formatter that
    # undoes the log transform applied above.
    axis_settings = (
        (ax.xaxis, 4, log_tick_formatter),
        (ax.yaxis, 3, log_divide_tick_formatter),
        (ax.zaxis, 3, log_divide_tick_formatter),
    )
    for axis, max_ticks, label_func in axis_settings:
        axis.set_major_locator(ticker.MaxNLocator(max_ticks))
        axis.set_major_formatter(
            ticker.FuncFormatter(lambda x, pos, label_func=label_func: label_func(x)))

    fig.colorbar(sctt, ax=ax, shrink=0.5, aspect=5)
    plt.show()


# Accuracy for every (dim, length, num) combination.
plot_3d_params_impact(result_acc)

In [None]:
# AUC for every (dim, length, num) combination.
plot_3d_params_impact(result_auc)

In [None]:
# F1 for every (dim, length, num) combination.
plot_3d_params_impact(result_f1)

In [None]:
# AUPR for every (dim, length, num) combination.
plot_3d_params_impact(result_aupr)

In [None]:
import dill

# Persist the four metric dictionaries as consecutive dill records in one file.
# Any reader must load them back in this same order: acc, aupr, f1, auc.
with open('log/result_deepwalk.pkl', 'wb') as f:
    dill.dump(result_acc, f)
    dill.dump(result_aupr, f)
    dill.dump(result_f1, f)
    dill.dump(result_auc, f)

In [None]:
# Reload the metric dictionaries; the load order mirrors the dump order
# (acc, aupr, f1, auc).
# NOTE(review): dill.load can execute arbitrary code — only open trusted files.
with open('log/result_deepwalk.pkl', 'rb') as f:
    result_acc = dill.load(f)
    result_aupr = dill.load(f)
    result_f1 = dill.load(f)
    result_auc = dill.load(f)
print(result_acc, result_aupr, result_f1, result_auc)

In [None]:
def find_max(data_dict):
    """
    Return the ``(key, value)`` pair holding the largest value of ``data_dict``.

    The first pair wins on ties. ``None`` is returned when the dictionary is
    empty or when no value is strictly greater than negative infinity.
    """
    best = None
    for item in data_dict.items():
        # Before anything has been selected, the bar to beat is negative infinity.
        threshold = float('-inf') if best is None else best[1]
        if item[1] > threshold:
            best = item
    return best


# Best-scoring (dim, length, num) key and its value, one line per metric.
print(find_max(result_acc))
print(find_max(result_f1))
print(find_max(result_auc))
print(find_max(result_aupr))

In [8]:
# Load the dictionary of trained models from the path the training cell writes to.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('model/deepwalk_encoded_all_xgb_models.pkl', 'rb') as f:
    models = pickle.load(f)

In [9]:
# Display the loaded mapping of model name -> {'clf', 'f1', 'aur', 'acc', 'aupr'}.
models

{'128_20_80_deepwalk': {'clf': XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='auc', gamma=0.5, gpu_id=-1, grow_policy='depthwise',
                importance_type=None, interaction_constraints='',
                learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
                max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=5,
                missing=nan, monotone_constraints='()', n_estimators=1550,
                n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=420,
                reg_alpha=0.1, reg_lambda=1, ...),
  'f1': 0.7161271841879117,
  'aur': 0.7689163559262312,
  'acc': 0.6935683364254793,
  'aupr': 0.7677139489585587},
 '256_20_20_deepwalk': {'clf': XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_bylevel=1,

In [15]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, average_precision_score

# dataloader() reads this module-level name at call time, so pointing it at
# 'encoded_deepwalk' redirects every load below to that folder.
ENCODED_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'encoded_deepwalk')
datasets = os.listdir(ENCODED_EMB_ROOT_PATH)

# Mean cross-validated score per (dim, length, num) combination.
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

# Built-in scorer names: 'roc_auc' and 'average_precision' rank on the model's
# continuous scores, whereas make_scorer(roc_auc_score) would be fed hard 0/1
# predictions.
scoring = {
    'f1': 'f1',
    'auc': 'roc_auc',
    'accuracy': 'accuracy',
    'aupr': 'average_precision'
}

for step, dataset in enumerate(datasets, start=1):
    start_time = time.time()
    # Cross-validation does its own splitting, so neither a train/test split
    # nor the sample weights are needed here.
    params, X, y, _ = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    key_index = params['d'] + '_' + params['l'] + '_' + params['n'] + '_deepwalk'
    # Every entry of `models` is a dict; the estimator lives under 'clf'.
    classifier = models[key_index]['clf']
    cv_scores = cross_validate(classifier, X, y, cv=5, scoring=scoring)
    # cross_validate returns one array per metric under 'test_<name>';
    # reduce each to its mean over the folds.
    mean_scores = {name: float(np.mean(cv_scores[f'test_{name}'])) for name in scoring}

    combo = (params['d'], params['l'], params['n'])
    result_acc[combo] = mean_scores['accuracy']
    result_f1[combo] = mean_scores['f1']
    result_auc[combo] = mean_scores['auc']
    result_aupr[combo] = mean_scores['aupr']

    time_taken = time.time() - start_time
    print(f"Time taken for iteration {step}: {time_taken} seconds")
    print(f"params: dim:{params['d']}, length:{params['l']}, num:{params['n']}")
    print(
        f"f1:{mean_scores['f1']:.4f}, auc:{mean_scores['auc']:.4f}, acc:{mean_scores['accuracy']:.4f}, aupr:{mean_scores['aupr']:.4f}\n")


KeyboardInterrupt: 