In [1]:
import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from src.myclassifier import MyClassifier
from src.mymetrics import MyMetrics
from src.ppi import PPI

In [2]:
def get_project_rootpath():
    """
    Locate the project root directory.

    The search starts at the current working directory (symlinks resolved) and
    walks upwards one directory at a time. The first directory that contains an
    entry whose name includes '.idea' (the PyCharm project folder) is returned.

    :return: absolute path of the project root
    :raises FileNotFoundError: if the filesystem root is reached without
        finding a '.idea' entry (the search used to loop forever in that case)
    """
    path = os.path.realpath(os.curdir)
    while True:
        # A PyCharm project always has a '.idea' folder at its root.
        if any('.idea' in entry for entry in os.listdir(path)):
            return path
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() of the filesystem root is the root itself: nothing left to search.
            raise FileNotFoundError(
                "No '.idea' entry found in the working directory or any of its parents")
        path = parent


# Switch the working directory to the project root so that the relative
# 'data/...' and 'model/...' paths used in this notebook resolve correctly.
os.chdir(get_project_rootpath())

In [3]:
# All paths are relative to the working directory.
# Root folder of the embedding files.
EMB_ROOT_PATH = 'data/emb/'
# File handed to PPI(ppi_network_path=...).
PPI_PATH = 'data/network/PPI-Network.txt'
# Folder whose files are listed and passed to dataloader() one by one.
DEEPWALK_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'deepwalk')

In [4]:
def dataloader(file_name, positive_gene: set, risklevel: dict):
    """
    Load one embedding file and build features, labels and per-sample weights.

    The file name without its '.emb' suffix is split on '_'. The first token is
    ignored; every other token contributes ``{first character: remainder}`` to
    the returned parameter dict (a token 'd128' becomes ``{'d': '128'}``).

    The first line of the file is skipped (presumably a header line -- TODO
    confirm). Every other non-blank line is split on whitespace: the first
    field is parsed as an integer id, the remaining fields as float features.

    :param file_name: name of a file inside DEEPWALK_EMB_ROOT_PATH
    :param positive_gene: integer ids that are labelled 1; all other ids get 0
    :param risklevel: maps each positive id to the factor its weight is multiplied by
    :return: tuple ``(param, X, target, sample_weights)`` -- the parameter dict,
        the float feature array, the int label array and a list with one weight
        per row
    :raises KeyError: if a positive id is missing from ``risklevel``
    """
    # str.strip('.emb') removes any of the characters '.', 'e', 'm', 'b' from
    # both ends and can eat into the last parameter; cut the suffix explicitly.
    suffix = '.emb'
    stem = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
    param = {token[0]: token[1:] for token in stem.split('_')[1:] if token}

    # Features and labels
    file_path = os.path.join(DEEPWALK_EMB_ROOT_PATH, file_name)
    with open(file_path, 'r') as f:
        next(f, None)  # skip the first line
        rows = [line.split() for line in f if line.strip()]
    gene_ids = [int(row[0]) for row in rows]  # parse each id once
    X = np.asarray([row[1:] for row in rows], dtype=float)
    target = np.asarray([1 if gene in positive_gene else 0 for gene in gene_ids], dtype=int)

    # Weights: balanced class weights, positives additionally scaled by their risk level.
    # `classes` is documented as an ndarray; recent scikit-learn rejects a plain list.
    class_weight = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=target)
    negative_weight, positive_weight = class_weight
    sample_weights = [risklevel[gene] * positive_weight if gene in positive_gene else negative_weight
                      for gene in gene_ids]
    return param, X, target, sample_weights


In [None]:
from sklearn.model_selection import train_test_split
import pickle

GENECOUNT_PATH = 'data/genecount.xls'
# Fixed seed: makes the 70/30 split reproducible, so a later evaluation can
# rebuild exactly the same held-out rows instead of scoring on training data.
SPLIT_SEED = 42

ppi = PPI(ppi_network_path=PPI_PATH, gene_count_path=GENECOUNT_PATH, k=6)
# os.listdir() returns entries in arbitrary order; sort for a stable run order
# and skip anything that is not an embedding file (dataloader expects '*.emb').
datasets = sorted(name for name in os.listdir(DEEPWALK_EMB_ROOT_PATH) if name.endswith('.emb'))
models = {}
# One dict per metric, keyed by (classifier name, d, l, n).
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

for dataset in datasets:
    params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    # Sample weights are only used for fitting; the test part is not needed.
    X_train, X_test, y_train, y_test, weight_train, _weight_test = train_test_split(
        X, y, weight, test_size=0.3, random_state=SPLIT_SEED)

    classifier = MyClassifier()
    classifier.train(X_train, y_train, weight=weight_train)
    # Both calls return a dict keyed by classifier name.
    y_pred_dict = classifier.predict(X_test)
    y_score_dict = classifier.predict_proba(X_test)

    for clf_name, y_pred in y_pred_dict.items():
        metric = MyMetrics(y_test, y_pred)
        acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score_dict[clf_name])
        key = (clf_name, params['d'], params['l'], params['n'])
        result_acc[key] = acc
        result_f1[key] = f1
        result_auc[key] = auc
        result_aupr[key] = aupr

    # Keep the trained classifier object, one entry per embedding file.
    model_name = f"{params['d']}_{params['l']}_{params['n']}_deepwalk"
    models[model_name] = classifier

# Save all models in a single file; create the target folder on a fresh checkout.
os.makedirs("model", exist_ok=True)
with open("model/deepwalk_all_models.pkl", "wb") as f:
    pickle.dump(models, f)

In [8]:
# Load the saved models and evaluate them on the held-out part of each dataset
from sklearn.model_selection import train_test_split
import pickle
import time

# Seed of the train/test split. The scores below are only meaningful when this
# equals the seed used for the split the models were trained on: with a
# different (or no) seed the "test" rows overlap the training rows.
SPLIT_SEED = 42

# NOTE(security): pickle.load can execute arbitrary code; only load model files
# that were produced by this project.
with open("model/deepwalk_all_models.pkl", "rb") as f:
    all_models = pickle.load(f)

GENECOUNT_PATH = 'data/genecount.xls'

ppi = PPI(ppi_network_path=PPI_PATH, gene_count_path=GENECOUNT_PATH, k=6)
# os.listdir() returns entries in arbitrary order; sort for a stable run order
# and skip anything that is not an embedding file (dataloader expects '*.emb').
datasets = sorted(name for name in os.listdir(DEEPWALK_EMB_ROOT_PATH) if name.endswith('.emb'))
# One dict per metric, keyed by (classifier name, d, l, n).
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

for step, dataset in enumerate(datasets, start=1):
    start_time = time.perf_counter()
    params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    # Only the test part is used here; the call has the same arguments as the
    # split made for training so that the same seed selects the same rows.
    _X_train, X_test, _y_train, y_test, _weight_train, _weight_test = train_test_split(
        X, y, weight, test_size=0.3, random_state=SPLIT_SEED)

    # Models are stored once per embedding file under '<d>_<l>_<n>_deepwalk'.
    model_name = f"{params['d']}_{params['l']}_{params['n']}_deepwalk"
    if model_name in all_models:
        classifier = all_models[model_name]
    else:
        # Fallback for model files keyed with a classifier-name prefix: the
        # previous lookup always ended up reading the 'nb_...' entry.
        classifier = all_models[f"nb_{model_name}"]

    # Both calls return a dict keyed by classifier name.
    y_pred_dict = classifier.predict(X_test)
    y_score_dict = classifier.predict_proba(X_test)
    for clf_name, y_pred in y_pred_dict.items():
        metric = MyMetrics(y_test, y_pred)
        acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score_dict[clf_name])
        key = (clf_name, params['d'], params['l'], params['n'])
        result_acc[key] = acc
        result_f1[key] = f1
        result_auc[key] = auc
        result_aupr[key] = aupr

    time_taken = time.perf_counter() - start_time
    print(f"Time taken for iteration {step}: {time_taken} seconds")

Finished.
Finished.
querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-11703...done.
Finished.
Time taken for iteration 1: 21.437336206436157 seconds
Time taken for iteration 2: 21.69511389732361 seconds
Time taken for iteration 3: 21.91542077064514 seconds
Time taken for iteration 4: 20.74600911140442 seconds
Time taken for iteration 5: 21.54145312309265 seconds
Time taken for iteration 6: 21.67549705505371 seconds
Time taken for iteration 7: 25.806737184524536 seconds
Time taken for iteration 8: 22.12010383605957 seconds
Time taken for iteration 9: 21.22550392150879 seconds
Time taken for iteration 10: 22.224627017974854 seconds
Time taken for iteration 11: 22.129969835281372 seconds
Time taken for iteration 12: 21.470248937606

In [9]:
def observe_param_effect(result_dict, method, param):
    """
    Average a metric over all runs of one classifier, grouped by one parameter.

    :param result_dict: maps ``(clf_name, dim, length, num)`` tuples to a score
    :param method: classifier name to select (first element of each key)
    :param param: which key element to group by: 'dim', 'length' or 'num'
    :return: dict ``{int(parameter value): mean score}`` in ascending order of
        the parameter value; empty when ``method`` has no entries
    :raises KeyError: if ``param`` is not one of 'dim', 'length', 'num'
    """
    param_map = {
        'dim': 1,
        'length': 2,
        'num': 3
    }
    position = param_map[param]

    # Group the scores of the requested classifier by the requested key element.
    # (The filter used to look at key[1] for every `param`, so 'length' and
    # 'num' were compared against the dim column.)
    grouped = {}
    for key, score in result_dict.items():
        if key[0] == method:
            grouped.setdefault(key[position], []).append(score)

    ordered = sorted(grouped.items(), key=lambda item: int(item[0]))
    return {int(value): sum(scores) / len(scores) for value, scores in ordered}


# Mean accuracy of the 'svm' entries in result_acc, grouped by the 'dim' key element.
observe_param_effect(result_acc, 'svm', 'dim')

{128: 0.898886827458256,
 256: 0.8676217961932248,
 64: 0.9022710094138665,
 512: 0.8581220366934653}

In [None]:
import matplotlib.pyplot as plt


def result_plotting(performance_indicator):
    """
    Show a 3D scatter plot of a metric over the (dim, length, num) grid.

    Each key supplies one point: its last three elements are converted to int
    and used as the x (dim), y (length) and z (num) coordinates, so both
    ``(dim, length, num)`` and ``(clf_name, dim, length, num)`` keys work.
    The point colour encodes the corresponding dict value.

    :param performance_indicator: dict mapping such key tuples to a numeric score
    :return: None; the figure is displayed with ``plt.show()``
    :raises ValueError: if a key has fewer than three elements
    """
    x = []
    y = []
    z = []
    for key in performance_indicator.keys():
        # Take the trailing three elements so a leading classifier name is ignored.
        dim, length, num = key[-3:]
        x.append(int(dim))
        y.append(int(length))
        z.append(int(num))

    # Create the figure and a 3D axes on it
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(projection="3d")

    # Faint grid lines
    ax.grid(visible=True, color='grey',
            linestyle='-.', linewidth=0.3,
            alpha=0.2)

    # One point per key, coloured by its score
    sctt = ax.scatter3D(x, y, z,
                        alpha=0.8,
                        c=list(performance_indicator.values()),
                        cmap='hsv')

    ax.set_title("simple 3D scatter plot")
    ax.set_xlabel('dim', fontweight='bold')
    ax.set_ylabel('length', fontweight='bold')
    ax.set_zlabel('num', fontweight='bold')
    fig.colorbar(sctt, ax=ax, shrink=0.5, aspect=5)

    # Show the plot
    plt.show()

# result_plotting(acc_nm)