In [1]:
import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from src.myclassifier import MyClassifier
from src.mymetrics import MyMetric
from src.ppi import PPI

In [2]:
def get_project_rootpath():
    """
    Locate the project root directory.

    Starting from the current working directory, walk up through the parent
    directories and return the first one that contains an entry named exactly
    ``.idea`` (the marker directory PyCharm creates in a project root).

    :return: absolute, symlink-resolved path of the project root
    :raises FileNotFoundError: if the filesystem root is reached without
        finding a ``.idea`` entry
    """
    path = os.path.realpath(os.curdir)
    while True:
        # Exact-name check: a substring test would also accept unrelated
        # entries such as 'notes.idea.bak' and return the wrong directory.
        if os.path.exists(os.path.join(path, '.idea')):
            return path
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() of the filesystem root is the root itself, so without
            # this check the loop would spin forever when no marker exists.
            raise FileNotFoundError(
                "No '.idea' directory found in the current working directory "
                "or any of its parents")
        path = parent


# Switch the working directory to the project root so that the relative
# 'data/...' paths used by the following cells resolve regardless of where
# the notebook kernel was started.
os.chdir(get_project_rootpath())

In [3]:
# Root directory of the embedding files (relative to the working directory).
EMB_ROOT_PATH = 'data/emb/'
# Network file passed to PPI(...) as `ppi_network_path`.
PPI_PATH = 'data/network/PPI-Network.txt'
# Sub-directory whose files are listed and loaded one by one via `dataloader`
# (presumably embeddings produced by the LINE algorithm — TODO confirm).
LINE_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'line')

In [4]:
def dataloader(file_name, positive_gene: set, risklevel: dict):
    """
    Load one embedding file and build features, labels and sample weights.

    :param file_name: name of an embedding file inside ``LINE_EMB_ROOT_PATH``.
        The name (without its ``.emb`` suffix) is split on ``_``; every token
        after the first is read as ``<one-letter key><value>``.
    :param positive_gene: set of integer gene ids treated as positive examples.
    :param risklevel: mapping from gene id to a multiplier that is applied to
        the positive-class weight of that gene; it must contain every gene id
        in ``positive_gene`` that occurs in the file (presumably how often the
        gene was observed — TODO confirm).
    :return: ``(param, X, target, sample_weights)`` where ``param`` is the dict
        parsed from the file name (values are strings), ``X`` is a float array
        of the embedding vectors, ``target`` is an int array of 0/1 labels and
        ``sample_weights`` is a list with one weight per row.
    """
    # Remove the '.emb' suffix only. str.strip('.emb') would strip any run of
    # the characters '.', 'e', 'm', 'b' from BOTH ends of the name and could
    # eat into the first or last token.
    suffix = '.emb'
    stem = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
    tokens = stem.split('_')
    # Empty tokens (from a doubled '_') carry no key and are ignored.
    param = {token[0]: token[1:] for token in tokens[1:] if token}

    # Features and labels
    file_path = os.path.join(LINE_EMB_ROOT_PATH, file_name)
    with open(file_path, 'r') as f:
        next(f, None)  # the first line is not a data row (presumably a header)
        # Skip blank lines; they would otherwise fail on row[0] below.
        rows = [line.split() for line in f if line.strip()]

    # Parse each gene id once and reuse it for both labels and weights.
    gene_ids = [int(row[0]) for row in rows]
    is_positive = [gene_id in positive_gene for gene_id in gene_ids]
    X = np.asarray([row[1:] for row in rows], dtype=float)
    target = np.asarray(is_positive, dtype=int)

    # Weights: balanced class weights, with positives additionally scaled by
    # their per-gene risk level. `classes` is passed as an ndarray because
    # that is the documented type and newer scikit-learn releases validate it.
    class_weight = compute_class_weight(class_weight='balanced',
                                        classes=np.array([0, 1]), y=target)
    sample_weights = [
        risklevel[gene_id] * class_weight[1] if positive else class_weight[0]
        for gene_id, positive in zip(gene_ids, is_positive)
    ]
    return param, X, target, sample_weights


In [7]:
from sklearn.model_selection import train_test_split
import pickle

GENECOUNT_PATH = 'data/genecount.xls'
# Fixed seed so the train/test split, and therefore every reported metric,
# is reproducible across runs and comparable across embedding files.
RANDOM_STATE = 42

ppi = PPI(ppi_network_path=PPI_PATH, gene_count_path=GENECOUNT_PATH, k=6)
# os.listdir() returns entries in arbitrary order and may include things that
# are not embeddings (hidden files, sub-directories); keep only '.emb' files
# and sort them so runs are deterministic.
datasets = sorted(name for name in os.listdir(LINE_EMB_ROOT_PATH)
                  if name.endswith('.emb'))
models = {}
# Metric tables keyed by (classifier name, d, l, n), where d/l/n are the
# parameters parsed from the embedding file name.
result_acc = {}
result_f1 = {}
result_auc = {}
result_aupr = {}

for dataset in datasets:
    params, X, y, weight = dataloader(dataset, ppi.positive_id, ppi.risk_level)
    # Sample weights are split alongside X/y; only the training part is used.
    X_train, X_test, y_train, y_test, weight_train, _weight_test = train_test_split(
        X, y, weight, test_size=0.3, random_state=RANDOM_STATE)

    classifier = MyClassifier()
    classifier.train(X_train, y_train, weight=weight_train)
    # Both calls return a dict keyed by sub-classifier name.
    y_pred_dict = classifier.predict(X_test)
    y_score_dict = classifier.predict_proba(X_test)

    # The same classifier object is written under every sub-classifier name,
    # so serialize it once per dataset instead of once per sub-classifier.
    # NOTE(review): each file therefore holds the whole MyClassifier, not just
    # the named sub-model — confirm this is what downstream loaders expect.
    classifier_bytes = pickle.dumps(classifier)
    emb_params = (params['d'], params['l'], params['n'])

    for clf_name, y_pred in y_pred_dict.items():
        metric = MyMetric(y_test, y_pred)
        acc, f1, auc, aupr = metric.evaluate(y_test, y_pred, y_score_dict[clf_name])
        result_key = (clf_name, *emb_params)
        result_acc[result_key] = acc
        result_f1[result_key] = f1
        result_auc[result_key] = auc
        result_aupr[result_key] = aupr

        # save the trained model
        model_name = f"{dataset}_{clf_name}_{params['d']}_{params['l']}_{params['n']}"
        with open(f"{model_name}.pkl", "wb") as f:
            f.write(classifier_bytes)

        # add the trained model to models dictionary
        models[model_name] = classifier

# save all models in a file
with open("all_models_line.pkl", "wb") as f:
    pickle.dump(models, f)

Finished.
Finished.
querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-11703...done.
Finished.


KeyboardInterrupt: 