In [1]:
import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from src import ppi


In [2]:
def get_project_rootpath():
    path = os.path.realpath(os.curdir)
    while True:
        for subpath in os.listdir(path):
            # PyCharm项目中，'.idea'是必然存在的，且名称唯一
            if '.idea' in subpath:
                return path
        path = os.path.dirname(path)


os.chdir(get_project_rootpath())
# 文件路径
PPI_PATH = 'data/network/PPI-Network.txt'
CANDIDATE_NAME_PATH = 'data/candidate_genes.txt'
CANDIDATE_ID_PATH = 'data/candidate_id.txt'
GENECOUNT_PATH = 'data/genecount.xls'
EMB_ROOT_PATH = 'data/emb/'
LINE_EMB_ROOT_PATH = os.path.join(EMB_ROOT_PATH, 'line')

In [3]:
nx_ppi_network, ppi_id_list = ppi.get_ppi_network(PPI_PATH)

In [4]:
def parse_line_emb(file_name, positive_gene_id_set, risklevel):
    data = file_name.strip().split('_')
    # 计算维度
    dim = int(data[1].rstrip('.emb')[1:])

    # 训练特征和label
    file_path = os.path.join(LINE_EMB_ROOT_PATH, file_name)
    with open(file_path, 'r') as f:
        data = [line.strip().split() for line in f.readlines()[1:]]
    X = [line[1:] for line in data]
    target = [1 if int(line[0]) in positive_gene_id_set else 0 for line in data]
    X = np.asarray(X, dtype=float)
    target = np.asarray(target, dtype=int)

    # 权重
    class_weight = compute_class_weight(class_weight='balanced', classes=[0, 1], y=target)
    keys = risklevel.keys()
    sample_weights = [risklevel[int(line[0])] * class_weight[1]
                      if int(line[0]) in keys else class_weight[0] for line in data]
    return dim, X, target, sample_weights


In [33]:
from sklearn.model_selection import train_test_split
from sklearn import metrics


class Cmodel():
    def __init__(self, features, labels, sample_weights):
        self.features = features
        self.labels = labels
        self.model = None
        self.features_train, self.features_test, self.labels_train, self.labels_test, self.weights_train, self.weights_test = train_test_split(
            features, labels, sample_weights, test_size=0.3)

    def set_model(self, model):
        self.model = model

    def training(self):
        self.model.fit(self.features_train, self.labels_train, sample_weight=self.weights_train)
        return

    def evaluation(self):
        y_pred = self.model.predict(self.features_test)
        y_score = self.model.predict_proba(self.features_test)[:, 1]

        # 计算准确率
        acc = metrics.accuracy_score(y_pred=y_pred, y_true=self.labels_test)
        # 计算精确率和召回率
        precision = metrics.precision_score(y_true=self.labels_test, y_pred=y_pred)
        recall = metrics.recall_score(y_true=self.labels_test, y_pred=y_pred)
        # 计算F1分数和AUC
        f1 = metrics.f1_score(y_true=self.labels_test, y_pred=y_pred)
        auc = metrics.roc_auc_score(y_true=self.labels_test, y_score=y_score)
        # 混淆矩阵
        confusion_matrix = metrics.confusion_matrix(y_true=self.labels_test, y_pred=y_pred)
        # 结果报告
        report = metrics.classification_report(y_true=self.labels_test, y_pred=y_pred, digits=4)

        return acc, precision, recall, f1, auc, confusion_matrix, report

In [34]:
positive_gene_id_set, risk_level = ppi.set_candidate_gene(GENECOUNT_PATH, k=6)

line_filenames = os.listdir(LINE_EMB_ROOT_PATH)
for name in line_filenames:
    dim, X, y, sample_weights = parse_line_emb(name, positive_gene_id_set, risk_level)
    if dim != 512:
        cmodel = Cmodel(X, y, sample_weights)

In [63]:
from sklearn.ensemble import RandomForestClassifier

param = {"n_estimators": [10, 15, 20, 30, 40, 50, 100],
         "max_depth": [10, 15, 20, 25, 30, 40, 50, 60, 80, 100, 120]}
results = {}
for i in param['n_estimators']:
    cmodel.set_model(RandomForestClassifier(n_jobs=-1, max_depth=40))
    cmodel.training()
    results[i] = cmodel.evaluation()


In [40]:
# cmodel.set_model(RandomForestClassifier(n_jobs=-1, max_depth=10))
# cmodel.training()
# result = cmodel.evaluation()

array([[ 240, 2916],
       [   8, 3304]])

In [64]:
for i, v in results.items():
    print(i)
    print(v[-1])

10
              precision    recall  f1-score   support

           0     0.6597    0.6990    0.6788      3156
           1     0.6959    0.6564    0.6756      3312

    accuracy                         0.6772      6468
   macro avg     0.6778    0.6777    0.6772      6468
weighted avg     0.6782    0.6772    0.6771      6468

15
              precision    recall  f1-score   support

           0     0.6607    0.6930    0.6765      3156
           1     0.6932    0.6609    0.6767      3312

    accuracy                         0.6766      6468
   macro avg     0.6769    0.6769    0.6766      6468
weighted avg     0.6773    0.6766    0.6766      6468

20
              precision    recall  f1-score   support

           0     0.6624    0.6933    0.6775      3156
           1     0.6942    0.6633    0.6784      3312

    accuracy                         0.6780      6468
   macro avg     0.6783    0.6783    0.6780      6468
weighted avg     0.6787    0.6780    0.6780      6468

30
       

In [1]:
# 训练SVM
from sklearn import svm

clf_linear = svm.LinearSVC(C=1.0, sample_weights=cmodel.weights_train)
Cmodel.set_model(clf_linear)