In [1]:
import cmath
import copy
import datetime
import math
import os

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from numpy import sort
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE,RFECV
from sklearn.feature_selection import SelectFromModel

# Functions

In [2]:
def calculate_LR(df1,df2):
    """
    Encode the comma-separated ``ancestor`` column as a numeric ``LR`` score.

    For every token that occurs in the training fold a smoothed log ratio
    ``log((1 + n_del) / (1 + n_other))`` is learned, where ``n_del`` counts the
    token's occurrences in rows with ``is_del == 1`` and ``n_other`` its
    occurrences in all other rows.  A row's ``LR`` is the sum of the ratios of
    its tokens; tokens never seen in ``df1`` contribute nothing, and a missing
    ``ancestor`` value scores 0.

    The ratios are learned from ``df1`` only and then applied to both frames,
    so the labels of ``df2`` never enter the encoding.

    df1:cv training set (reads the ``ancestor`` and ``is_del`` columns)
    df2:cv test set (reads the ``ancestor`` column)

    Returns ``(df1, df2)`` as new frames with ``LR`` appended as the last
    column and ``ancestor`` removed.  The frames passed in are not modified.
    """
    del_counts = {}
    other_counts = {}
    # Iterate the two needed columns directly; DataFrame.iterrows() would build
    # a full Series per row only to read these two values.
    for ancestors, is_del in zip(df1['ancestor'], df1['is_del']):
        if pd.isna(ancestors):
            continue
        for term in ancestors.split(','):
            if term not in del_counts:
                # Both counters start at 1 (add-one smoothing), which keeps the
                # ratio finite for tokens seen with only one label.
                del_counts[term] = 1
                other_counts[term] = 1
            if is_del == 1:
                del_counts[term] += 1
            else:
                other_counts[term] += 1

    log_ratio = {
        term: math.log(del_counts[term] / other_counts[term])
        for term in del_counts
    }

    def LR_add(x):
        # Sum of the learned ratios for the tokens of one cell.
        total = 0
        if pd.isna(x):
            return total
        for term in x.split(','):
            if term in log_ratio:
                total = total + log_ratio[term]
        return total

    # assign() returns a copy, so the caller's frames are left untouched.
    df1 = df1.assign(LR=df1['ancestor'].apply(LR_add)).drop(columns=['ancestor'])
    df2 = df2.assign(LR=df2['ancestor'].apply(LR_add)).drop(columns=['ancestor'])
    return df1,df2

In [3]:
def calculate_PA(df1,df2):
    """
    Encode the comma-separated ``site`` column as a numeric ``PA`` score.

    For every non-empty token that occurs in the training fold a smoothed log
    ratio ``log((1 + n_del) / (1 + n_other))`` is learned, where ``n_del``
    counts the token's occurrences in rows with ``is_del == 1`` and ``n_other``
    its occurrences in all other rows.  A row's ``PA`` is the sum of the ratios
    of its tokens; empty tokens (e.g. from a trailing comma) and tokens never
    seen in ``df1`` contribute nothing, and a missing ``site`` value scores 0.

    The ratios are learned from ``df1`` only and then applied to both frames,
    so the labels of ``df2`` never enter the encoding.

    df1:cv training set (reads the ``site`` and ``is_del`` columns)
    df2:cv test set (reads the ``site`` column)

    Returns ``(df1, df2)`` as new frames with ``PA`` appended as the last
    column and ``site`` removed.  The frames passed in are not modified.
    """
    del_counts = {}
    other_counts = {}
    # Iterate the two needed columns directly; DataFrame.iterrows() would build
    # a full Series per row only to read these two values.
    for sites, is_del in zip(df1['site'], df1['is_del']):
        if pd.isna(sites):
            continue
        for token in sites.split(','):
            if token == '':
                continue
            if token not in del_counts:
                # Both counters start at 1 (add-one smoothing), which keeps the
                # ratio finite for tokens seen with only one label.
                del_counts[token] = 1
                other_counts[token] = 1
            if is_del == 1:
                del_counts[token] += 1
            else:
                other_counts[token] += 1

    log_ratio = {
        token: math.log(del_counts[token] / other_counts[token])
        for token in del_counts
    }

    def PA_add(x):
        # Sum of the learned ratios for the tokens of one cell.
        total = 0
        if pd.isna(x):
            return total
        for token in x.split(','):
            if token != '' and token in log_ratio:
                total = total + log_ratio[token]
        return total

    # assign() returns a copy, so the caller's frames are left untouched.
    df1 = df1.assign(PA=df1['site'].apply(PA_add)).drop(columns=['site'])
    df2 = df2.assign(PA=df2['site'].apply(PA_add)).drop(columns=['site'])
    return df1,df2

In [20]:
def tolerance_metrics(y_true, y_pre):
    """
    Binary-classification summary for one set of predictions.

    y_true: true labels, 1 = positive, 0 = negative
    y_pre:  predicted labels, same length and coding as ``y_true``

    Pairs in which either label is neither 0 nor 1 are not counted.

    Returns a one-column DataFrame (column label ``0``) indexed by metric name:
    ``tp, tn, fp, fn`` (confusion-matrix counts), ``ppv``, ``npv``, ``tpr``
    (sensitivity), ``tnr`` (specificity), ``acc``, ``mcc`` and ``opm``, where
    ``opm = (ppv + npv) * (tpr + tnr) * (acc + (1 + mcc) / 2) / 8``.

    A metric whose denominator is zero (for example ``ppv`` when nothing was
    predicted positive) is reported as NaN instead of raising
    ``ZeroDivisionError``; ``opm`` is NaN whenever any of its inputs is.
    """
    label = pd.DataFrame({'true': y_true, 'pre': y_pre})

    true_pos, true_neg = label['true'] == 1, label['true'] == 0
    pred_pos, pred_neg = label['pre'] == 1, label['pre'] == 0
    # int(): keep the counts as plain Python ints so the arithmetic below is
    # ordinary int/float arithmetic rather than numpy scalar arithmetic.
    tp = int((true_pos & pred_pos).sum())
    fn = int((true_pos & pred_neg).sum())
    tn = int((true_neg & pred_neg).sum())
    fp = int((true_neg & pred_pos).sum())

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN so one degenerate fold cannot abort a run.
        return numerator / denominator if denominator else float('nan')

    N = tp + tn + fp + fn
    # ppv
    ppv = _ratio(tp, tp + fp)
    # npv
    npv = _ratio(tn, tn + fn)
    # sensitivity -> TPR
    sen = _ratio(tp, tp + fn)
    # specificity -> TNR
    spe = _ratio(tn, tn + fp)
    # acc
    acc = _ratio(tp + tn, N)
    # MCC
    mcc = _ratio(tp*tn - fp*fn, ((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn))**0.5)
    # OPM
    opm = (ppv+npv)*(sen+spe)*(acc+(1+mcc)/2)/8

    columns = ['tp', 'tn', 'fp', 'fn', 'ppv', 'npv', 'tpr', 'tnr', 'acc', 'mcc', 'opm']
    res = pd.DataFrame(
        [
            [tp, tn, fp, fn, ppv, npv, sen, spe, acc, mcc, opm]
        ],
        columns=columns,
    )

    # Transposed so that metrics are rows and the single column is this run.
    return res.T

In [25]:
def no_reject(species,model,name,n_folds=10,out_dir='out'):
    """
    Cross-validate ``model`` on the pre-split folds of ``species`` and save the
    per-fold metrics.

    For each fold ``i`` in ``1..n_folds`` the files
    ``./CV_dataset/<species>/<species>_train<i>.csv`` and
    ``./CV_dataset/<species>/<species>_validation<i>.csv`` are read, the
    ``ancestor`` / ``site`` columns are encoded with ``calculate_LR`` /
    ``calculate_PA`` (fitted on the training fold), ``model`` is fitted on the
    training fold and scored on the validation fold with ``tolerance_metrics``.

    species: sub-directory and file prefix of the fold CSVs
    model:   estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
             re-fitted on every fold
    name:    prefix of the output file ``<out_dir>/<name>_re.csv``
    n_folds: number of folds to read (default 10)
    out_dir: output directory, created if missing (default ``'out'``)

    The CSV has one row per metric, one column per fold and a final
    ``average`` column holding the row-wise mean.  Returns ``None``.
    """
    fold_metrics = []
    for i in range(1, n_folds + 1):
        df1 = pd.read_csv('./CV_dataset/{}/{}_train{}.csv'.format(species,species,i))
        df2 = pd.read_csv('./CV_dataset/{}/{}_validation{}.csv'.format(species,species,i))
        df1,df2 = calculate_LR(df1,df2)
        df1,df2 = calculate_PA(df1,df2)

        # Features: every column except "nutation", minus the first five of
        # what remains.
        # NOTE(review): "nutation" may be a typo for "mutation"; if no column
        # has that exact name the mask is all True and nothing is excluded.
        # NOTE(review): confirm that `is_del` is among the five leading columns
        # dropped by iloc[:,5:], otherwise the label leaks into the features.
        y_test = df2.is_del.values
        X_test = df2.iloc[:,df2.columns != "nutation"].iloc[:,5:]
        y_train = df1.is_del.values
        X_train = df1.iloc[:,df1.columns != "nutation"].iloc[:,5:]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        fold_metrics.append(tolerance_metrics(y_test, y_pred))

    # One concat over all folds instead of growing the frame inside the loop.
    performance = pd.concat(fold_metrics, axis=1)
    performance['average'] = performance.mean(axis=1)

    # Create the output directory first so a missing folder cannot discard the
    # results after every fold has already been trained.
    os.makedirs(out_dir, exist_ok=True)
    performance.to_csv(os.path.join(out_dir, '{}_re.csv'.format(name)))

# Random Forest (RF)

In [26]:
# Random-forest baseline on the 'All_species' folds.
species = 'All_species'
# Fixed seed: an unseeded forest gives different metrics on every
# Restart & Run All, so the saved CSV would not be reproducible.
model = RandomForestClassifier(random_state=42)
name = 'RF'
no_reject(species,model,name)

# XGBoost (XGB)

In [None]:
# XGBoost with library-default hyper-parameters on the 'All_species' folds.
species = 'All_species'
model = xgb.XGBClassifier()
name = 'XGB'
no_reject(species=species, model=model, name=name)

# LightGBM (LGBM)

In [None]:
# LightGBM with library-default hyper-parameters on the 'All_species' folds.
species = 'All_species'
model = lgb.LGBMClassifier()
name = 'LGBM'
no_reject(species=species, model=model, name=name)