In [1]:
import pandas as pd
import numpy as np
from numpy import sort
import copy
import math
import cmath
import datetime

from sklearn.feature_selection import RFE,RFECV
from sklearn.feature_selection import SelectFromModel
import joblib

from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt

# Function

In [2]:
def calculate_LR(df1,df2):
    """
    Add a summed log-ratio feature ``LR`` derived from the ``ancestor`` column.

    Every comma-separated term found in ``df1['ancestor']`` gets two counters
    that both start at 1: one is incremented for rows with ``is_del == 1``,
    the other for all remaining rows.  A term's score is
    ``log(positive_count / negative_count)`` and a row's ``LR`` is the sum of
    the scores of its terms.  Terms that never occurred in ``df1`` contribute
    nothing, and a missing ``ancestor`` value yields ``LR = 0``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        CV training set; must contain the columns ``ancestor`` and ``is_del``.
        The term scores are estimated from this frame only.
    df2 : pandas.DataFrame
        CV test set; must contain the column ``ancestor``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New frames for ``df1`` and ``df2`` with ``LR`` added and ``ancestor``
        removed.  The frames passed in are left untouched.
    """
    pos_counts = {}
    neg_counts = {}
    for terms, label in zip(df1['ancestor'], df1['is_del']):
        if pd.isna(terms):
            continue
        for term in terms.split(','):
            if term not in pos_counts:
                # Both counters start at 1 so the ratio below is always finite.
                pos_counts[term] = 1
                neg_counts[term] = 1
            if label == 1:
                pos_counts[term] += 1
            else:
                neg_counts[term] += 1

    log_ratio = {
        term: math.log(pos_counts[term] / neg_counts[term])
        for term in pos_counts
    }

    def LR_add(x):
        # Sum the scores of the known terms in one comma-separated cell.
        total = 0
        if pd.isna(x):
            return total
        for term in x.split(','):
            if term in log_ratio:
                total = total + log_ratio[term]
        return total

    # assign() builds new frames, so the caller's inputs are not modified.
    df1_out = df1.assign(LR=df1['ancestor'].apply(LR_add)).drop(columns=['ancestor'])
    df2_out = df2.assign(LR=df2['ancestor'].apply(LR_add)).drop(columns=['ancestor'])
    return df1_out, df2_out

In [3]:
def calculate_PA(df1,df2):
    """
    Add a summed log-ratio feature ``PA`` derived from the ``site`` column.

    Every non-empty comma-separated token found in ``df1['site']`` gets two
    counters that both start at 1: one is incremented for rows with
    ``is_del == 1``, the other for all remaining rows.  A token's score is
    ``log(positive_count / negative_count)`` and a row's ``PA`` is the sum of
    the scores of its tokens.  Empty tokens (from leading, trailing or doubled
    commas) are ignored, tokens that never occurred in ``df1`` contribute
    nothing, and a missing ``site`` value yields ``PA = 0``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        CV training set; must contain the columns ``site`` and ``is_del``.
        The token scores are estimated from this frame only.
    df2 : pandas.DataFrame
        CV test set; must contain the column ``site``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New frames for ``df1`` and ``df2`` with ``PA`` added and ``site``
        removed.  The frames passed in are left untouched.
    """
    pos_counts = {}
    neg_counts = {}
    for sites, label in zip(df1['site'], df1['is_del']):
        if pd.isna(sites):
            continue
        for token in sites.split(','):
            if token == '':
                continue
            if token not in pos_counts:
                # Both counters start at 1 so the ratio below is always finite.
                pos_counts[token] = 1
                neg_counts[token] = 1
            if label == 1:
                pos_counts[token] += 1
            else:
                neg_counts[token] += 1

    log_ratio = {
        token: math.log(pos_counts[token] / neg_counts[token])
        for token in pos_counts
    }

    def PA_add(x):
        # Sum the scores of the known, non-empty tokens in one cell.
        total = 0
        if pd.isna(x):
            return total
        for token in x.split(','):
            if token != '' and token in log_ratio:
                total = total + log_ratio[token]
        return total

    # assign() builds new frames, so the caller's inputs are not modified.
    df1_out = df1.assign(PA=df1['site'].apply(PA_add)).drop(columns=['site'])
    df2_out = df2.assign(PA=df2['site'].apply(PA_add)).drop(columns=['site'])
    return df1_out, df2_out

In [15]:
def tolerance_metrics(y_true, y_pre):
    """
    Compute binary-classification metrics from true and predicted 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; only values equal to 0 or 1 are counted.
    y_pre : array-like
        Predicted labels, same length as ``y_true``; only 0 or 1 are counted.

    Returns
    -------
    pandas.DataFrame
        A single-column frame (column label ``0``) indexed by
        ``tp, tn, fp, fn, ppv, npv, tpr, tnr, acc, mcc, opm, N``.
        ``N`` is ``tp + tn + fp + fn``.  A metric whose denominator is zero
        (for example ``ppv`` when nothing was predicted positive) is reported
        as ``NaN`` instead of raising ``ZeroDivisionError``; ``opm`` is then
        ``NaN`` as well because it is a product of the other metrics.
    """
    label = pd.DataFrame({'true': y_true, 'pre': y_pre})

    actual_pos = label['true'] == 1
    actual_neg = label['true'] == 0
    called_pos = label['pre'] == 1
    called_neg = label['pre'] == 0

    # Plain Python ints keep the MCC product below free of fixed-width overflow.
    tp = int((actual_pos & called_pos).sum())
    tn = int((actual_neg & called_neg).sum())
    fp = int((actual_neg & called_pos).sum())
    fn = int((actual_pos & called_neg).sum())

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN rather than aborting the CV run.
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    N = tp + tn + fp + fn
    # ppv
    ppv = _ratio(tp, tp + fp)
    # npv
    npv = _ratio(tn, tn + fn)
    # sensitivity -> TPR
    sen = _ratio(tp, tp + fn)
    # specificity -> TNR
    spe = _ratio(tn, tn + fp)
    # acc
    acc = _ratio(tp + tn, N)
    # MCC
    mcc = _ratio(tp*tn - fp*fn, ((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn))**0.5)
    # OPM
    opm = (ppv+npv)*(sen+spe)*(acc+(1+mcc)/2)/8

    columns = ['tp', 'tn', 'fp', 'fn', 'ppv', 'npv', 'tpr', 'tnr', 'acc', 'mcc', 'opm', 'N']
    res = pd.DataFrame(
        [
            [tp, tn, fp, fn, ppv, npv, sen, spe, acc, mcc, opm, N]
        ],
        columns=columns,
    )

    return res.T

In [16]:
def no_reject(species,model,num):
    """
    Run plain 10-fold cross-validation (no reject option) and save the metrics.

    For folds 1..10 the files ``./CV_dataset/{species}/{species}_train{i}.csv``
    and ``..._validation{i}.csv`` are read, ``model`` is fitted on the training
    fold and scored on the validation fold with ``tolerance_metrics``.  The ten
    per-fold metric columns plus an ``average`` column are written to
    ``out/LGBM_N_cv/无拒绝_re_{num}Fs_N.csv``.

    Parameters
    ----------
    species : str
        Sub-directory / file prefix of the CV data set.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refitted for every fold.
    num : int
        Identifies the saved feature selector to load and apply via
        ``transform``.  ``-1`` means no selector: all feature columns are used.

    Returns
    -------
    None
    """
    use_selector = (num != -1)
    if use_selector:
        rfe = joblib.load('Feature_selected_2/lightgbm_feature_select_{}_N.rfe'.format(num))

    def _feature_columns(frame):
        # Drop a 'nutation' column if present, then the first five and the last
        # column by position, then 'ancestor' by name.
        # NOTE(review): 'nutation' may be a misspelling of 'mutation'; when no
        # such column exists this mask keeps everything - confirm against the CSVs.
        trimmed = frame.iloc[:,frame.columns != 'nutation'].iloc[:,5:].iloc[:,:-1]
        return trimmed.iloc[:,trimmed.columns != 'ancestor']

    fold_reports = []
    for fold in range(1,11):
        train_fold = pd.read_csv('./CV_dataset/{}/{}_train{}.csv'.format(species,species,fold))
        valid_fold = pd.read_csv('./CV_dataset/{}/{}_validation{}.csv'.format(species,species,fold))

        y_test = valid_fold.is_del.values
        X_test = _feature_columns(valid_fold)
        y_train = train_fold.is_del.values
        X_train = _feature_columns(train_fold)

        if use_selector:
            model.fit(pd.DataFrame(rfe.transform(X_train)), y_train)
            y_pred = model.predict(pd.DataFrame(rfe.transform(X_test)))
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        fold_reports.append(tolerance_metrics(y_test, y_pred))

    # One metrics column per fold, side by side, followed by the row-wise mean.
    performance = pd.concat(fold_reports, axis=1)
    performance['average'] = performance.mean(axis=1)

    performance.to_csv('out/LGBM_N_cv/无拒绝_re_{}Fs_N.csv'.format(num))

In [17]:
def reject(species,model,num):
    """
    Run 10-fold cross-validation with a bootstrap-based reject option.

    For every fold ``i`` in 1..10:

    1. ``model`` is fitted on each of the 200 files
       ``All_species_bootstraps/bootstrap{i}/bootstrap_Combine_train_{j}.csv``
       and the last ``predict_proba`` column for the fold's validation rows is
       written to ``All_species_bootstraps/LGBM{num}F_N/bootstrap_re_lgbm_{i}/``.
    2. Those 200 prediction files are read back and the per-row mean and
       standard deviation are computed.  A row is rejected when 0.5 lies
       strictly inside ``mean +/- sqrt(20) * std``.  The retained validation
       rows are saved to
       ``All_species_bootstraps/LGBM{num}F_N/Combine_train_lgbm_{i}.csv``
       with floats formatted to three decimals.
    3. ``model`` is refitted on the fold's training file and scored with
       ``tolerance_metrics`` on the retained rows as re-read from that CSV.

    The ten per-fold metric columns plus an ``average`` column are written to
    ``out/LGBM_N_cv/有拒绝_cv_re_{num}Fs_N.csv``.

    Parameters
    ----------
    species : str
        Sub-directory / file prefix of the CV data set.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    num : int
        Identifies the saved feature selector to load and apply via
        ``transform``.  ``-1`` means no selector: all feature columns are used.

    Returns
    -------
    None
    """
    use_selector = (num != -1)
    if use_selector:
        rfe = joblib.load('Feature_selected_2/lightgbm_feature_select_{}_N.rfe'.format(num))

    n_bootstraps = 200
    # Half-width multiplier of the interval around the mean bootstrap probability.
    k = 20 ** 0.5
    out_dir = "All_species_bootstraps/LGBM{}F_N".format(num)

    fold_reports = []
    for i in range(1,11):
        df1 = pd.read_csv('./CV_dataset/{}/{}_train{}.csv'.format(species,species,i))
        df2 = pd.read_csv('./CV_dataset/{}/{}_validation{}.csv'.format(species,species,i))

        # NOTE(review): 'nutation' may be a misspelling of 'mutation'; when no
        # such column exists the mask keeps everything - confirm against the CSVs.
        valid_trimmed = df2.iloc[:,df2.columns != 'nutation'].iloc[:,5:].iloc[:,:-1]
        X_test = valid_trimmed.iloc[:,valid_trimmed.columns != 'ancestor']

        # ---- Stage 1: one probability file per bootstrap replicate ----------
        for j in range(n_bootstraps):
            boot = pd.read_csv("All_species_bootstraps/bootstrap{}/bootstrap_Combine_train_{}.csv".format(i,j))
            y_train = boot.is_del.values
            # NOTE(review): unlike the fold CSVs, the last TWO columns are
            # dropped here and 'ancestor' is not filtered by name - presumably
            # the bootstrap files have a different trailing layout; confirm.
            X_train = boot.iloc[:,boot.columns != 'nutation'].iloc[:,5:].iloc[:,:-2]

            if use_selector:
                model.fit(pd.DataFrame(rfe.transform(X_train)), y_train)
                p_test = model.predict_proba(pd.DataFrame(rfe.transform(X_test)))
            else:
                model.fit(X_train, y_train)
                p_test = model.predict_proba(X_test)

            proba = pd.DataFrame(p_test[:,-1],df2.index)
            proba.to_csv("{}/bootstrap_re_lgbm_{}/bootstrap_Combine_{}_re.csv".format(out_dir,i,j))

        # ---- Stage 2: flag rows whose interval straddles 0.5 ----------------
        # Read every replicate once and concatenate in a single call.
        stacked = pd.concat(
            [
                pd.read_csv("{}/bootstrap_re_lgbm_{}/bootstrap_Combine_{}_re.csv".format(out_dir,i,j))
                for j in range(n_bootstraps)
            ],
            ignore_index=True,
        )
        # 'Unnamed: 0' holds the validation row index saved with each file and
        # '0' holds the predicted probability.
        per_row = stacked.groupby('Unnamed: 0')['0']
        d_mean = per_row.mean()
        d_std = per_row.std()

        lower = d_mean - k*d_std
        upper = d_mean + k*d_std
        # T == 1 marks a rejected row, T == 0 a retained one.
        flags = ((lower < 0.5) & (upper > 0.5)).astype(int).to_frame('T')

        merged = pd.merge(flags,df2,left_index=True,right_index=True,how='outer')
        retained = merged[merged['T']==0].drop(columns=['T'])

        retained_path = "{}/Combine_train_lgbm_{}.csv".format(out_dir,i)
        retained.to_csv(retained_path,index=False,float_format='%.3f')

        # ---- Stage 3: refit on the fold and score the retained rows ---------
        y_train = df1.is_del.values
        train_trimmed = df1.iloc[:,df1.columns != 'nutation'].iloc[:,5:].iloc[:,:-1]
        X_train = train_trimmed.iloc[:,train_trimmed.columns != 'ancestor']

        # Re-read from disk so the scored features are exactly what was saved
        # (values formatted to three decimals).
        retained = pd.read_csv(retained_path)
        y_test = retained.is_del.values
        retained = retained.iloc[:,retained.columns != 'nutation'].iloc[:,5:].iloc[:,:-1]
        X_test = retained.iloc[:,retained.columns != 'ancestor']

        if use_selector:
            model.fit(pd.DataFrame(rfe.transform(X_train)), y_train)
            y_pred = model.predict(pd.DataFrame(rfe.transform(X_test)))
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        fold_reports.append(tolerance_metrics(y_test, y_pred))

    # One metrics column per fold, side by side, followed by the row-wise mean.
    performance = pd.concat(fold_reports, axis=1)
    performance['average'] = performance.mean(axis=1)

    # The single placeholder is the feature count; only `num` is passed so the
    # file name matches the naming used by no_reject.
    performance.to_csv('out/LGBM_N_cv/有拒绝_cv_re_{}Fs_N.csv'.format(num))

# LGBM_N_CV(WithoutGO)

10Fs

In [18]:
# 10-feature run: evaluate a default LightGBM classifier without and with the
# reject option, using the selector saved for num = 10.
species = 'All_species'
num = 10
model = lgb.LGBMClassifier()
no_reject(species=species, model=model, num=num)
reject(species=species, model=model, num=num)

20Fs

In [19]:
# 20-feature run with a fresh default classifier.
# `species` is the value assigned in the 10Fs cell.
num = 20
model = lgb.LGBMClassifier()
no_reject(species=species, model=model, num=num)
reject(species=species, model=model, num=num)

50Fs

In [20]:
# 50-feature run with a fresh default classifier.
# `species` is the value assigned in the 10Fs cell.
num = 50
model = lgb.LGBMClassifier()
no_reject(species=species, model=model, num=num)
reject(species=species, model=model, num=num)

100Fs

In [21]:
# 100-feature run with a fresh default classifier.
# `species` is the value assigned in the 10Fs cell.
num = 100
model = lgb.LGBMClassifier()
no_reject(species=species, model=model, num=num)
reject(species=species, model=model, num=num)

-1Fs

In [22]:
# All-features run: num = -1 makes both helpers skip loading the saved
# selector and train on every feature column.
# `species` is the value assigned in the 10Fs cell.
num = -1
model = lgb.LGBMClassifier()
no_reject(species=species, model=model, num=num)
reject(species=species, model=model, num=num)