# Experiments on pima.txt

In [1]:
import pandas as pd
df = pd.read_csv("pima.txt", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,positive
1,1,85,66,29,0,26.6,0.351,31,negative
2,8,183,64,0,0,23.3,0.672,32,positive
3,1,89,66,23,94,28.1,0.167,21,negative
4,0,137,40,35,168,43.1,2.288,33,positive
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,negative
764,2,122,70,27,0,36.8,0.340,27,negative
765,5,121,72,23,112,26.2,0.245,30,negative
766,1,126,60,0,0,30.1,0.349,47,positive


In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
df['label'] = df[df.shape[1] - 1]
df.drop([df.shape[1] - 2], axis=1, inplace=True)
labelencoder = LabelEncoder()
df['label'] = labelencoder.fit_transform(df['label'])
x = np.array(df.drop(['label'], axis=1))
y = np.array(df['label'])

In [3]:
from sklearn.preprocessing import Normalizer
normalization_object = Normalizer()
X = normalization_object.fit_transform(x)

In [4]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True)
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

## cusboost

In [5]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import roc_auc_score, average_precision_score
from cusboost import CUSBoostClassifier

warnings.filterwarnings('ignore')
for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        tprs = []
        for train_index, test_index in skf.split(X, y):
            X_train = X[train_index]
            X_test = X[test_index]
            y_train = y[train_index]
            y_test = y[test_index]
            classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
            classifier.fit(X, y)
            predictions = classifier.predict_proba_samme(X_test)
            auc = roc_auc_score(y_test, predictions[:, 1])
            aupr = average_precision_score(y_test, predictions[:, 1])
            current_param_auc.append(auc)
            current_param_aupr.append(aupr)
            fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
            tprs.append(interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        if top_auc < current_mean_auc:
            top_auc = current_mean_auc
            best_depth = depth
            best_estimators = estimators
            best_auc = top_auc
            best_aupr = current_mean_aupr
            best_tpr = np.mean(tprs, axis=0)
            best_fpr = mean_fpr
            best_precision, best_recall, _ = precision_recall_curve(y_test, predictions[:, 1])
            best_fpr, best_tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        print('Result: ROC: ', top_auc, ' Aupr: ', best_aupr, ' for depth= ', best_depth, ' estimators = ', best_estimators)



Result: ROC:  0.7524416491963661  Aupr:  0.6266355860757725  for depth=  2  estimators =  20
Result: ROC:  0.7524416491963661  Aupr:  0.6266355860757725  for depth=  2  estimators =  20
Result: ROC:  0.7554828791055206  Aupr:  0.5851593865045149  for depth=  2  estimators =  40
Result: ROC:  0.9098294898672258  Aupr:  0.8593509589017245  for depth=  12  estimators =  20
Result: ROC:  0.9194570230607966  Aupr:  0.8575327778697035  for depth=  12  estimators =  30
Result: ROC:  0.9244252271139064  Aupr:  0.86429099336812  for depth=  12  estimators =  40


## rusboost

In [6]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import roc_auc_score, average_precision_score
from rusboost import RusBoostClassifier

warnings.filterwarnings('ignore')
for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        tprs = []
        for train_index, test_index in skf.split(X, y):
            X_train = X[train_index]
            X_test = X[test_index]
            y_train = y[train_index]
            y_test = y[test_index]
            classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
            classifier.fit(X, y)
            predictions = classifier.predict_proba_samme(X_test)
            auc = roc_auc_score(y_test, predictions[:, 1])
            aupr = average_precision_score(y_test, predictions[:, 1])
            current_param_auc.append(auc)
            current_param_aupr.append(aupr)
            fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
            tprs.append(interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        if top_auc < current_mean_auc:
            top_auc = current_mean_auc
            best_depth = depth
            best_estimators = estimators
            best_auc = top_auc
            best_aupr = current_mean_aupr
            best_tpr = np.mean(tprs, axis=0)
            best_fpr = mean_fpr
            best_precision, best_recall, _ = precision_recall_curve(y_test, predictions[:, 1])
            best_fpr, best_tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        print('Result: ROC: ', top_auc, ' Aupr: ', best_aupr, ' for depth= ', best_depth, ' estimators = ', best_estimators)

Result: ROC:  0.9244252271139064  Aupr:  0.86429099336812  for depth=  12  estimators =  40
Result: ROC:  0.9244252271139064  Aupr:  0.86429099336812  for depth=  12  estimators =  40
Result: ROC:  0.9244252271139064  Aupr:  0.86429099336812  for depth=  12  estimators =  40
Result: ROC:  0.9946135569531795  Aupr:  0.9875099306141809  for depth=  12  estimators =  20
Result: ROC:  0.9946135569531795  Aupr:  0.9875099306141809  for depth=  12  estimators =  20
Result: ROC:  0.9946135569531795  Aupr:  0.9875099306141809  for depth=  12  estimators =  20


# Experiments on uscecchini28.csv

In [1]:
import pandas as pd
df = pd.read_csv("uscecchini28.csv")
df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
0,1990,1009,3460.0,0,0,0,,,0,10.047,...,0.312448,0.095082,0.082631,-0.019761,1,0.413170,0.873555,0.167620,0.161961,-0.042140
1,1990,1011,4841.0,0,0,0,,,0,1.247,...,0.315904,0.188832,-0.211389,-0.117832,1,0.157887,0.745139,-0.428957,-0.157888,0.100228
2,1990,1017,3812.0,0,0,0,,,0,55.040,...,0.605342,0.097551,-0.105780,0.091206,1,2.231337,1.015131,0.394768,0.063681,0.066348
3,1990,1021,3861.0,0,0,0,,,0,24.684,...,0.793068,-0.005725,-0.249704,0.017545,1,1.043582,1.026261,0.094822,0.088347,-0.017358
4,1990,1028,7385.0,0,0,0,,,0,17.325,...,0.869182,-0.231536,-1.674893,-0.466667,0,-1.602508,0.598443,-0.942379,-0.700821,0.130349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146040,2014,314866,8200.0,0,0,0,,,0,262.600,...,0.751944,0.560406,0.127217,-0.050591,1,0.103693,0.829680,-0.327178,-0.008179,-0.261606
146041,2014,315318,2890.0,0,0,0,,,0,1578.400,...,0.742781,-0.118178,0.031360,0.095355,1,0.581796,0.743084,-0.077826,0.000461,-0.296702
146042,2014,316056,3420.0,0,0,0,,,0,973.800,...,0.751129,0.004207,-0.037925,0.072050,1,-0.000903,1.063878,-0.002877,0.153133,0.065569
146043,2014,317260,4412.0,0,0,0,,,0,51.743,...,0.018001,,,,1,1.109467,,0.000000,0.028804,


In [2]:
df = df.fillna(0)
df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
0,1990,1009,3460.0,0,0,0,0.0,0.0,0,10.047,...,0.312448,0.095082,0.082631,-0.019761,1,0.413170,0.873555,0.167620,0.161961,-0.042140
1,1990,1011,4841.0,0,0,0,0.0,0.0,0,1.247,...,0.315904,0.188832,-0.211389,-0.117832,1,0.157887,0.745139,-0.428957,-0.157888,0.100228
2,1990,1017,3812.0,0,0,0,0.0,0.0,0,55.040,...,0.605342,0.097551,-0.105780,0.091206,1,2.231337,1.015131,0.394768,0.063681,0.066348
3,1990,1021,3861.0,0,0,0,0.0,0.0,0,24.684,...,0.793068,-0.005725,-0.249704,0.017545,1,1.043582,1.026261,0.094822,0.088347,-0.017358
4,1990,1028,7385.0,0,0,0,0.0,0.0,0,17.325,...,0.869182,-0.231536,-1.674893,-0.466667,0,-1.602508,0.598443,-0.942379,-0.700821,0.130349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146040,2014,314866,8200.0,0,0,0,0.0,0.0,0,262.600,...,0.751944,0.560406,0.127217,-0.050591,1,0.103693,0.829680,-0.327178,-0.008179,-0.261606
146041,2014,315318,2890.0,0,0,0,0.0,0.0,0,1578.400,...,0.742781,-0.118178,0.031360,0.095355,1,0.581796,0.743084,-0.077826,0.000461,-0.296702
146042,2014,316056,3420.0,0,0,0,0.0,0.0,0,973.800,...,0.751129,0.004207,-0.037925,0.072050,1,-0.000903,1.063878,-0.002877,0.153133,0.065569
146043,2014,317260,4412.0,0,0,0,0.0,0.0,0,51.743,...,0.018001,0.000000,0.000000,0.000000,1,1.109467,0.000000,0.000000,0.028804,0.000000


In [3]:
train_df = df[(df['fyear'] >= 1991) & (df['fyear'] <= 2001)]
test_df = df[df['fyear'] == 2003]

In [4]:
train_df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
4582,1991,1004,5080.0,0,0,0,0.0,0.0,0,289.537,...,0.836553,-0.095484,0.155827,-0.012672,1,0.961101,0.968053,0.259703,0.055586,-0.020650
4583,1991,1009,3460.0,0,0,0,0.0,0.0,0,12.911,...,0.363340,-0.111586,-0.187708,-0.041154,1,0.258828,0.859404,0.160184,0.105796,-0.023684
4584,1991,1011,4841.0,0,0,0,0.0,0.0,0,3.163,...,0.350917,0.122455,-0.194342,0.067808,1,0.151430,0.739250,-0.548968,-0.087615,0.139118
4585,1991,1013,3661.0,0,0,0,0.0,0.0,0,119.530,...,0.644304,0.148275,0.034543,-0.038006,1,0.483833,1.044093,0.541099,0.154582,-0.097427
4586,1991,1014,6512.0,1,0,0,0.0,0.0,0,6.826,...,0.245489,0.058214,-0.436089,0.115207,0,1.017569,0.635725,0.034610,0.257898,-0.198310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71743,2001,233149,4899.0,0,0,0,0.0,0.0,0,71.622,...,0.841115,0.665356,-0.233796,-0.278447,1,0.719467,0.807300,-0.258992,-0.218409,-0.393693
71744,2001,233397,4813.0,0,0,0,0.0,0.0,0,371.861,...,0.225108,3.768149,-1.085016,-0.091807,1,-0.075099,0.579202,-0.381579,-0.041988,-0.233517
71745,2001,241216,2870.0,0,0,0,0.0,0.0,0,4320.000,...,0.753478,0.537465,0.203494,-0.058259,1,0.760299,0.536431,-0.016995,0.032870,-0.064820
71746,2001,244818,3760.0,0,0,0,0.0,0.0,0,3.413,...,0.080238,-6.484197,-1.162185,-0.432618,1,0.010383,1.223529,-2.199320,-0.751347,0.269884


In [5]:
test_df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
77815,2003,1004,5080.0,0,0,0,0.0,0.0,0,432.204,...,0.707952,-0.006354,-0.219484,0.022789,1,0.976617,1.012580,0.185938,0.028939,0.030194
77816,2003,1013,3661.0,0,0,0,0.0,0.0,0,1006.000,...,0.275966,-0.349137,0.216712,0.565607,1,0.302803,1.427923,-0.594649,-0.060529,1.291139
77817,2003,1021,3861.0,0,0,0,0.0,0.0,0,5.489,...,0.816347,-0.100448,0.163097,-0.228271,1,0.837526,1.441992,-1.707148,-0.214593,0.193321
77818,2003,1034,2834.0,0,0,0,0.0,0.0,0,692.991,...,0.768092,0.013905,-0.217351,0.048507,1,1.082328,0.904510,0.031126,0.033236,0.006734
77819,2003,1038,7830.0,0,0,0,0.0,0.0,0,435.736,...,0.262861,-0.017864,-0.019820,0.007507,1,0.495547,0.935015,-0.141191,0.051776,0.078478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83791,2003,252614,3674.0,0,0,0,0.0,0.0,0,18.884,...,0.472912,-0.036853,-0.116684,0.102453,1,0.302557,0.626105,-1.264040,-0.067801,0.071843
83792,2003,254096,2836.0,0,0,0,0.0,0.0,0,16.916,...,0.355976,0.000000,0.000000,0.000000,1,0.344360,0.000000,-1.791983,-0.326000,0.000000
83793,2003,254338,4813.0,0,0,0,0.0,0.0,0,3081.421,...,0.121258,0.492895,-0.069278,0.011349,1,0.513978,0.969363,0.171123,0.109648,-0.017323
83794,2003,264387,3841.0,0,0,0,0.0,0.0,0,40.375,...,0.800346,-0.238429,-0.608114,0.032144,0,0.235486,1.118996,-0.168644,0.103346,-0.861634


In [7]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 9:37]
x_test = test[:, 9:37]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(67166, 28)
(67166,)
(5981, 28)
(5981,)


## rusboost: 1991-2001, 2003

In [20]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from rusboost import RusBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.2319453432958757  Aupr:  0.006199610398338218  Ndcg:  0.0  Sensitivity:  0.3317929358673956  for depth=  2  estimators =  20
AUC:  0.723257535643545  Aupr:  0.020360564728371763  Ndcg:  0.026232950058236792  Sensitivity:  0.6422019128225391  for depth=  2  estimators =  30
AUC:  0.6434995881626169  Aupr:  0.016269387213738135  Ndcg:  0.046738891803005396  Sensitivity:  0.7067429748589169  for depth=  2  estimators =  40
AUC:  0.7209581102547508  Aupr:  0.021531954145510346  Ndcg:  0.014231278356345666  Sensitivity:  0.7036046079524341  for depth=  12  estimators =  20
AUC:  0.731777421505756  Aupr:  0.02302353632167718  Ndcg:  0.062976377433132  Sensitivity:  0.6953125434977478  for depth=  12  estimators =  30
AUC:  0.7398021219430879  Aupr:  0.022134459548049778  Ndcg:  0.07605683298404324  Sensitivity:  0.7374539331319562  for depth=  12  estimators =  40


## cusboost: 1991-2001, 2003

In [21]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.7116415151693436  Aupr:  0.04025993622421441  Ndcg:  0.06794181380847784  Sensitivity:  0.1739573638257324  for depth=  2  estimators =  20
AUC:  0.6694943715557646  Aupr:  0.05133355942698664  Ndcg:  0.07004127987062791  Sensitivity:  0.11706595681750961  for depth=  2  estimators =  30
AUC:  0.7339456962993469  Aupr:  0.03498305959216607  Ndcg:  0.06367641987523748  Sensitivity:  0.3076535750251762  for depth=  2  estimators =  40
AUC:  0.7458154380184738  Aupr:  0.023314475044509196  Ndcg:  0.08420560541377735  Sensitivity:  0.7231493503066385  for depth=  12  estimators =  20
AUC:  0.7593521405738267  Aupr:  0.028087748854624384  Ndcg:  0.14567970853654694  Sensitivity:  0.7121772907739297  for depth=  12  estimators =  30
AUC:  0.7642157439548156  Aupr:  0.02474833209329573  Ndcg:  0.05937817684465589  Sensitivity:  0.7406307743331386  for depth=  12  estimators =  40


## rusboost: 1991-2002, 2004

In [22]:
train_df = df[(df['fyear'] >= 1991) & (df['fyear'] <= 2002)]
test_df = df[df['fyear'] == 2004]

In [23]:
train_df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
4582,1991,1004,5080.0,0,0,0,0.0,0.0,0,289.537,...,0.836553,-0.095484,0.155827,-0.012672,1,0.961101,0.968053,0.259703,0.055586,-0.020650
4583,1991,1009,3460.0,0,0,0,0.0,0.0,0,12.911,...,0.363340,-0.111586,-0.187708,-0.041154,1,0.258828,0.859404,0.160184,0.105796,-0.023684
4584,1991,1011,4841.0,0,0,0,0.0,0.0,0,3.163,...,0.350917,0.122455,-0.194342,0.067808,1,0.151430,0.739250,-0.548968,-0.087615,0.139118
4585,1991,1013,3661.0,0,0,0,0.0,0.0,0,119.530,...,0.644304,0.148275,0.034543,-0.038006,1,0.483833,1.044093,0.541099,0.154582,-0.097427
4586,1991,1014,6512.0,1,0,0,0.0,0.0,0,6.826,...,0.245489,0.058214,-0.436089,0.115207,0,1.017569,0.635725,0.034610,0.257898,-0.198310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77810,2002,244818,3760.0,0,0,0,0.0,0.0,0,4.872,...,0.067889,-0.354610,2.120761,-0.701408,1,0.030881,0.536265,-2.578969,-1.083602,-0.413143
77811,2002,249158,4412.0,0,0,0,0.0,0.0,0,71.022,...,0.013341,0.140719,-0.016401,0.024218,1,0.543739,1.178880,0.028183,0.064690,0.009596
77812,2002,252614,3674.0,0,0,0,0.0,0.0,0,10.821,...,0.568342,0.000000,0.000000,0.000000,1,1.276576,1.765173,-1.500604,-0.235980,0.180804
77813,2002,254338,4813.0,0,0,0,0.0,0.0,0,3200.725,...,0.112064,0.083356,0.121587,0.048667,1,1.155478,0.977792,0.218314,0.104528,-0.044975


In [24]:
test_df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
83796,2004,1004,5080.0,0,0,0,0.0,0.0,0,474.542,...,0.741236,0.182147,0.372365,0.016419,0,0.602174,0.827183,0.194543,0.047713,0.002089
83797,2004,1021,3861.0,0,0,0,0.0,0.0,0,5.807,...,0.884548,0.072180,0.106097,0.436975,0,0.191637,1.070634,-1.436990,0.232826,-0.017577
83798,2004,1045,4512.0,0,0,0,0.0,0.0,0,4971.000,...,0.216488,0.026483,-0.126412,0.015015,1,-0.329242,1.044921,-0.068675,0.003823,0.037657
83799,2004,1050,3564.0,0,0,0,0.0,0.0,0,21.779,...,0.776156,-0.030364,-0.202920,-0.006752,1,0.210874,0.974546,-0.170277,0.031882,0.017093
83800,2004,1056,3825.0,0,0,0,0.0,0.0,0,327.480,...,0.686477,0.298371,0.102035,0.007839,1,0.402660,0.771520,0.089214,0.047166,-0.250891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89725,2004,269005,4911.0,0,0,0,0.0,0.0,0,1214.267,...,0.595114,0.284639,-0.188115,0.049676,1,0.516199,2.074633,0.001105,0.095256,-0.020758
89726,2004,270281,3312.0,0,0,0,0.0,0.0,0,2103.205,...,0.374846,0.000000,0.000000,0.000000,1,0.685137,1.057751,0.441486,0.426802,0.211946
89727,2004,270287,4899.0,0,0,0,0.0,0.0,0,112.420,...,0.773637,0.000000,0.000000,0.000000,1,0.315422,0.000000,-0.077990,0.085078,0.000000
89728,2004,274075,0.0,0,0,0,0.0,0.0,0,0.228,...,0.145258,0.000000,0.000000,0.000000,1,0.140871,0.000000,-2.425624,-0.013145,0.000000


In [25]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 9:37]
x_test = test[:, 9:37]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(73233, 28)
(73233,)
(5934, 28)
(5934,)


In [26]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from rusboost import RusBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.733125396117462  Aupr:  0.018907918930654685  Ndcg:  0.04519087485951688  Sensitivity:  0.7326924772907664  for depth=  2  estimators =  20
AUC:  0.7314705053871975  Aupr:  0.017245820306988572  Ndcg:  0.012059506902607672  Sensitivity:  0.7553039400697662  for depth=  2  estimators =  30
AUC:  0.6675914297786437  Aupr:  0.015006029009994372  Ndcg:  0.01247327799883532  Sensitivity:  0.6205900260142801  for depth=  2  estimators =  40
AUC:  0.7910069599305181  Aupr:  0.022608734102569795  Ndcg:  0.07038235133621958  Sensitivity:  0.7327864037594463  for depth=  12  estimators =  20
AUC:  0.75870138024929  Aupr:  0.020688430341900845  Ndcg:  0.09568993917075913  Sensitivity:  0.6951167505854363  for depth=  12  estimators =  30
AUC:  0.7656393042416845  Aupr:  0.021192038398420692  Ndcg:  0.05913711960254216  Sensitivity:  0.7387813822647252  for depth=  12  estimators =  40


## cusboost: 1991-2002, 2004

In [27]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.7255786249149081  Aupr:  0.03200276993438338  Ndcg:  0.04963706987783095  Sensitivity:  0.1523371647509579  for depth=  2  estimators =  20
AUC:  0.718275392596418  Aupr:  0.02315857678792294  Ndcg:  0.03130080872308706  Sensitivity:  0.28897583627139056  for depth=  2  estimators =  30
AUC:  0.7220707847233633  Aupr:  0.025127449541379365  Ndcg:  0.03598170796642474  Sensitivity:  0.3371330863493873  for depth=  2  estimators =  40
AUC:  0.7058387713903429  Aupr:  0.01830005662710687  Ndcg:  0.06297029110528543  Sensitivity:  0.6885201406452408  for depth=  12  estimators =  20
AUC:  0.7008462242670359  Aupr:  0.018486995510476886  Ndcg:  0.060511501440496065  Sensitivity:  0.688321967485449  for depth=  12  estimators =  30
AUC:  0.7274770545292363  Aupr:  0.021257380870021207  Ndcg:  0.0780155438458821  Sensitivity:  0.6858440600474478  for depth=  12  estimators =  40


## rusboost: 1991-2003, 2005

In [28]:
train_df = df[(df['fyear'] >= 1991) & (df['fyear'] <= 2003)]
test_df = df[df['fyear'] == 2005]

In [29]:
test_df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
89730,2005,1013,3661.0,0,0,0,0.0,0.0,0,853.000,...,0.565798,0.545828,0.067576,0.062682,1,0.380683,0.651258,-0.422476,0.084104,-0.292194
89731,2005,1021,3861.0,0,0,0,0.0,0.0,0,7.726,...,0.911689,0.171468,0.056120,0.045031,1,0.236020,0.931946,-0.867533,0.166074,-0.200583
89732,2005,1034,2834.0,0,0,0,0.0,0.0,0,1037.047,...,0.374649,-0.497353,0.222937,0.219029,1,0.595473,0.673763,-0.107975,0.101711,0.564341
89733,2005,1045,4512.0,0,0,0,0.0,0.0,0,6164.000,...,0.224614,0.104918,-0.027229,-0.003358,1,-0.363848,1.070900,-0.106866,0.003255,0.027974
89734,2005,1050,3564.0,0,0,0,0.0,0.0,0,21.795,...,0.787739,0.235590,0.672269,0.011864,0,0.118256,0.925700,-0.183287,0.061189,0.021542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95588,2005,270281,3312.0,0,0,0,0.0,0.0,0,1359.483,...,0.494245,0.080589,-0.192915,-0.382382,1,0.680608,0.976028,0.488680,0.160000,-0.481795
95589,2005,270287,4899.0,0,0,0,0.0,0.0,0,144.780,...,0.736825,0.000000,0.000000,0.000000,1,0.314446,0.882678,0.024778,0.106001,-0.154301
95590,2005,270989,8200.0,0,0,0,0.0,0.0,0,57.224,...,0.387777,0.000000,0.000000,0.000000,0,0.030482,0.000000,-0.066561,0.079794,0.000000
95591,2005,272705,0.0,0,0,0,0.0,0.0,0,127.345,...,0.920693,0.000000,0.000000,0.000000,1,0.371865,1.118226,0.147289,0.089021,-0.226063


In [30]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 9:37]
x_test = test[:, 9:37]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(79214, 28)
(79214,)
(5863, 28)
(5863,)


In [31]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from rusboost import RusBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.680884992933807  Aupr:  0.011337288426394724  Ndcg:  0.0  Sensitivity:  0.8231720186944068  for depth=  2  estimators =  20
AUC:  0.7182116802261183  Aupr:  0.01441316070763762  Ndcg:  0.0208210657429204  Sensitivity:  0.7522734399498276  for depth=  2  estimators =  30
AUC:  0.7284347427523776  Aupr:  0.01426210856656531  Ndcg:  0.01665685259433632  Sensitivity:  0.6547122667291325  for depth=  2  estimators =  40
AUC:  0.7349146327489401  Aupr:  0.013885813375909179  Ndcg:  0.01927016395071518  Sensitivity:  0.7390968508615569  for depth=  12  estimators =  20
AUC:  0.7508651312020168  Aupr:  0.01580529737374663  Ndcg:  0.03678680610024773  Sensitivity:  0.7442198210758686  for depth=  12  estimators =  30
AUC:  0.7341087047859134  Aupr:  0.01481014365825533  Ndcg:  0.026273259756551766  Sensitivity:  0.7305182635303117  for depth=  12  estimators =  40


## cusboost: 1991-2003, 2005

In [32]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.6752530461021351  Aupr:  0.013616795741434087  Ndcg:  0.01651026089576714  Sensitivity:  0.26055101422948834  for depth=  2  estimators =  20
AUC:  0.6705759902219166  Aupr:  0.014507328204942584  Ndcg:  0.02407455101092244  Sensitivity:  0.3463040446304045  for depth=  2  estimators =  30
AUC:  0.6805278637179634  Aupr:  0.01666164320641024  Ndcg:  0.027761420990560528  Sensitivity:  0.3785318252384121  for depth=  2  estimators =  40
AUC:  0.6910278446201443  Aupr:  0.015104726257104616  Ndcg:  0.10181102802209266  Sensitivity:  0.6664352282354888  for depth=  12  estimators =  20
AUC:  0.703296283564417  Aupr:  0.015467672173982221  Ndcg:  0.12007372737765629  Sensitivity:  0.6732780996283145  for depth=  12  estimators =  30
AUC:  0.7048202895229365  Aupr:  0.015413614089641703  Ndcg:  0.10973237467939957  Sensitivity:  0.6696332869139553  for depth=  12  estimators =  40


## rusboost: 1991-2004, 2006

In [33]:
train_df = df[(df['fyear'] >= 1991) & (df['fyear'] <= 2004)]
test_df = df[df['fyear'] == 2006]

In [34]:
test_df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
95593,2006,1013,3661.0,0,0,0,0.0,0.0,0,942.700,...,0.538166,0.170354,-0.060640,-0.032957,1,0.520830,0.943020,-0.352116,0.027181,-0.038457
95594,2006,1021,3861.0,0,0,0,0.0,0.0,0,13.582,...,0.610139,0.087639,-0.081807,-0.174568,1,0.204218,0.859982,-0.416777,0.072240,0.242287
95595,2006,1034,2834.0,0,0,0,0.0,0.0,0,353.541,...,0.626191,-0.076063,0.089925,-0.009034,1,0.697034,1.532180,-0.096779,0.127191,-0.347585
95596,2006,1045,4512.0,0,0,0,0.0,0.0,0,6902.000,...,0.206588,0.097728,0.214712,0.037432,1,-0.090208,0.974812,-0.109281,0.043266,0.006924
95597,2006,1050,3564.0,0,0,0,0.0,0.0,0,42.653,...,0.857964,0.475099,-0.539549,0.068405,1,0.144855,0.938543,-0.083212,0.108549,-0.089737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101496,2006,277579,0.0,0,0,0,0.0,0.0,0,131.250,...,0.004515,0.000000,0.000000,0.000000,1,0.337848,0.000000,-0.088249,-0.069826,-0.297817
101497,2006,277846,0.0,0,0,0,0.0,0.0,0,78.441,...,0.918373,0.000000,0.000000,0.000000,1,0.728082,0.000000,-0.001117,0.026559,0.000000
101498,2006,278234,0.0,0,0,0,0.0,0.0,0,115.035,...,0.615253,0.412875,-0.066037,-0.061456,1,0.095816,1.523967,0.457610,0.550619,-0.734577
101499,2006,278400,0.0,0,0,0,0.0,0.0,0,24.733,...,0.106035,0.000000,0.000000,0.000000,1,0.300921,0.000000,-0.033473,-0.020385,0.000000


In [35]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 9:37]
x_test = test[:, 9:37]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(85148, 28)
(85148,)
(5908, 28)
(5908,)


In [36]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from rusboost import RusBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.6690135396518375  Aupr:  0.008587977622564011  Ndcg:  0.0  Sensitivity:  0.7038265870382658  for depth=  2  estimators =  20
AUC:  0.7264861379754997  Aupr:  0.009979978605723108  Ndcg:  0.0  Sensitivity:  0.7592776042063552  for depth=  2  estimators =  30
AUC:  0.6957704706640877  Aupr:  0.00815246947254706  Ndcg:  0.0  Sensitivity:  0.7577527382848603  for depth=  2  estimators =  40
AUC:  0.7391025145067699  Aupr:  0.010477940120636846  Ndcg:  0.017352108422616152  Sensitivity:  0.7119140568014052  for depth=  12  estimators =  20
AUC:  0.738728562217924  Aupr:  0.010620278805809654  Ndcg:  0.0  Sensitivity:  0.6816998106060607  for depth=  12  estimators =  30
AUC:  0.7265402965828498  Aupr:  0.009574296969816756  Ndcg:  0.017894278886829786  Sensitivity:  0.6913928771425437  for depth=  12  estimators =  40


In [37]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.6371192778852354  Aupr:  0.012539062962937745  Ndcg:  0.025075948096980688  Sensitivity:  0.19798778296802014  for depth=  2  estimators =  20
AUC:  0.6337949709864603  Aupr:  0.010896741176518404  Ndcg:  0.019825627262858934  Sensitivity:  0.286770140428677  for depth=  2  estimators =  30
AUC:  0.6429864603481624  Aupr:  0.009886573482729564  Ndcg:  0.02109868403320897  Sensitivity:  0.3720385674931129  for depth=  2  estimators =  40
AUC:  0.7068678272082527  Aupr:  0.010761355707778537  Ndcg:  0.02562425151388268  Sensitivity:  0.6247516414689668  for depth=  12  estimators =  20
AUC:  0.7191850419084462  Aupr:  0.009457799954173873  Ndcg:  0.02088838217572278  Sensitivity:  0.6984505690388043  for depth=  12  estimators =  30
AUC:  0.7502566086395873  Aupr:  0.012813419157708996  Ndcg:  0.058873391762003735  Sensitivity:  0.6899479542605326  for depth=  12  estimators =  40


# Experiments on indicators

In [1]:
import pandas as pd
df = pd.read_csv("uscecchini28.csv")
df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
0,1990,1009,3460.0,0,0,0,,,0,10.047,...,0.312448,0.095082,0.082631,-0.019761,1,0.413170,0.873555,0.167620,0.161961,-0.042140
1,1990,1011,4841.0,0,0,0,,,0,1.247,...,0.315904,0.188832,-0.211389,-0.117832,1,0.157887,0.745139,-0.428957,-0.157888,0.100228
2,1990,1017,3812.0,0,0,0,,,0,55.040,...,0.605342,0.097551,-0.105780,0.091206,1,2.231337,1.015131,0.394768,0.063681,0.066348
3,1990,1021,3861.0,0,0,0,,,0,24.684,...,0.793068,-0.005725,-0.249704,0.017545,1,1.043582,1.026261,0.094822,0.088347,-0.017358
4,1990,1028,7385.0,0,0,0,,,0,17.325,...,0.869182,-0.231536,-1.674893,-0.466667,0,-1.602508,0.598443,-0.942379,-0.700821,0.130349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146040,2014,314866,8200.0,0,0,0,,,0,262.600,...,0.751944,0.560406,0.127217,-0.050591,1,0.103693,0.829680,-0.327178,-0.008179,-0.261606
146041,2014,315318,2890.0,0,0,0,,,0,1578.400,...,0.742781,-0.118178,0.031360,0.095355,1,0.581796,0.743084,-0.077826,0.000461,-0.296702
146042,2014,316056,3420.0,0,0,0,,,0,973.800,...,0.751129,0.004207,-0.037925,0.072050,1,-0.000903,1.063878,-0.002877,0.153133,0.065569
146043,2014,317260,4412.0,0,0,0,,,0,51.743,...,0.018001,,,,1,1.109467,,0.000000,0.028804,


In [2]:
df = df.fillna(0)
df

Unnamed: 0,fyear,gvkey,sich,insbnk,understatement,option,p_aaer,new_p_aaer,misstate,act,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
0,1990,1009,3460.0,0,0,0,0.0,0.0,0,10.047,...,0.312448,0.095082,0.082631,-0.019761,1,0.413170,0.873555,0.167620,0.161961,-0.042140
1,1990,1011,4841.0,0,0,0,0.0,0.0,0,1.247,...,0.315904,0.188832,-0.211389,-0.117832,1,0.157887,0.745139,-0.428957,-0.157888,0.100228
2,1990,1017,3812.0,0,0,0,0.0,0.0,0,55.040,...,0.605342,0.097551,-0.105780,0.091206,1,2.231337,1.015131,0.394768,0.063681,0.066348
3,1990,1021,3861.0,0,0,0,0.0,0.0,0,24.684,...,0.793068,-0.005725,-0.249704,0.017545,1,1.043582,1.026261,0.094822,0.088347,-0.017358
4,1990,1028,7385.0,0,0,0,0.0,0.0,0,17.325,...,0.869182,-0.231536,-1.674893,-0.466667,0,-1.602508,0.598443,-0.942379,-0.700821,0.130349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146040,2014,314866,8200.0,0,0,0,0.0,0.0,0,262.600,...,0.751944,0.560406,0.127217,-0.050591,1,0.103693,0.829680,-0.327178,-0.008179,-0.261606
146041,2014,315318,2890.0,0,0,0,0.0,0.0,0,1578.400,...,0.742781,-0.118178,0.031360,0.095355,1,0.581796,0.743084,-0.077826,0.000461,-0.296702
146042,2014,316056,3420.0,0,0,0,0.0,0.0,0,973.800,...,0.751129,0.004207,-0.037925,0.072050,1,-0.000903,1.063878,-0.002877,0.153133,0.065569
146043,2014,317260,4412.0,0,0,0,0.0,0.0,0,51.743,...,0.018001,0.000000,0.000000,0.000000,1,1.109467,0.000000,0.000000,0.028804,0.000000


In [3]:
train_df = df[(df['fyear'] >= 1991) & (df['fyear'] <= 2001)]
test_df = df[df['fyear'] == 2003]

In [4]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 37:52]
x_test = test[:, 37:52]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(67166, 14)
(67166,)
(5981, 14)
(5981,)


## rusboost: 1991-2001, 2003

In [7]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from rusboost import RusBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)



AUC:  0.3468173305093056  Aupr:  0.010350747548458331  Ndcg:  0.0  Sensitivity:  0.23852657004830924  for depth=  2  estimators =  20
AUC:  0.34483781451628714  Aupr:  0.007939414162763388  Ndcg:  0.0  Sensitivity:  0.17788366090927135  for depth=  2  estimators =  30
AUC:  0.33753750661881504  Aupr:  0.006069495184083413  Ndcg:  0.0  Sensitivity:  0.11097924982917969  for depth=  2  estimators =  40
AUC:  0.5938462179600321  Aupr:  0.014651322866458875  Ndcg:  0.0  Sensitivity:  0.6185278428707907  for depth=  12  estimators =  20
AUC:  0.5825219156321705  Aupr:  0.013980141894485565  Ndcg:  0.0  Sensitivity:  0.5916906484146119  for depth=  12  estimators =  30
AUC:  0.5942163813221941  Aupr:  0.014541408582806704  Ndcg:  0.011582203012937671  Sensitivity:  0.592438383869653  for depth=  12  estimators =  40


## cusboost: 1991-2001, 2003 

In [8]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.6179485105214646  Aupr:  0.020333978982042974  Ndcg:  0.01512410059111566  Sensitivity:  0.6234279554437657  for depth=  2  estimators =  20
AUC:  0.6153757525837893  Aupr:  0.01896818247232268  Ndcg:  0.011729143852774506  Sensitivity:  0.5751454981170833  for depth=  2  estimators =  30
AUC:  0.6217629091408288  Aupr:  0.018815270355649373  Ndcg:  0.02772639591258839  Sensitivity:  0.5744079179922235  for depth=  2  estimators =  40
AUC:  0.6217531034888509  Aupr:  0.017603776641668314  Ndcg:  0.040666295888730775  Sensitivity:  0.6042213966309659  for depth=  12  estimators =  20
AUC:  0.6723980702476908  Aupr:  0.018031050417368834  Ndcg:  0.012316808296780965  Sensitivity:  0.630738245900524  for depth=  12  estimators =  30
AUC:  0.6380243572395129  Aupr:  0.01721497069697694  Ndcg:  0.014231278356345666  Sensitivity:  0.6097729572938513  for depth=  12  estimators =  40


# Experiments on indicators+raw data

In [10]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 9:52]
x_test = test[:, 9:52]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(67166, 42)
(67166,)
(5981, 42)
(5981,)


## rusboost: 1991-2001, 2003

In [11]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from rusboost import RusBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = RusBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.7016703928144182  Aupr:  0.0174441755757427  Ndcg:  0.0  Sensitivity:  0.6176899840477887  for depth=  2  estimators =  20
AUC:  0.2274653860485184  Aupr:  0.005645632188443615  Ndcg:  0.0  Sensitivity:  0.3145539906103287  for depth=  2  estimators =  30
AUC:  0.7050251514973231  Aupr:  0.020351549034871998  Ndcg:  0.04358354002659847  Sensitivity:  0.606903706498161  for depth=  2  estimators =  40
AUC:  0.6796272871683238  Aupr:  0.019197217897951337  Ndcg:  0.0  Sensitivity:  0.6721472748612045  for depth=  12  estimators =  20
AUC:  0.7094609833107804  Aupr:  0.020505104381434606  Ndcg:  0.0  Sensitivity:  0.6306468486979142  for depth=  12  estimators =  30
AUC:  0.6960995567845306  Aupr:  0.020415732699706462  Ndcg:  0.06031119743240802  Sensitivity:  0.6658772196529634  for depth=  12  estimators =  40


## cusboost: 1991-2001, 2003 

In [12]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.7158910395952227  Aupr:  0.03778601762169977  Ndcg:  0.055633611071660885  Sensitivity:  0.16060069512754932  for depth=  2  estimators =  20
AUC:  0.7311792767351101  Aupr:  0.042508096506897394  Ndcg:  0.0925270432615379  Sensitivity:  0.30317905563347364  for depth=  2  estimators =  30
AUC:  0.7366226392892864  Aupr:  0.03999433528113496  Ndcg:  0.07823450567939391  Sensitivity:  0.3298550724637681  for depth=  2  estimators =  40
AUC:  0.7587294816732364  Aupr:  0.026033143543372755  Ndcg:  0.06334378031702494  Sensitivity:  0.6866966127072917  for depth=  12  estimators =  20
AUC:  0.7437366397991803  Aupr:  0.023916031688165307  Ndcg:  0.0854611057268998  Sensitivity:  0.7021305658000006  for depth=  12  estimators =  30
AUC:  0.7323547292659489  Aupr:  0.02484844433029919  Ndcg:  0.07631191859914224  Sensitivity:  0.6666756405079195  for depth=  12  estimators =  40


# tune for cusboost

## percentage_to_choose_from_each_cluster=0.1

In [4]:
import numpy as np
train = np.array(train_df)
test = np.array(test_df)
x_train = train[:, 9:37]
x_test = test[:, 9:37]
y_train = train[:, 8]
y_test = test[:, 8]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(67166, 28)
(67166,)
(5981, 28)
(5981,)


In [14]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)

AUC:  0.701325969288698  Aupr:  0.05208438052032662  Ndcg:  0.08975287826902582  Sensitivity:  0.1361611876988335  for depth=  2  estimators =  20
AUC:  0.7131025573140359  Aupr:  0.03833143513801246  Ndcg:  0.05789726332020634  Sensitivity:  0.23257074704542996  for depth=  2  estimators =  30
AUC:  0.7286714812417879  Aupr:  0.05153673392167279  Ndcg:  0.11107386934940797  Sensitivity:  0.25212267603571953  for depth=  2  estimators =  40
AUC:  0.7315739542272166  Aupr:  0.025944167834694835  Ndcg:  0.175042858371351  Sensitivity:  0.7156262781785451  for depth=  12  estimators =  20
AUC:  0.7460483222529466  Aupr:  0.025134026293882922  Ndcg:  0.12593436272403927  Sensitivity:  0.7221382974817333  for depth=  12  estimators =  30
AUC:  0.773010188072405  Aupr:  0.027329687725959912  Ndcg:  0.06927043290452288  Sensitivity:  0.6911510559564144  for depth=  12  estimators =  40


## 聚成少数样本数量/5个类，每个类选5个代表

In [None]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)



AUC:  0.46139759957639587  Aupr:  0.013559538556611414  Ndcg:  0.024369845620875998  Sensitivity:  0.22308454893074636  for depth=  2  estimators =  20
AUC:  0.5741650487340904  Aupr:  0.014357153393553649  Ndcg:  0.020594908769652323  Sensitivity:  0.33025661329835937  for depth=  2  estimators =  30


## 轮廓系数评估聚类效果 

In [5]:
import warnings
from scipy import interp
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score
from cusboost import CUSBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
warnings.filterwarnings('ignore')

def get_dcg(y_pred, y_true, k):
    #注意y_pred与y_true必须是一一对应的，并且y_pred越大越接近label=1(用相关性的说法就是，与label=1越相关)
    df = pd.DataFrame({"y_pred":y_pred, "y_true":y_true})
    df = df.sort_values(by="y_pred", ascending=False)  # 对y_pred进行降序排列，越排在前面的，越接近label=1
    df = df.iloc[0:k, :]  # 取前K个
    dcg = (2 ** df["y_true"] - 1) / np.log2(np.arange(1, df["y_true"].count()+1) + 1) # 位置从1开始计数
    dcg = np.sum(dcg)
    return dcg
    
def get_ndcg(df, k):
    # df包含y_pred和y_true
    dcg = get_dcg(df["y_pred"], df["y_true"], k)
    idcg = get_dcg(df["y_true"], df["y_true"], k)
    ndcg = dcg / idcg
    return ndcg

normalization_object = Normalizer()
top_auc = 0
mean_fpr = np.linspace(0, 1, 100)
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5

for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        current_param_ndcg = []
        current_param_recall = []
        tprs = []
        classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
        classifier.fit(x_train, y_train)
        predictions = classifier.predict_proba_samme(x_test)
        
        auc = roc_auc_score(y_test, predictions[:, 1])
        df_ndcg = pd.DataFrame({"y_pred":predictions[:,1], "y_true":y_test})
        ndcg = get_ndcg(df_ndcg,60)
        precision, recall, _ = precision_recall_curve(y_test, predictions[:, 1])
        
        current_param_auc.append(auc)
        current_param_aupr.append(precision)
        current_param_ndcg.append(ndcg)
        current_param_recall.append(recall)
        
        # thresholds阈值表示分类器认为某个样本具有多大概率属于正样本
        fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))
        current_mean_ndcg = np.mean(np.array(current_param_ndcg))
        current_mean_recall = np.mean(np.array(current_param_recall))
        
        print('AUC: ', current_mean_auc, ' Aupr: ', current_mean_aupr, ' Ndcg: ', current_mean_ndcg,' Sensitivity: ', current_mean_recall, ' for depth= ', depth, ' estimators = ', estimators)



silhouette score:  0.7638368401930733
silhouette score:  0.76479482169199
silhouette score:  0.7653440680013068
silhouette score:  0.7581347836029769
silhouette score:  0.7683766836431736
silhouette score:  0.7574386385632482
silhouette score:  0.7752425764613862
silhouette score:  0.7524348611814686
silhouette score:  0.7533231619482416
silhouette score:  0.7580682434245501
silhouette score:  0.778907195073082
silhouette score:  0.7684405955595972
silhouette score:  0.7426811970190056
silhouette score:  0.7672508542841102
silhouette score:  0.768156899697245
silhouette score:  0.758655004995363
silhouette score:  0.77570244797027
silhouette score:  0.7818231172481108
silhouette score:  0.7565689134463045
silhouette score:  0.7573163447564456
AUC:  0.7237294326449765  Aupr:  0.054029582009819485  Ndcg:  0.061596617215299  Sensitivity:  0.20697089471793026  for depth=  2  estimators =  20
silhouette score:  0.7695270223509991
silhouette score:  0.764648921729599
silhouette score:  0.764

KeyboardInterrupt: 