# Data preprocessing
#### Read unlabelled data and labelled data

In [1]:
import pandas as pd
import numpy as np
numerical_feature = np.load('open_data_train_valid/float64.npy')
factor_feature   = np.load('open_data_train_valid/int64.npy')
chunksize = 10000
labelled_iter = pd.read_csv('open_data_train_valid/train/train_1.txt',chunksize=chunksize,sep='\t')
labelled_data = labelled_iter.get_chunk()
columns = labelled_data.columns
unlabelled_iter = pd.read_csv('open_data_train_valid/train/train_4.txt',chunksize = chunksize,sep='\t',names=columns)
unlabelled_data = unlabelled_iter.get_chunk()

#select features
numerical_ldata = labelled_data[numerical_feature]
factor_ldata = labelled_data[factor_feature]

#labelled data
numerical_undata = unlabelled_data[numerical_feature]
factor_undata = unlabelled_data[factor_feature]

#unlabelled data
numerical_data = pd.concat([numerical_ldata,numerical_undata],axis=0,ignore_index=False)
factor_data = pd.concat([factor_ldata,factor_undata],axis=0,ignore_index=False)

label = labelled_data['label']

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#scaling numerical data
numerical_data.fillna(0,inplace=True)
numerical_ldata.fillna(0,inplace=True)
numerical_undata.fillna(0,inplace=True)

scaler = StandardScaler()
scaler.fit(numerical_data)
numerical_feat = scaler.transform(numerical_data)
numerical_lfeat = scaler.transform(numerical_ldata)
numerical_unfeat = scaler.transform(numerical_undata)

numerical_data = pd.DataFrame(numerical_feat,index = numerical_data.index,columns=numerical_data.columns)
numerical_ldata = pd.DataFrame(numerical_lfeat,index = numerical_ldata.index,columns=numerical_ldata.columns)
numerical_undata = pd.DataFrame(numerical_unfeat,index = numerical_undata.index,columns=numerical_undata.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


# Train labelled model

In [7]:
import xgboost as xgb
data_fit = pd.concat([numerical_ldata,factor_ldata],axis=1)
X_train, X_test, y_train, y_test = train_test_split(data_fit,label,test_size = 0.3,random_state = None)

params = {
    'booster' : 'gbtree',
    'objective': 'binary:logistic',
    'gamma': 0.1, #minimum loss reduction required to make a further partition on a leaf node of the tree
    'max_depth': 5,
    'gamma': 2, #L1 penalty
    'subsample' : 0.8, #proportion of train set used to train
    'colsample_tree' : 1, #proportion of sample of features
    'min_child_weight': 1,
    'silent': 0, #print process
    'eta' : 0.01, # step size
    'scale_pos_weight': 15
}
plst = params.items()

dtrain = xgb.DMatrix(X_train,y_train)
num_rounds = 250
model = xgb.train(plst,dtrain,num_rounds)

[12:44:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:44:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[12:44:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[12:44:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 2 pruned nodes, max_depth=5
[12:44:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:44:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[12:44:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:44:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 2 pruned nodes, max_depth=5
[12:44:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 2 pruned nodes, max_

[12:44:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[12:44:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 2 pruned nodes, max_depth=5
[12:44:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 2 pruned nodes, max_depth=5
[12:44:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[12:44:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[12:44:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=5
[12:44:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:44:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[12:44:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_

[12:45:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[12:45:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[12:45:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=5
[12:45:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[12:45:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[12:45:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[12:45:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[12:45:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[12:45:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 2 pruned nodes, max_

[12:46:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 2 pruned nodes, max_depth=5
[12:46:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 2 pruned nodes, max_depth=5
[12:46:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[12:46:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[12:46:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[12:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 4 pruned nodes, max_depth=5
[12:46:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[12:46:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[12:46:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 2 pruned nodes, max_

# Evaluation

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)
auc = roc_auc_score(y_true = y_test,y_score = y_pred)
print('auc: ',auc)

auc:  0.790684909409859


# Rejection Inference

In [15]:
data_unfit = pd.concat([numerical_undata,factor_undata],axis=1)
dunlabelled = xgb.DMatrix(data_unfit)
weights = model.predict(dunlabelled)

X_ = pd.concat([data_fit,data_unfit,data_unfit],axis=0,ignore_index = True)
labelled_data_sample_size = labelled_data.shape[0]
unlabelled_data_sample_size = unlabelled_data.shape[0]

sample_weights = np.concatenate((np.ones(labelled_data_sample_size),
                           weights,
                           1-weights))
Y_ = np.concatenate([label,np.ones(unlabelled_data_sample_size),np.zeros(unlabelled_data_sample_size)])

X_rtrain, X_rtest, y_rtrain, y_rtest = train_test_split(X_,Y_,test_size = 0.3,random_state = None)

In [30]:
params = {
    'booster' : 'gbtree',
    'objective': 'binary:logistic',
    'gamma': 0.1, #minimum loss reduction required to make a further partition on a leaf node of the tree
    'max_depth': 5,
    'gamma': 2, #L1 penalty
    'subsample' : 0.8, #proportion of train set used to train
    'colsample_tree' : 1, #proportion of sample of features
    'min_child_weight': 1,
    'silent': 0, #print process
    'eta' : 0.01, # step size
    'scale_pos_weight': 2
}
plst = params.items()

drtrain = xgb.DMatrix(X_rtrain,y_rtrain)
num_rounds = 400
model_r = xgb.train(plst,drtrain,num_rounds)

[13:00:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[13:00:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:00:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=5
[13:00:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=5
[13:01:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=5
[13:01:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:01:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:01:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:01:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_

[13:03:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:03:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:03:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:03:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:03:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:03:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[13:03:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 2 pruned nodes, max_depth=5
[13:03:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:03:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_

[13:05:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:05:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:05:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 2 pruned nodes, max_depth=5
[13:05:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=5
[13:05:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:05:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:05:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:05:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[13:05:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 2 pruned nodes, max_

[13:07:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:07:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=5
[13:08:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=5
[13:08:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:08:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:08:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[13:08:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[13:08:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:08:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_

[13:11:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:11:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 2 pruned nodes, max_depth=5
[13:11:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 2 pruned nodes, max_depth=5
[13:11:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:11:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 2 pruned nodes, max_depth=5
[13:11:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:11:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:11:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:11:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 2 pruned nodes, max_

[13:13:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:13:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 2 pruned nodes, max_depth=5
[13:13:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 2 pruned nodes, max_depth=5
[13:13:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[13:13:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[13:13:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[13:13:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 4 pruned nodes, max_depth=5
[13:13:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=5
[13:13:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_

In [31]:
drtest = xgb.DMatrix(X_rtest)
y_rpred = model.predict(drtest)
auc = roc_auc_score(y_true = y_rtest,y_score = y_rpred)
print('auc: ',auc)

auc:  0.5533865169818426


In [32]:
score_region = np.linspace(0,1,10)

In [33]:
score_region

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

In [76]:
fanwei=list(np.linspace(0,1,11))

# Rejection Inference

In [None]:
#model predict label for bad
#predict label for good data
weight_good = model.predict(X_train)
region = list(np.linspace(0,1,11))
group_good = pd.cut(weight_good,region,right = False)
frequency  = np.ravel(group_good.codes)

percentage  = pd.Series(frequency).value_counts()
X_train['frequency'] = frequency
X_train['label'] = label

good_bad_rate = []
types = percentage.index.values
for i in types:
    select = X_train[X_train['frequency'] == i]
    total_amount = select.shape[0]
    amount = select['label'].value_counts[1]
    good_bad_rate.append(amount/total_amount*2)

y_predict_bad = model.predict(X_bad)
group_bad = pd.cut(y_predict_bad,region,right = False)
frequency_bad = np.ravel(group_bad.codes)
X_bad['label'] = np.zeros(X_bad.shape[0])
X_bad['frequency'] = frequency_bad
for i in types:
    temp = X_bad[X_bad['frequency']==i]
    size = int(temp.shape[0] * good_bad_rate[i])
    choice = np.choice(temp.index.values,size,replacement=False)
    X_bad[choice] = 1

    

In [112]:
pd.Series(np.ravel(fenzu.codes)).value_counts()

1    777
2    620
0    489
3    433
4    307
5    223
6    111
7     35
8      5
dtype: int64

In [113]:
fenzu.categories

Index(['(0, 0.1]', '(0.1, 0.2]', '(0.2, 0.3]', '(0.3, 0.4]', '(0.4, 0.5]',
       '(0.5, 0.6]', '(0.6, 0.7]', '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1]'],
      dtype='object')