# Step4: XGBOOST 单模型
需要调整的参数在trainToPredict函数里面
**11-07** v2版本的改动：<br>
1. 修改文件编码为utf-8
2. 输出更多调试信息
3. 调整输出文件后缀为%Y%m%d_%H%M%S

TODO:
1. 利用 MinMaxScaler处理predict值，使得predict值在[0,1]之间

Input：
- Step3产出的经过特征工程处理的特征，../feature/features_all_train_test_20181106.csv

Output：
- 经过训练后，在留出验证集（train_test_split 划分）上得到的KS值
- 训练得到的模型文件输出在 "../model/"，用于对线上数据进行预测
- 预测结果输出在"../answer/"


In [1]:



import datetime
import pandas as pd
import xgboost as xgb
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

# Common directory prefixes (relative to the notebook's working directory).
# NOTE(review): none of these three prefixes is referenced by the code visible
# in this notebook; they are presumably kept for parity with the other steps.
TRAIN_PREFIX = "../train/"
TEST_PREFIX = "../B/"
FEATURE_PREFIX = "../feature/"

# Text encoding used for every CSV that this notebook reads or writes.
FILE_ENCODER = "utf-8"

def calcKS(y_predict, y_positive):
    """Return the KS statistic of a set of predicted scores.

    The value is the largest absolute gap between the false positive rate
    and the true positive rate over all thresholds of the ROC curve.

    Args:
        y_predict: Predicted scores, passed to ``metrics.roc_curve`` as the
            score argument.
        y_positive: True labels; the value 1 is treated as the positive class.

    Returns:
        The maximum of ``|fpr - tpr|`` across the ROC thresholds.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        y_positive, y_predict, pos_label=1
    )
    rate_gap = abs(false_pos_rate - true_pos_rate)
    return rate_gap.max()

def useModelToPredict(model_file_name, feature_file_name):
    """Load the feature file and report the size of the rows to be predicted.

    This function is unfinished: the prediction step itself is still a TODO
    and ``model_file_name`` is not used yet.

    Args:
        model_file_name: Path of a saved model (currently unused).
        feature_file_name: Path of the feature CSV; rows whose '标签' column
            is negative are selected as the prediction set.
    """
    features = pd.read_csv(feature_file_name, encoding=FILE_ENCODER)
    is_unlabeled = features['标签'] < 0
    predict_set = features[is_unlabeled]
    print("predict_set shape:")
    print(predict_set.shape)
    # TODO: load the model from model_file_name and score predict_set.
    
def createFeatureMap(features, output_file):
    """Write a feature map file with one line per feature.

    Each line has the form ``<index>\\t<feature name>\\tq`` where the index
    starts at 0 and follows the order of ``features``.
    NOTE(review): the trailing ``q`` presumably marks the feature as
    quantitative in XGBoost's fmap convention - confirm.

    Args:
        features: Iterable of feature names.
        output_file: Path of the file to create (overwritten if it exists).
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open(output_file, 'w') as outfile:
        for index, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(index, feat))

def trainToPredict(data_set, predict_set):
    """Train an XGBoost binary classifier, evaluate it and write predictions.

    The function:
      1. splits ``data_set`` into a training part and a hold-out validation
         part with ``train_test_split``;
      2. trains a booster with early stopping, watching both parts;
      3. saves the model to ``../model/XGBOOST_<timestamp>.model``;
      4. scores the hold-out part, prints its KS value and writes the scored
         rows to ``../answer/xgboost_test<timestamp>_<ks>.csv``;
      5. scores ``predict_set``, keeps the maximum score per user and writes
         it (without header) to ``../answer/<timestamp>_<ks>.csv``;
      6. writes the feature map to ``../model/XGBOOST_<timestamp>.fmap``.

    Args:
        data_set: Labeled rows; must contain the columns '标签' (label) and
            '用户标识' (user id), every other column is used as a feature.
        predict_set: Rows to score, with the same columns as ``data_set``.

    Returns:
        None. All results are printed or written to disk.
    """
    # ---- PARAMS TUNING BEGIN: these are the values meant to be tuned ----
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1,
        'max_depth': 6,
        'lambda': 3,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.01,
        'tree_method': 'exact',
        'seed': 1000,
        'nthread': 24,
    }

    TEST_SIZE = 0.2
    RANDOM_STATE = 21

    NUM_BOOST_ROUND = 7000
    EARLY_STOP_ROUNDS = 500
    # For a quick smoke test use e.g. NUM_BOOST_ROUND = 10 and
    # EARLY_STOP_ROUNDS = 5 instead.
    # ---- PARAMS TUNING END ----

    param_list = list(params.items())
    non_feature_columns = ['标签', '用户标识']

    train_set, validation_set = train_test_split(
        data_set, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    # The offline "test" set is simply the validation set.
    test_set = validation_set[:]

    trainY = train_set['标签']
    trainX = train_set.drop(non_feature_columns, axis=1)
    validationY = validation_set['标签']
    validationX = validation_set.drop(non_feature_columns, axis=1)

    testX = test_set.drop(non_feature_columns, axis=1)
    predictX = predict_set.drop(non_feature_columns, axis=1)

    trainX_Mtx = xgb.DMatrix(trainX, label=trainY)
    validationX_Mtx = xgb.DMatrix(validationX, label=validationY)
    testX_Mtx = xgb.DMatrix(testX)
    predictX_Mtx = xgb.DMatrix(predictX)

    watchlist = [(trainX_Mtx, 'train'), (validationX_Mtx, 'val')]

    # Train the model; the timestamp doubles as the suffix of all output files.
    print("begin to train a model by XGBoost")
    time_now = datetime.datetime.now()
    filename = datetime.datetime.strftime(time_now, '%Y%m%d_%H%M%S')
    model = xgb.train(
        param_list,
        trainX_Mtx,
        num_boost_round=NUM_BOOST_ROUND,
        evals=watchlist,
        early_stopping_rounds=EARLY_STOP_ROUNDS,
    )
    model_path = "../model/XGBOOST_" + filename + ".model"
    model.save_model(model_path)

    print("mode file:" + model_path)
    print("best best_ntree_limit", model.best_ntree_limit)
    print("best best_iteration", model.best_iteration)
    print("XGBoost training finished")

    # Predict with the best number of trees found by early stopping.
    # ``best_iteration`` is a 0-based round index, so passing it as
    # ``ntree_limit`` would drop the best round (and a value of 0 would mean
    # "use all trees"); ``best_ntree_limit`` is the matching tree count.
    best_limit = model.best_ntree_limit
    predict_testY = model.predict(testX_Mtx, ntree_limit=best_limit)
    predictY = model.predict(predictX_Mtx, ntree_limit=best_limit)

    # Offline evaluation on the hold-out set (added by GJW, 2018-11-11).
    # ``.copy()`` makes the frame independent of ``test_set`` so that adding
    # a column does not raise SettingWithCopyWarning.
    offline = test_set[['用户标识', '标签']].copy()
    offline['预测'] = predict_testY
    print("ks of test set:")
    ks = calcKS(offline['预测'], offline['标签'])
    print(ks)
    offline.to_csv(
        '../answer/xgboost_test' + filename + "_" + str(ks) + '.csv',
        index=None,
        encoding=FILE_ENCODER,
    )

    # Online prediction: one row per user, keeping the highest score.
    # TODO: optionally rescale the scores into [0, 1] with MinMaxScaler.
    output = predict_set[['用户标识']].copy()
    output["预测"] = predictY
    output = output.groupby('用户标识').max().reset_index()
    print("min predict:")
    print(min(output["预测"]))
    print("max predict:")
    print(max(output["预测"]))
    output_file = output[['用户标识', '预测']].rename(
        index=str, columns={"用户标识": "userid", "预测": "probability"}
    )
    predict_path = "../answer/" + filename + "_" + str(ks) + ".csv"
    output_file.to_csv(predict_path, index=None, header=None, encoding=FILE_ENCODER)
    print("predict file:" + predict_path)

    # Save the feature map; trainX no longer contains the label / id columns.
    features = list(trainX.columns)
    fmap_path = "../model/XGBOOST_" + filename + ".fmap"
    createFeatureMap(features, fmap_path)
    print("features file:" + fmap_path)

def removeDuplicateFeature(all_data):
    """Drop every column whose name occurs more than once.

    All columns sharing a duplicated name are removed (none of the copies is
    kept). The input frame is not modified; a new frame is returned when
    something has to be dropped.

    Args:
        all_data: DataFrame whose column names are checked for duplicates.

    Returns:
        The frame without the duplicated columns (``all_data`` itself when
        there are no duplicates).
    """
    names = all_data.columns.tolist()
    print(len(names))
    print(len(set(names)))
    print("all_data大小：")
    print(all_data.shape)

    # Names that appear at least twice, each listed once.
    columns = all_data.columns
    duplicated_names = columns[columns.duplicated()].unique().tolist()
    for item in duplicated_names:
        print(item)
    if duplicated_names:
        # The original code dropped from an undefined variable here, which
        # raised as soon as a duplicate was found.
        all_data = all_data.drop(duplicated_names, axis=1)

    print("新all_data大小：")
    print(all_data.shape)
    return all_data
    
def callXGBoost(feature_file_name):
    """Load the feature CSV and split it into labeled and unlabeled rows.

    Columns with duplicated names are removed first. Rows with a
    non-negative '标签' value form the training data; rows with a negative
    '标签' value form the prediction set.

    Args:
        feature_file_name: Path of the feature CSV produced by the previous step.

    Returns:
        A ``(data_set, predict_set)`` tuple of DataFrames.
    """
    frame = removeDuplicateFeature(
        pd.read_csv(feature_file_name, encoding=FILE_ENCODER)
    )
    labels = frame['标签']
    predict_set = frame[labels < 0]
    data_set = frame[labels >= 0]
    for title, subset in (("data_set shape:", data_set),
                          ("predict_set shape:", predict_set)):
        print(title)
        print(subset.shape)
    return data_set, predict_set
    
    
    



In [2]:
# Step 4.1: prepare the XGBoost training run - load the feature file and
# split it into labeled rows (data_set) and rows to predict (predict_set).
data_set, predict_set = callXGBoost("../feature/features_all_train_test_20181106.csv")


FileNotFoundError: File b'../feature/features_all_train_test_20181106.csv' does not exist

In [None]:
# Train on the labeled rows and write predictions for the unlabeled rows.
trainToPredict(data_set, predict_set)
