# NLP特征优化+XGBoost建模+贝叶斯优化器

## 1.NLP Feature Optimization

I found there were many columns related to ID, such as `merchant_id`, `merchant_category_id`, `state_id`, `subsector_id`, `city_id`, these values inflect customer's behavior. For example, for a customer A, if a certain merchant id happens a lot in his transaction record, it means A likes this merchant. Furthermore, if this merchant record in many customers', it means this merchant is popular, and it also means customer A is similar to other customers, if not, A has a special like. 

In order to mining this information, I will apply CountVector and TF-IDF. Specificly, CountVector can extract merchant information of a customer, and TF-IDF can extract if many customers like one product at the same time.

If we apply NLP approaches, there is an issue we should consider which is there are too many new features and most of them are sparse. So I'll introduce associated method `sparse` from `scipy`.

In [85]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse

In [2]:
# Note, these are the original dataset
train = pd.read_csv('data/train.csv')
test =  pd.read_csv('data/test.csv')
merchant = pd.read_csv('data/merchants.csv')
new_transaction = pd.read_csv('data/new_merchant_transactions.csv')
history_transaction = pd.read_csv('data/historical_transactions.csv')
transaction = pd.concat([new_transaction, history_transaction], axis=0, ignore_index=True)
del new_transaction
del history_transaction
gc.collect()

0

In [3]:
nlp_features = ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']

for co in nlp_features:
    print(co)
    transaction[co] = transaction[co].astype(str)
    temp = transaction[transaction['month_lag']>=0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_new']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction[transaction['month_lag']<0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_hist']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction.groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_all']
    train = pd.merge(train, temp, how='left', on='card_id').fillna("-1")
    test = pd.merge(test, temp, how='left', on='card_id').fillna("-1")

merchant_id
merchant_category_id
state_id
subsector_id
city_id


In [4]:
# 创建空DataFrame用于保存NLP特征
train_x = pd.DataFrame()
test_x = pd.DataFrame()

# 实例化CountVectorizer评估器与TfidfVectorizer评估器
cntv = CountVectorizer()
tfv = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)

# 创建空列表用户保存修正后的列名称
vector_feature =[]
for co in ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']:
    vector_feature.extend([co+'_new', co+'_hist', co+'_all'])
    
# 提取每一列进行新特征衍生
for feature in vector_feature:
    print(feature)
    cntv.fit([feature].append(test[feature]))
    train_x = sparse.hstack((train_x, cntv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, cntv.transform(test[feature]))).tocsr()
    
    tfv.fit(train[feature].append(test[feature]))
    train_x = sparse.hstack((train_x, tfv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, tfv.transform(test[feature]))).tocsr()
    
# 保存NLP特征衍生结果
sparse.save_npz("preprocess/train_nlp.npz", train_x)
sparse.save_npz("preprocess/test_nlp.npz", test_x)

merchant_id_new
merchant_id_hist
merchant_id_all
merchant_category_id_new
merchant_category_id_hist
merchant_category_id_all
state_id_new
state_id_hist
state_id_all
subsector_id_new
subsector_id_hist
subsector_id_all
city_id_new
city_id_hist
city_id_all


In [5]:
train_x.shape

(201917, 1846286)

## 2.XGBoost

In [11]:
import xgboost as xgb
from sklearn.feature_selection import f_regression
from numpy.random import RandomState
from bayes_opt import BayesianOptimization

In [13]:
train = pd.read_csv('preprocess/train.csv')
test = pd.read_csv('preprocess/test.csv')

In [14]:
features = train.columns.tolist()
features.remove('card_id')
features.remove('target')

train_x = sparse.load_npz("preprocess/train_nlp.npz")
test_x = sparse.load_npz("preprocess/test_nlp.npz")

train_x = sparse.hstack((train_x, train[features])).tocsr()
test_x = sparse.hstack((test_x, test[features])).tocsr()

In [15]:
# 参数回调函数
def params_append(params):
    """

    :param params:
    :return:
    """
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params["min_child_weight"] = int(params["min_child_weight"])
    params['max_depth'] = int(params['max_depth'])
    return params

# 模型优化函数
def param_beyesian(train):
    """

    :param train:
    :return:
    """
    # Part 1.数据准备
    train_y = pd.read_csv("data/train.csv")['target']
    # 数据封装
    sample_index = train_y.sample(frac=0.1, random_state=2020).index.tolist()
    train_data = xgb.DMatrix(train.tocsr()[sample_index, :
                             ], train_y.loc[sample_index].values, silent=True)
    
    # 借助cv过程构建目标函数
    def xgb_cv(colsample_bytree, subsample, min_child_weight, max_depth,
               reg_alpha, eta,
               reg_lambda):
        """

        :param colsample_bytree:
        :param subsample:
        :param min_child_weight:
        :param max_depth:
        :param reg_alpha:
        :param eta:
        :param reg_lambda:
        :return:
        """
        params = {'objective': 'reg:squarederror',
                  'early_stopping_round': 50,
                  'eval_metric': 'rmse'}
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params["min_child_weight"] = int(min_child_weight)
        params['max_depth'] = int(max_depth)
        params['eta'] = float(eta)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['reg_lambda'] = max(reg_lambda, 0)
        print(params)
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        return -min(cv_result['test-rmse-mean'])
    
    # 调用贝叶斯优化器进行模型优化
    xgb_bo = BayesianOptimization(
        xgb_cv,
        {'colsample_bytree': (0.5, 1),
         'subsample': (0.5, 1),
         'min_child_weight': (1, 30),
         'max_depth': (5, 12),
         'reg_alpha': (0, 5),
         'eta':(0.02, 0.2),
         'reg_lambda': (0, 5)}
    )
    xgb_bo.maximize(init_points=21, n_iter=5)  # init_points表示初始点，n_iter代表迭代次数（即采样数）
    print(xgb_bo.max['target'], xgb_bo.max['params'])
    return xgb_bo.max['params']

# 交叉验证预测函数
def train_predict(train, test, params):
    """

    :param train:
    :param test:
    :param params:
    :return:
    """
    train_y = pd.read_csv("data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append(params)
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series()
    ESR = 30
    NBR = 10000
    VBE = 50
    for train_part_index, eval_index in kf.split(train, train_y):
        # 模型训练
        train_part = xgb.DMatrix(train.tocsr()[train_part_index, :],
                                 train_y.loc[train_part_index])
        eval = xgb.DMatrix(train.tocsr()[eval_index, :],
                           train_y.loc[eval_index])
        bst = xgb.train(params, train_part, NBR, [(train_part, 'train'),
                                                          (eval, 'eval')], verbose_eval=VBE,
                        maximize=False, early_stopping_rounds=ESR, )
        prediction_test += bst.predict(test_data)
        eval_pre = bst.predict(eval)
        prediction_train = prediction_train.append(pd.Series(eval_pre, index=eval_index))
        score = np.sqrt(mean_squared_error(train_y.loc[eval_index].values, eval_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / 5)
    pd.Series(prediction_train.sort_index().values).to_csv("preprocess/train_xgboost.csv", index=False)
    pd.Series(prediction_test / 5).to_csv("preprocess/test_xgboost.csv", index=False)
    test = pd.read_csv('data/test.csv')
    test['target'] = prediction_test / 5
    test[['card_id', 'target']].to_csv("result/submission_xgboost.csv", index=False)
    return

In [16]:
best_clf = param_beyesian(train_x)

|   iter    |  target   | colsam... |    eta    | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
{'objective': 'reg:squarederror', 'early_stopping_round': 50, 'eval_metric': 'rmse', 'colsample_bytree': 0.9008280232486963, 'subsample': 0.8904106813206965, 'min_child_weight': 29, 'max_depth': 9, 'eta': 0.15949688000040066, 'reg_alpha': 1.8850960576404892, 'reg_lambda': 4.436160789463865}
Parameters: { "early_stopping_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being

In [17]:
train_predict(train_x, test_x, best_clf)

  prediction_train = pd.Series()


[0]	train-rmse:3.93743	eval-rmse:3.94887
[50]	train-rmse:3.49914	eval-rmse:3.73093
[100]	train-rmse:3.32144	eval-rmse:3.68839
[150]	train-rmse:3.23772	eval-rmse:3.67804
[200]	train-rmse:3.20197	eval-rmse:3.67385
[250]	train-rmse:3.17762	eval-rmse:3.67253
[300]	train-rmse:3.15443	eval-rmse:3.67234
[302]	train-rmse:3.15403	eval-rmse:3.67244
[0]	train-rmse:3.94448	eval-rmse:3.92137
[50]	train-rmse:3.50756	eval-rmse:3.69474
[100]	train-rmse:3.32726	eval-rmse:3.64895
[150]	train-rmse:3.24642	eval-rmse:3.63666
[200]	train-rmse:3.20761	eval-rmse:3.63409
[250]	train-rmse:3.18269	eval-rmse:3.63341
[300]	train-rmse:3.15935	eval-rmse:3.63319
[350]	train-rmse:3.13936	eval-rmse:3.63291
[354]	train-rmse:3.13787	eval-rmse:3.63293
[0]	train-rmse:3.93301	eval-rmse:3.96766
[50]	train-rmse:3.48744	eval-rmse:3.75187
[100]	train-rmse:3.30316	eval-rmse:3.71049
[150]	train-rmse:3.21876	eval-rmse:3.70025
[200]	train-rmse:3.18515	eval-rmse:3.69695
[250]	train-rmse:3.16190	eval-rmse:3.69627
[300]	train-rmse:3.1

| Model | RMSE | 
| ------ | ------ |
| randomforest | 3.65455 | 
| randomforest+validation | 3.65173 | 
| LightGBM | 3.69723 |
| LightGBM+validation | 3.64403 |
| XGBoost | 3.62832 | 

---

# Ensemble training
## Voting 

Voting can be simply implemented weighted sum of results from models

### 1.Voting with average

In [90]:
data = pd.read_csv("./result/submission_randomforest.csv")
data['randomforest'] = data['target'].values

temp = pd.read_csv("./result/submission_lightgbm.csv")
data['lightgbm'] = temp['target'].values


temp = pd.read_csv("./result/submission_xgboost.csv")
data['xgboost'] = temp['target'].values

print(data.corr())

                target  randomforest  lightgbm   xgboost
target        1.000000      1.000000  0.956251  0.947529
randomforest  1.000000      1.000000  0.956251  0.947529
lightgbm      0.956251      0.956251  1.000000  0.951461
xgboost       0.947529      0.947529  0.951461  1.000000


In [91]:
data.head(5)

Unnamed: 0,card_id,target,randomforest,lightgbm,xgboost
0,C_ID_0ab67a22ab,-3.528347,-3.528347,-3.576328,-3.73784
1,C_ID_130fd0cbdd,-0.789489,-0.789489,-0.866846,-0.540275
2,C_ID_b709037bc5,-0.380266,-0.380266,-0.370968,-0.433453
3,C_ID_d27d835a9f,-0.269844,-0.269844,-0.121097,-0.176081
4,C_ID_2b5e3df5c2,-1.038557,-1.038557,-1.047017,-1.027519


In [92]:
data['target'] = (data['randomforest'] + data['lightgbm'] + data['xgboost']) / 3

In [93]:
data[['card_id','target']].to_csv("result/voting_avr.csv", index=False)

| Approach | RMSE |
| -- | -- |
| Base line | 3.65455 |
| RF + CV | 3.65173 |
| LightGBM | 3.69732 |
| LightGBM + CV| 3.64403 |
| XGBoost | 3.62832 |
| Average Voting | 3.6365 |

### 2. Weighted Average

In [100]:
data['target'] = data['randomforest']*0.2+data['lightgbm']*0.3 + data['xgboost']*0.5
data[['card_id','target']].to_csv("result/voting_wei1.csv", index=False)

| Approach | RMSE |
| -- | -- |
| Base line | 3.65455 |
| RF + CV | 3.65173 |
| LightGBM | 3.69732 |
| LightGBM + CV| 3.64403 |
| XGBoost | 3.62832 |
| Average Voting | 3.6365 |
| weighted Voting | 3.633307 |

---

## Stacking 

In [81]:
oof_rf  = pd.read_csv('./dataset/preprocess/train_randomforest.csv')
predictions_rf  = pd.read_csv('./dataset/preprocess/test_randomforest.csv')

oof_lgb  = pd.read_csv('./dataset/preprocess/train_lightgbm.csv')
predictions_lgb  = pd.read_csv('./dataset/preprocess/test_lightgbm.csv')

oof_xgb  = pd.read_csv('./dataset/preprocess/train_xgboost.csv')
predictions_xgb  = pd.read_csv('./dataset/preprocess/test_xgboost.csv')

In [101]:
oof_rf.head(5)

Unnamed: 0,0
0,-0.426984
1,-1.857115
2,0.402431
3,-0.063488
4,-0.278658


In [103]:
predictions_lgb.head(5)

Unnamed: 0,0
0,-3.576328
1,-0.866846
2,-0.370968
3,-0.121097
4,-1.047017


In [104]:
oof_rf.shape, oof_lgb.shape

((201917, 1), (201917, 1))

In [105]:
predictions_rf.shape, predictions_lgb.shape

((123623, 1), (123623, 1))

In [80]:
def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
   
    # Part 1.数据准备
    # 按行拼接列，拼接验证集所有预测结果
    # train_stack就是final model的训练数据
    train_stack = np.hstack([oof_1, oof_2, oof_3])
    # 按行拼接列，拼接测试集上所有预测结果
    # test_stack就是final model的测试数据
    test_stack = np.hstack([predictions_1, predictions_2, predictions_3])
    # 创建一个和验证集行数相同的全零数组
    # oof = np.zeros(train_stack.shape[0])
    # 创建一个和测试集行数相同的全零数组
    predictions = np.zeros(test_stack.shape[0])
    
    # Part 2.多轮交叉验证
    from sklearn.model_selection import RepeatedKFold
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2020)
    
    # fold_为折数，trn_idx为每一折训练集index，val_idx为每一折验证集index
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, y)):
        # 打印折数信息
        print("fold n°{}".format(fold_+1))
        # 训练集中划分为训练数据的特征和标签
        trn_data, trn_y = train_stack[trn_idx], y[trn_idx]
        # 训练集中划分为验证数据的特征和标签
        val_data, val_y = train_stack[val_idx], y[val_idx]
        # 开始训练时提示
        print("-" * 10 + "Stacking " + str(fold_+1) + "-" * 10)
        # 采用贝叶斯回归作为结果融合的模型（final model）
        clf = BayesianRidge()
        # 在训练数据上进行训练
        clf.fit(trn_data, trn_y)
        # 在验证数据上进行预测，并将结果记录在oof对应位置
        # oof[val_idx] = clf.predict(val_data)
        # 对测试集数据进行预测，每一轮预测结果占比额外的1/10
        predictions += clf.predict(test_stack) / (5 * 2)
    
    # 返回测试集的预测结果
    return predictions

In [82]:
target = train['target'].values

In [83]:
predictions_stack  = stack_model(oof_rf, oof_lgb, oof_xgb, 
                                 predictions_rf, predictions_lgb, predictions_xgb, target)

fold n°1
----------Stacking 1----------
fold n°2
----------Stacking 2----------
fold n°3
----------Stacking 3----------
fold n°4
----------Stacking 4----------
fold n°5
----------Stacking 5----------
fold n°6
----------Stacking 6----------
fold n°7
----------Stacking 7----------
fold n°8
----------Stacking 8----------
fold n°9
----------Stacking 9----------
fold n°10
----------Stacking 10----------


In [78]:
predictions_stack

array([-4.05848444, -0.68964305, -0.41582154, ...,  0.65614588,
       -2.35990886,  0.39282516])

In [79]:
sub_df = pd.read_csv('data/sample_submission.csv')
sub_df["target"] = predictions_stack
sub_df.to_csv('predictions_stack1.csv', index=False)

| Approach | RMSE |
| -- | -- |
| Base line | 3.65455 |
| RF + CV | 3.65173 |
| LightGBM | 3.69732 |
| LightGBM + CV| 3.64403 |
| XGBoost | 3.62832 |
| Average Voting | 3.6365 |
| weighted Voting | 3.633307 |
| Stacking | 3.62798 |