In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import gc
import xgboost as xgb
from sklearn.feature_selection import f_regression
from bayes_opt import BayesianOptimization

# Wrapper feature selection, LightGBM, TPE tuning

In [3]:
train = pd.read_csv('./dataset/preprocess/train.csv')
test = pd.read_csv('./dataset/preprocess/test.csv')

## Wrapper feature selection

To train a LightGBM model with 5 folds cross validation, and calculate most important 300 features.

In [12]:
def feature_select_wrapper(train, test):
    """
    Feature selection by LGBM
    :param train:
    :param test:
    :return:
    """
    
    # Part 1.划分特征名称，删除ID列和标签列
    print('feature_select_wrapper...')
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    # Step 2.配置lgb参数
    # 模型参数
    params_initial = {
        'num_leaves': 31,
        'learning_rate': 0.1,
        'boosting': 'gbdt',
        'min_child_samples': 20,
        'bagging_seed': 2020,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'metric': 'rmse',
        'reg_alpha': 0,
        'reg_lambda': 1,
        'objective': 'regression',
        'verbose': 1
    }
    # 控制参数
    # 提前验证迭代效果或停止
    ESR = 30
    # 迭代次数
    NBR = 10000
    # 打印间隔
    VBE = 50
    
    # Part 3.交叉验证过程
    # 实例化评估器
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    # 创建空容器
    fse = pd.Series(0, index=features)
    
    for train_part_index, eval_index in kf.split(train[features], train[label]):
        # 封装训练数据集
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        # 封装验证数据集
        eval = lgb.Dataset(train[features].loc[eval_index],
                           train[label].loc[eval_index])
        # 在训练集上进行训练，并同时进行验证
        bst = lgb.train(params_initial, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, eval],
                        valid_names=['train', 'valid'])
        # 输出特征重要性计算结果，并进行累加
        fse += pd.Series(bst.feature_importance(), features)
    
    # Part 4.选择最重要的300个特征
    feature_select = ['card_id'] + fse.sort_values(ascending=False).index.tolist()[:300]
    print('done')
    return train[feature_select + ['target']], test[feature_select]

In [13]:
train_LGBM, test_LGBM = feature_select_wrapper(train, test)

feature_select_wrapper...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.692528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227016
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1626
[LightGBM] [Info] Start training from score -0.390986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.638084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227122
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1629
[LightGBM] [Info] Start training from score -0.396781
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.657010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227089
[LightGBM] [Info] Number of data points in the train set: 161534, number of used

In [14]:
train_LGBM.shape

(201917, 302)

## Training the LightGBM model and TPE optimization

For those hyperparameters we don't want use the defaults.

In [15]:
def params_append(params):
    """
    动态回调参数函数，params视作字典
    :param params:lgb参数字典
    :return params:修正后的lgb参数字典
    """
    params['feature_pre_filter'] = False
    params['objective'] = 'regression'
    params['metric'] = 'rmse'
    params['bagging_seed'] = 2020,
    params['verbose'] = -1
    return params

In [43]:
def param_hyperopt(train):
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    train_data = lgb.Dataset(train[features], train[label])

    def hyperopt_objective(params):
        """
        input hyperparameters, output associated loss
        :params: params
        :return: minimum rmse
        """
        params = params_append(params)
        print(params)

        # Define the early stopping callback function
        early_stopping_callback = lgb.early_stopping(20)
        # Define a callback function to print the standard deviation of the cross-validation scores
        # def print_stdv(cv_results):
        #     cv_results = cv_results.get_cv_folds_results()
        #     print('Standard deviation of cross-validation scores:', np.std(cv_results['rmse-mean']))
        res = lgb.cv(params, train_data, 1000,
                     nfold=2,
                     stratified=False,
                     shuffle=True,
                     metrics='rmse',
                     callbacks=[early_stopping_callback],
                     eval_train_metric=False,
                     seed=2020)
        #print(res)
        return min(res['valid rmse-mean']) # res is a dict
    
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 1e-2, 5e-1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
        'num_leaves': hp.choice('num_leaves', list(range(10, 300, 10))),
        'reg_alpha': hp.randint('reg_alpha', 0, 10),
        'reg_lambda': hp.uniform('reg_lambda', 0, 10),
        'bagging_freq': hp.randint('bagging_freq', 1, 10),
        'min_child_samples': hp.choice('min_child_samples', list(range(1, 30, 5)))
    }

    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=np.random.default_rng(2020))
    
    # 返回最佳参数
    return params_best


        

In [44]:
best_clf = param_hyperopt(train_LGBM)

{'bagging_fraction': 0.9429104308567877, 'bagging_freq': 2, 'feature_fraction': 0.5715782198140802, 'learning_rate': 0.21315219327595428, 'min_child_samples': 11, 'num_leaves': 160, 'reg_alpha': 3, 'reg_lambda': 7.561160634893758, 'feature_pre_filter': False, 'objective': 'regression', 'metric': 'rmse', 'bagging_seed': 2020}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.163333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73405                    
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.161473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73405                    
[LightGBM] [Info] Number of data points in the train set: 100958, number of used features: 300
[LightGBM] [Info] Start training from score -0.39

In [45]:
best_clf

{'bagging_fraction': 0.8198879482271282,
 'bagging_freq': 8,
 'feature_fraction': 0.5135156001738832,
 'learning_rate': 0.014657097603624963,
 'min_child_samples': 3,
 'num_leaves': 6,
 'reg_alpha': 7,
 'reg_lambda': 2.7194579245643924}

## LightGBM predition

### Sigle model

In [46]:
best_clf = params_append(best_clf)

label = 'target'
features = train_LGBM.columns.tolist()
features.remove('card_id')
features.remove('target')

lgb_train = lgb.Dataset(train_LGBM[features], train_LGBM[label])

In [47]:
bst = lgb.train(best_clf, lgb_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.319097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73405
[LightGBM] [Info] Number of data points in the train set: 201917, number of used features: 300
[LightGBM] [Info] Start training from score -0.393636


In [48]:
bst.predict(train_LGBM[features])

array([-0.22620818, -1.25457104,  0.04399539, ..., -0.26112611,
       -1.06688101, -0.22620818])

Check RMSE

In [49]:
np.sqrt(mean_squared_error(train_LGBM[label], bst.predict(train_LGBM[features])))

3.731340064698996

In [50]:
test_LGBM['target'] = bst.predict(test_LGBM[features])
test_LGBM[['card_id', 'target']].to_csv("./result/submission_LGBM_single.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_LGBM['target'] = bst.predict(test_LGBM[features])


In [51]:
test_LGBM[['card_id', 'target']].head(5)

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.084191
1,C_ID_130fd0cbdd,-0.670433
2,C_ID_b709037bc5,-0.095542
3,C_ID_d27d835a9f,-0.221292
4,C_ID_2b5e3df5c2,-0.286112


### Cross validation

In [54]:
def train_predict(train, test, params):
    """
    :param train:
    :param test:
    :param params:
    :return:
    """
    
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')
    
    params = params_append(params)
    ESR = 30
    NBR = 10000
    VBE = 50
    
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series()
    
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)

    for train_part_index, eval_index in kf.split(train[features], train[label]):
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        eval = lgb.Dataset(train[features].loc[eval_index],
                           train[label].loc[eval_index])
        early_stopping_callback = lgb.early_stopping(20)
        bst = lgb.train(params, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, eval],
                        valid_names=['train', 'valid'],
                        callbacks=[early_stopping_callback])
        # 测试集预测结果并纳入prediction_test容器
        prediction_test += bst.predict(test[features])
        # 验证集预测结果并纳入prediction_train容器
        prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),
                                                             index=eval_index))
        # 验证集预测结果
        eval_pre = bst.predict(train[features].loc[eval_index])
        # 计算验证集上得分
        score = np.sqrt(mean_squared_error(train[label].loc[eval_index].values, eval_pre))
        # 纳入cv_score容器
        cv_score.append(score)
        
    print(cv_score, sum(cv_score) / 5)
    pd.Series(prediction_train.sort_index().values).to_csv("./dataset/preprocess/train_lightgbm.csv", index=False)
    pd.Series(prediction_test / 5).to_csv("./dataset/preprocess/test_lightgbm.csv", index=False)
    test['target'] = prediction_test / 5
    test[['card_id', 'target']].to_csv("./result/submission_lightgbm_cv.csv", index=False)
    return

In [55]:
train_LGBM, test_LGBM = feature_select_wrapper(train, test)
best_clf = param_hyperopt(train_LGBM)
train_predict(train_LGBM, test_LGBM, best_clf)

feature_select_wrapper...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.711466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227016
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1626
[LightGBM] [Info] Start training from score -0.390986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.705355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227122
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 1629
[LightGBM] [Info] Start training from score -0.396781
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.755185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 227089
[LightGBM] [Info] Number of data points in the train set: 161534, number of used

  prediction_train = pd.Series()


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.247098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73388
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 300
[LightGBM] [Info] Start training from score -0.390986
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[752]	train's rmse: 3.63352	valid's rmse: 3.69068


  prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.272231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73391
[LightGBM] [Info] Number of data points in the train set: 161533, number of used features: 300
[LightGBM] [Info] Start training from score -0.396781
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[943]	train's rmse: 3.62733	valid's rmse: 3.64788


  prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.270449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73390
[LightGBM] [Info] Number of data points in the train set: 161534, number of used features: 300
[LightGBM] [Info] Start training from score -0.390348
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1056]	train's rmse: 3.60329	valid's rmse: 3.70727


  prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.242581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73391
[LightGBM] [Info] Number of data points in the train set: 161534, number of used features: 300
[LightGBM] [Info] Start training from score -0.391392
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1206]	train's rmse: 3.57757	valid's rmse: 3.77831


  prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.295397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73388
[LightGBM] [Info] Number of data points in the train set: 161534, number of used features: 300
[LightGBM] [Info] Start training from score -0.398675
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[646]	train's rmse: 3.66798	valid's rmse: 3.57939


  prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),


[3.690683620082564, 3.6478790166855783, 3.707267070867506, 3.7783064335534613, 3.5793879232664563] 3.6807048128911126


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['target'] = prediction_test / 5


# Single Model: NLP feature optimization, XGBoost, Bayes_opt

I found there were many columns related to ID, such as `merchant_id`, `merchant_category_id`, `state_id`, `subsector_id`, `city_id`, these values inflect customer's behavior. For example, for a customer A, if a certain merchant id happens a lot in his transaction record, it means A likes this merchant. Furthermore, if this merchant record in many customers', it means this merchant is popular, and it also means customer A is similar to other customers, if not, A has a special like. 

In order to mining this information, I will apply CountVector and TF-IDF. Specificly, CountVector can extract merchant information of a customer, and TF-IDF can extract if many customers like one product at the same time.

If we apply NLP approaches, there is an issue we should consider which is there are too many new features and most of them are sparse. So I'll introduce associated method `sparse` from `scipy`.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse

In [59]:
# 注意，该数据集是最初始的数据集
train = pd.read_csv('./dataset/train.csv')
test =  pd.read_csv('./dataset/test.csv')
merchant = pd.read_csv('./dataset/merchants.csv')
new_transaction = pd.read_csv('./dataset/new_merchant_transactions.csv')
history_transaction = pd.read_csv('./dataset/historical_transactions.csv')
transaction = pd.concat([new_transaction, history_transaction], axis=0, ignore_index=True)
del new_transaction
del history_transaction
gc.collect()

2469

In [60]:
nlp_features = ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']

for co in nlp_features:
    print(co)
    transaction[co] = transaction[co].astype(str)
    temp = transaction[transaction['month_lag']>=0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_new']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction[transaction['month_lag']<0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_hist']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction.groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_all']
    train = pd.merge(train, temp, how='left', on='card_id').fillna("-1")
    test = pd.merge(test, temp, how='left', on='card_id').fillna("-1")

merchant_id
merchant_category_id
state_id
subsector_id
city_id


In [71]:
# 创建空DataFrame用于保存NLP特征
train_x = pd.DataFrame()
test_x = pd.DataFrame()

# 实例化CountVectorizer评估器与TfidfVectorizer评估器
cntv = CountVectorizer()
tfv = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)

# 创建空列表用户保存修正后的列名称
vector_feature =[]
for co in ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']:
    vector_feature.extend([co+'_new', co+'_hist', co+'_all'])
    
# 提取每一列进行新特征衍生
for feature in vector_feature:
    print(feature)
    cntv.fit([feature] + test[feature])
    train_x = sparse.hstack((train_x, cntv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, cntv.transform(test[feature]))).tocsr()
    
    tfv.fit(train[feature].append(test[feature]))
    train_x = sparse.hstack((train_x, tfv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, tfv.transform(test[feature]))).tocsr()
    
# 保存NLP特征衍生结果
sparse.save_npz("./dataset/preprocess/train_nlp.npz", train_x)
sparse.save_npz("./dataset/preprocess/test_nlp.npz", test_x)

merchant_id_new


  tfv.fit(train[feature].append(test[feature]))


merchant_id_hist


  tfv.fit(train[feature].append(test[feature]))


merchant_id_all


  tfv.fit(train[feature].append(test[feature]))


merchant_category_id_new


  tfv.fit(train[feature].append(test[feature]))


merchant_category_id_hist


  tfv.fit(train[feature].append(test[feature]))


merchant_category_id_all


  tfv.fit(train[feature].append(test[feature]))


state_id_new


  tfv.fit(train[feature].append(test[feature]))


state_id_hist


  tfv.fit(train[feature].append(test[feature]))


state_id_all


  tfv.fit(train[feature].append(test[feature]))


subsector_id_new


  tfv.fit(train[feature].append(test[feature]))


subsector_id_hist


  tfv.fit(train[feature].append(test[feature]))


subsector_id_all


  tfv.fit(train[feature].append(test[feature]))


city_id_new


  tfv.fit(train[feature].append(test[feature]))


city_id_hist


  tfv.fit(train[feature].append(test[feature]))


city_id_all


  tfv.fit(train[feature].append(test[feature]))


In [72]:
train_x.shape

(201917, 4231688)