In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import time
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

In [8]:
# 合并训练数据和测试数据，并进行排序
data = pd.concat([test, train], axis=0, ignore_index=True)
data = data.sort_values(['id','dt'], ascending=False).reset_index(drop=True)

data['target'] = data['target'].apply(lambda x:0 if x<0 else x)

In [4]:
del train
del test 

gc.collect()

22

In [6]:
import time

def timer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {end_time - start_time:.2f} seconds to execute.")
        return result
    return wrapper

In [7]:
print("dt feature")  
for day in [7,30,90,180,365]:
    data[f'sin_dt_{day}'] = np.sin(2*np.pi*data['dt']/day)
    data[f'cos_dt_{day}'] = np.cos(2*np.pi*data['dt']/day)
    
for gap in [2,4,7,15,30,60]:
    """
    https://zhuanlan.zhihu.com/p/696298733
    正弦和余弦来自单位圆，可以映射时间戳在这个圆上的位置，用正弦和余弦坐标表示。将圆圈的右侧视为起点(在下面的图表中以0表示)或真正的24小时时间刻度上的00:00 (12AM)，我们将其划分为4个6小时的地标，以便能够将小时映射到圆上。
    """
    for col in ['sin_dt_7','cos_dt_7','sin_dt_30','cos_dt_30','sin_dt_90','cos_dt_90','sin_dt_180','cos_dt_180','sin_dt_365','cos_dt_365']:
        data[f"{col}_shift{gap}"] = data.groupby(data['id'])[col].shift(gap)
        data[f"{col}_gap{gap}"] = data[col] - data[f"{col}_shift{gap}"]
        data.drop([f"{col}_shift{gap}"],axis=1,inplace=True)

data['dt_dayofweek'] = data['dt']%7
for d in range(7):
    data[f'dt_dayofweek_{d}'] = (data['dt_dayofweek']==d)  
data['dt_month'] = data['dt']//30%12
for d in range(12):
    data[f'dt_month_{d}'] = (data['dt_month']==d)
data['dt_year'] = data['dt']//365

dt feature


In [8]:
@timer
def get_shift_cols(data, start_day, end_day):
    """计算平移天数的target"""
    for i in range(10,30):
        data[f'last{i}_target'] = data.groupby(['id'])['target'].shift(i)
        data[f'last{i}_target_type'] = data.groupby(['type'])['target'].shift(i)
    return data

data = get_shift_cols(data, 10, 31)

get_shift_cols took 3.03 seconds to execute.


In [9]:
@timer
def get_diff_shift(data, col_names, shift_days, diff_days):
    """
    shift_days = [14]
    diff_days = [1,2,3]
    计算平移的target后再计算相应的差分
    """
    for col_ in tqdm(col_names):
        for s_days in shift_days:
            for d_days in diff_days:
                data['d_{}_{}_before_diff'.format(s_days, d_days)] = data.groupby(col_)['target'].shift(s_days).diff(d_days)
    return data

data = get_diff_shift(data, ['id', 'type'], [i for i in range(14, 30)], [i for i in range(1, 8)])

100%|██████████| 2/2 [00:17<00:00,  8.54s/it]

get_diff_shift took 17.12 seconds to execute.





In [10]:
@timer
def get_win_mean_target(data, start_day, end_day):
    col_list = ['last{}_target'.format(i) for i in range(start_day, end_day)]
    col_name = end_day - start_day
    data['win{}_mean_target'.format(col_name)] = data[col_list].mean(axis=1)
    data['win{}_median_target'.format(col_name)] = data[col_list].median(axis=1)
    data['win{}_min_target'.format(col_name)] = data[col_list].min(axis=1)
    data['win{}_max_target'.format(col_name)] = data[col_list].max(axis=1)
    return data

for i in tqdm(range(12, 18)):
    data = get_win_mean_target(data, 10, i)

 17%|█▋        | 1/6 [01:13<06:08, 73.64s/it]

get_win_mean_target took 73.63 seconds to execute.


 33%|███▎      | 2/6 [02:43<05:33, 83.46s/it]

get_win_mean_target took 90.31 seconds to execute.


 50%|█████     | 3/6 [04:20<04:27, 89.28s/it]

get_win_mean_target took 96.19 seconds to execute.


 67%|██████▋   | 4/6 [06:00<03:07, 93.59s/it]

get_win_mean_target took 100.18 seconds to execute.


 83%|████████▎ | 5/6 [07:45<01:37, 97.87s/it]

get_win_mean_target took 105.44 seconds to execute.


100%|██████████| 6/6 [09:27<00:00, 94.52s/it]

get_win_mean_target took 101.27 seconds to execute.





In [11]:
@timer
def get_groupby_col(data, col_name):
    data['target_mean_{}'.format(col_name)] = data.groupby(col_name)[['target']].transform(np.mean)
    data['target_median_{}'.format(col_name)] = data.groupby(col_name)[['target']].transform(np.median)
    data['target_max_{}'.format(col_name)] = data.groupby(col_name)[['target']].transform(np.max)
    data['target_min_{}'.format(col_name)] = data.groupby(col_name)[['target']].transform(np.min)
    data['target_var_{}'.format(col_name)] = data.groupby(col_name)[['target']].transform(np.var)
    data['target_std_{}'.format(col_name)] = data.groupby(col_name)[['target']].transform(np.std)
#     data["target_quantile_25_{}"] = data.groupby(col_name)[['target']].transform(np.quantile(q = 0.25))
#     data["target_quantile_75_{}"] = data.groupby(col_name)[['target']].transform(np.quantile(q = 0.75))
    return data
data = get_groupby_col(data, 'id')
data = get_groupby_col(data, 'type')

get_groupby_col took 154.42 seconds to execute.
get_groupby_col took 158.16 seconds to execute.


In [12]:
@timer
def get_groupby_col_by_days(data, col_name, days):
    data_target_mean = data[data['dt']<=days].groupby(col_name)['target'].mean().reset_index()
    data_target_median = data[data['dt']<=days].groupby(col_name)['target'].median().reset_index()
    data_target_max = data[data['dt']<=days].groupby(col_name)['target'].max().reset_index()
    data_target_min = data[data['dt']<=days].groupby(col_name)['target'].min().reset_index()
    data_target_var = data[data['dt']<=days].groupby(col_name)['target'].var().reset_index()
    data_target_std = data[data['dt']<=days].groupby(col_name)['target'].std().reset_index()

    data_target_mean.rename(columns={'target': 'target_mean_{}_{}'.format(days, col_name)}, inplace=True)
    data_target_median.rename(columns={'target': 'target_median_{}_{}'.format(days, col_name)}, inplace=True)
    data_target_max.rename(columns={'target': 'target_max_{}_{}'.format(days, col_name)}, inplace=True)
    data_target_min.rename(columns={'target': 'target_min_{}_{}'.format(days, col_name)}, inplace=True)
    data_target_var.rename(columns={'target': 'target_var_{}_{}'.format(days, col_name)}, inplace=True)
    data_target_std.rename(columns={'target': 'target_std_{}_{}'.format(days, col_name)}, inplace=True)

    data = data.merge(data_target_mean, on=[col_name], how='left')
    data = data.merge(data_target_median, on=[col_name], how='left')
    data = data.merge(data_target_max, on=[col_name], how='left')
    data = data.merge(data_target_min, on=[col_name], how='left')
    data = data.merge(data_target_var, on=[col_name], how='left')
    data = data.merge(data_target_std, on=[col_name], how='left')
    return data

data = get_groupby_col_by_days(data, 'id', 20)
data = get_groupby_col_by_days(data, 'id', 30)

get_groupby_col_by_days took 206.07 seconds to execute.
get_groupby_col_by_days took 203.66 seconds to execute.


In [13]:
@timer
def get_target_speed_rate(data, start_day, end_day):
    for num in range(start_day, end_day+1):
        a_df = data['last{}_target'.format(num)] 
        b_df = data['last{}_target'.format(num+1)]
        data['last_{}_{}_speed_rate'.format(num, num+1)] = (a_df - b_df) / (b_df + 0.0001)
    return data

data = get_target_speed_rate(data, 10, 17)

get_target_speed_rate took 0.28 seconds to execute.


In [14]:
label_encode = LabelEncoder() 
data['id_label'] = label_encode.fit_transform(data['id'])

In [15]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns

warnings.filterwarnings('ignore')

df_for_kmean = data[['id', 'type', 'target_mean_id', 'target_median_id', 'target_max_id', 'target_min_id', 'target_var_id', 'target_std_id']]
df_for_kmean.set_index('id', inplace=True)
df_for_kmean.drop_duplicates(inplace=True)

k_values = range(2, 10)
evaluation_k = []
evaluation_score = []

"""
Silhouette分数的取值范围在[-1, 1]之间，分数越接近1表示聚类效果越好
"""
# 遍历不同的K值
for k in tqdm(k_values):
    kmeans = KMeans(n_clusters=k, random_state=2024)
    cluster_labels = kmeans.fit_predict(df_for_kmean)
    
    # 计算Silhouette Score
    silhouette = silhouette_score(df_for_kmean, cluster_labels)
    # 将评估指标值保存下来
    evaluation_k.append(k)
    evaluation_score.append(silhouette)

best_k = 4
kmeans_best = KMeans(n_clusters=best_k, random_state=2024)
cluster_labels = kmeans_best.fit_predict(df_for_kmean)
df_for_kmean['kmeans_label'] = cluster_labels

data_kmeans_labels = kmeans_best.predict(data[['type', 'target_mean_id', 'target_median_id', 'target_max_id', 'target_min_id', 'target_var_id', 'target_std_id']])

100%|██████████| 8/8 [00:03<00:00,  2.24it/s]


In [16]:
data['kmeans_labels'] = data_kmeans_labels

data['target_mean_kmeanslabels'] = data.groupby('kmeans_labels')[['target']].transform(np.mean)
data['target_median_kmeanslabels'] = data.groupby('kmeans_labels')[['target']].transform(np.median)
data['target_max_kmeanslabels'] = data.groupby('kmeans_labels')[['target']].transform(np.max)
data['target_min_kmeanslabels'] = data.groupby('kmeans_labels')[['target']].transform(np.min)
data['target_var_kmeanslabels'] = data.groupby('kmeans_labels')[['target']].transform(np.var)
data['target_std_kmeanslabels'] = data.groupby('kmeans_labels')[['target']].transform(np.std)

for i in tqdm(range(11, 13)):
    for i1 in range(3, 8):
        data['d_{}_before_diff_rolling_mean_{}'.format(i, i1)] = data.groupby('id')['target'].shift(i).rolling(i1).mean()
        data['d_{}_before_diff_rolling_median_{}'.format(i, i1)] = data.groupby('id')['target'].shift(i).rolling(i1).median()
        data['d_{}_before_diff_rolling_max_{}'.format(i, i1)] = data.groupby('id')['target'].shift(i).rolling(i1).max()
        data['d_{}_before_diff_rolling_min_{}'.format(i, i1)] = data.groupby('id')['target'].shift(i).rolling(i1).min()

100%|██████████| 2/2 [00:16<00:00,  8.45s/it]


In [17]:
# 进行数据切分
data = data.loc[data['dt']<500]
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

train_cols = [f for f in train.columns if f not in ['id','target']]

In [19]:
del data
gc.collect()

In [20]:
# """
# 在原先衍生0715号的特征基础上又继续衍生后，导致线上的特征下滑，猜测可能特征上出现了过拟合的现象
# 在此进行相关的特征剔除，使用null importance进行特征过滤
# """
# from sklearn.metrics import mean_squared_error

# def get_feature_importances(data, shuffle):
#     # Gather real features
#     train_features = [f for f in data.columns if f not in ['id','target']]
#     # Go over fold and keep track of CV score (train and valid) and feature importances
    
#     data_copy = data.copy()

#     del data
#     gc.collect()

#     # Shuffle target if required
#     if shuffle:
#         # Here you could as well use a binomial distribution
#         y = data_copy['target'].copy().sample(frac=1.0)
#         data_copy['target'] = y.tolist()

#     imp_df = pd.DataFrame()

#     trn_x, trn_y = data_copy.loc[(data_copy['dt']>=31)][train_features], data_copy.loc[(data_copy['dt']>=31)]['target']
#     val_x, val_y = data_copy.loc[(data_copy['dt']<=30)][train_features], data_copy.loc[(data_copy['dt']<=30)]['target']
#     # 构建模型输入数据
#     train_matrix = lgb.Dataset(trn_x, label=trn_y)
#     valid_matrix = lgb.Dataset(val_x, label=val_y)

#     # lightgbm参数
#     lgb_params = {
#         'boosting_type': 'gbdt',
#         'objective': 'regression',
#         'metric': 'mse',
#         'min_child_weight': 5,
#         'num_leaves': 2 ** 5,
#         'lambda_l2': 10,
#         'feature_fraction': 0.8,
#         'bagging_fraction': 0.8,
#         'bagging_freq': 4,
#         'learning_rate': 0.05,
#         'seed': 2024,
#         'nthread' : 16,
#         'verbose' : -1,
#     }
#         # 训练模型
#     model_null_importance = lgb.train(lgb_params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], 
#                         callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(500)])
        
#     # Get feature importances
#     imp_df = pd.DataFrame()
#     imp_df["feature"] = list(train_features)
#     imp_df["importance_gain"] = model_null_importance.feature_importance(importance_type='gain')
#     imp_df["importance_split"] = model_null_importance.feature_importance(importance_type='split')
#     imp_df['trn_score'] = mean_squared_error(data_copy['target'].tolist(), model_null_importance.predict(data_copy[train_features]))
#     return imp_df

# use_df = train
# # null importance 变量
# null_imp_df = pd.DataFrame()
# # 运行次数50次
# nb_runs = 10
# # 标签数据
# label = 'target'
# # 真实响应变量下的特征重要度
# actual_imp_df = get_feature_importances(data=use_df, shuffle=False)

# import time
# start = time.time()

# dsp = ''
# # 运算Null importance
# for i in range(nb_runs):
#     # Get current run importances
#     imp_df = get_feature_importances(data=use_df, shuffle=True)
#     imp_df['run'] = i + 1 
#     # Concat the latest importances with the old ones
#     null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
#     # Erase previous message
#     for l in range(len(dsp)):
#         print('\b', end='', flush=True)
#     # Display current run and time used
#     spent = (time.time() - start) / 60
#     dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
#     print(dsp, end='', flush=True)

# feature_scores = []
# for _f in actual_imp_df['feature'].unique():
#     f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
#     f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].mean()
#     gain_score = np.log(1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, 75)))  # Avoid didvide by zero
#     f_null_imps_split = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
#     f_act_imps_split = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].mean()
#     split_score = np.log(1e-10 + f_act_imps_split / (1 + np.percentile(f_null_imps_split, 75)))  # Avoid didvide by zero
#     feature_scores.append((_f, split_score, gain_score))

# scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])

# feats = scores_df['feature'][(scores_df['split_score']>0) & (scores_df['gain_score']>0)].tolist()

# """
# 所有特征都可以满足
# """

# 参数优化

In [21]:
# import optuna
# from optuna.samplers import TPESampler
# from sklearn.metrics import mean_squared_error

# class OptunaSingel:

#     def __init__(self, state, ntrials, metric_obj, data, model_type, cols):
#         self.random_state = state
#         self.n_trials = ntrials
#         self.best_weight = None
#         self.direction = metric_obj
#         self.data = data
#         self.model_type = model_type
#         self.cols = cols

#     def _objective(self, trial):
#         score_list = []
#         if self.model_type == 'lgb':
#             lgb_params = {
#                 'boosting_type': 'gbdt',
#                 'objective': 'regression',
#                 'metric': 'mse',
#                 'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#                 'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#                 'lambda_l2': trial.suggest_int('lambda_l2', 1, 20),
#                 'feature_fraction': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
#                 'bagging_fraction': trial.suggest_uniform('subsample', 0, 1.0),
#                 'bagging_freq': trial.suggest_int('subsample_freq', 1, 10),
#                 'learning_rate': trial.suggest_discrete_uniform('learning_rate',0.0,1.0,0.01),
#                 'seed': 2024,
#                 'verbose' : -1,
#             }
            
#         start_time = 20
#         end_time = 30
#         gap_time = 10
#         for i in range(start_time, end_time, gap_time):
#         # 训练集和验证集切分
#             trn_x, trn_y = data.loc[(data['dt']>=i+1)][self.cols], data.loc[(data['dt']>=i+1)]['target']
#             val_x, val_y = data.loc[(data['dt']<=i)][self.cols], data.loc[(data['dt']<=i)]['target']
#             if self.model_type == 'lgb':
#                 train_matrix = lgb.Dataset(trn_x, label=trn_y)
#                 valid_matrix = lgb.Dataset(val_x, label=val_y)

#             model = lgb.train(lgb_params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix])
#             val_pred = model.predict(val_x, num_iteration=model.best_iteration)
#             score = mean_squared_error(val_y, val_pred)
#             score_list.append(score)
#         return np.mean(score_list)

#     def fit(self):
#         self.study = optuna.create_study(direction=self.direction, sampler=TPESampler(seed=self.random_state))
#         self.study.optimize(self._objective, n_trials=self.n_trials)
#         self.best_weight = self.study.best_params

#     def weight(self):
#         return self.best_weight
    
# obj_opt = OptunaSingel(state=2024, 
#                        ntrials=500, 
#                        metric_obj='minimize', 
#                        data=train, 
#                        model_type='lgb', 
#                        cols=train_cols)
# obj_opt.fit()

In [22]:
%%time

from sklearn.metrics import mean_squared_error

fea_imp_dict = {}

def lgb_model(clf, train_df, test_df, cols):
    print('--------------------正在进行lgb模型训练--------------------')
    score_list = []
    test_oof = np.zeros(len(test_df))

    start_time = 20
    end_time = 50
    gap_time = 10

    for i in range(start_time, end_time, gap_time):
    # 训练集和验证集切分
        trn_x, trn_y = train_df.loc[(train_df['dt']>=i+1)][cols], train_df.loc[(train_df['dt']>=i+1)]['target']
        val_x, val_y = train_df.loc[(train_df['dt']<=i)][cols], train_df.loc[(train_df['dt']<=i)]['target']
        # 构建模型输入数据
        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)
        # lightgbm参数
        lgb_params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'mse',
            'min_child_weight': 5,
            'num_leaves': 2 ** 5,
            'lambda_l2': 10,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.05,
            'seed': 2024,
            'nthread' : 16,
            'verbose' : -1,
        }
        # 训练模型
        model = clf.train(lgb_params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], 
                        categorical_feature=[], callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(500)])
        # 验证集和测试集结果预测
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)
        fea_imp = dict(zip(model.feature_name(), model.feature_importance()))
        fea_imp_dict.setdefault(str(i),fea_imp)
        # 离线分数评估
        score = mean_squared_error(val_pred, val_y)
        score_list.append(score)
        test_oof += test_pred / ((end_time - start_time)/gap_time)
        print(score)
    print(score_list)
    print(np.mean(score_list))
    return val_pred, test_oof, model


def xgb_model(clf, train_df, test_df, cols):
    print('--------------------正在进行xg模型训练--------------------')
    score_list = []
    test_oof = np.zeros(len(test_df))

    start_time = 20
    end_time = 50
    gap_time = 10

    for i in range(start_time, end_time, gap_time):
        trn_x, trn_y = train_df.loc[(train_df['dt']>=i+1)][cols], train_df.loc[(train_df['dt']>=i+1)]['target']
        val_x, val_y = train_df.loc[(train_df['dt']<=i)][cols], train_df.loc[(train_df['dt']<=i)]['target']

        xgb_params = {
            'n_estimators': 30000,
            'learning_rate': 0.01,
            'booster': 'gbtree',
            # 'eval_metric': 'rmse',
            'subsample': 0.9,
            'colsample_bytree': 0.9,
            'min_child_weight': 10,
            'objective': 'reg:squarederror',
            'verbosity':0,
            'random_state': 2024,
            }

        def custom_mse(preds, dtrain):
            labels = dtrain.get_label()
            mse = mean_squared_error(labels, preds)
            return 'custom_mse', mse

        xgb_model = clf.XGBRegressor(**xgb_params)
        xgb_model.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], 
                    eval_metric=custom_mse, verbose=1000, early_stopping_rounds=100)

        # 验证集和测试集结果预测
        xgb_val_pred = xgb_model.predict(val_x)
        xgb_test_pred = xgb_model.predict(test_df[train_cols])

        # 离线分数评估
        score = mean_squared_error(xgb_val_pred, val_y)
        score_list.append(score)
        test_oof += xgb_test_pred / ((end_time - start_time)/gap_time)
        # 离线分数评估
        xgb_score = mean_squared_error(xgb_val_pred, val_y)
        print(xgb_score)

    print(score_list)
    print(np.mean(score_list))
    
    return xgb_val_pred, xgb_test_pred, xgb_model

lgb_oof, lgb_test, model_lgb = lgb_model(lgb, train, test, train_cols)
# xgb_oof, xgb_test, xgb_model = xgb_model(xgb, train, test, train_cols)

--------------------正在进行lgb模型训练--------------------
Training until validation scores don't improve for 500 rounds
[500]	training's l2: 129.19	valid_1's l2: 149.721
[1000]	training's l2: 114.432	valid_1's l2: 142.703
[1500]	training's l2: 105.409	valid_1's l2: 139.126
[2000]	training's l2: 99.2082	valid_1's l2: 137.028
[2500]	training's l2: 94.0693	valid_1's l2: 135.142
[3000]	training's l2: 90.106	valid_1's l2: 133.585
[3500]	training's l2: 86.8255	valid_1's l2: 132.312
[4000]	training's l2: 84.0066	valid_1's l2: 131.518
[4500]	training's l2: 81.5009	valid_1's l2: 130.511
[5000]	training's l2: 79.3018	valid_1's l2: 128.957
[5500]	training's l2: 77.29	valid_1's l2: 127.636
[6000]	training's l2: 75.5187	valid_1's l2: 127.505
[6500]	training's l2: 73.8015	valid_1's l2: 127.102
[7000]	training's l2: 72.2324	valid_1's l2: 126.383
[7500]	training's l2: 70.8188	valid_1's l2: 126.166
[8000]	training's l2: 69.544	valid_1's l2: 125.854
[8500]	training's l2: 68.3052	valid_1's l2: 125.463
[9000]	t

In [23]:
# 保存结果文件到本地
test['target'] = lgb_test
test['target'] = test['target'].apply(lambda x:0 if x<0 else x)
test[['id','dt','target']].to_csv('submit_0803_2_test.csv', index=None)