In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score
from mlxtend.regressor import StackingCVRegressor
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
%matplotlib inline

## 读取数据&数据集说明
1. [训练集]历史销量数据：train_sales_data_v1.csv

|字段名称|	字段类型	|字段说明
|--|--|--|
|province	|String|	省份|
|adcode	int	省份编码
|model|	String|	车型编码|
|bodyType|	String|	车身类型|
|regYear|	int|	年|
|regMonth|	int|	月|
|salesVolume|	int|	销量|

2. [训练集]车型搜索数据：train_search_data_v1.csv

|字段名称|	字段类型	|字段说明
|--|--|--|
|province|	String|	省份|
|adcode|	int|	省份编码|
|model|	String|	车型编码|
|regYear|	int|	年|
|regMonth|	int|	月|
|popularity|	int|	搜索量|

3. [训练集]汽车垂直媒体新闻评论数据和车型评论数据：train_user_reply_data_v1.csv

该数据集包含了垂直媒体中，各车型的每月（不分地域）论坛发帖数据、每月新闻评论数据、车型下的评论数据三部分，这三个数据没有任何包含关系。

|字段名称|	字段类型	|字段说明
|--|--|--|
|model|	String|	车型编码|
|regYear|	int|	年|
|regMonth|	int|	月|
|newsReplyVolum|	int|	对车型相关新闻文章的评论数量|
|carCommentVolum|	int|	对车型的评价数量|

4. [评测集]2018年1月至4月的各车型各省份销量预测：evaluation_public.csv

|字段名称|	字段类型	|字段说明|
|--|--|--|
|id|	int|	数据的唯一标识，不可更改|
|province|	String|	省份|
|adcode|	int|	省份编码改|
|model|	String|	车型编码|
|bodyType|	String|	车身类型|
|regYear|	int|	年|
|regMonth|	int|	月|
|forecastVolum|	int|	预测销量，参赛队伍使用建立的模型得出的销量预测结果|

In [105]:
path  = './ccf_car/'

train_sales  = pd.read_csv(path+'train_sales_data.csv')#历史销量数据
train_search = pd.read_csv(path+'train_search_data.csv')#车型搜索数据
train_user   = pd.read_csv(path+'train_user_reply_data.csv')#汽车垂直媒体新闻评论数据和车型评论数据

evaluation_public = pd.read_csv(path+'evaluation_public.csv')#2018年1月至4月的各车型各省份销量预测
submit_example    = pd.read_csv(path+'submit_example.csv')

In [106]:
data = pd.concat([train_sales, evaluation_public], ignore_index=True)#合并训练集和测试集
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])#将搜索数据与销量数据融合
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])#将评论数据与销量数据、搜索数据融合
data['label'] = data['salesVolume']#训练集的销量->label
data['id'] = data['id'].fillna(0).astype(int)#训练集的数据没有id，全部补零（测试集有id，范围为0~5368），id是最后需要提交的两列数据之一（另一个是forecastVolum）
del data['salesVolume'], data['forecastVolum']#salesVolume->label,forecastVolum目前全为0没有意义，故删去
'''
num_feat = ['adcode', 'regMonth', 'regYear', 'popularity', 'carCommentVolum', 'newsReplyVolum']#number_feature数字特征
cate_feat = ['bodyType', 'model', 'province']#categlory_feature类别特征

for i in cate_feat:
    data[i] = data[i].astype('category')#都转化为类别类型
features = num_feat + cate_feat#所有特征=数字特征+类别特征
'''

"\nnum_feat = ['adcode', 'regMonth', 'regYear', 'popularity', 'carCommentVolum', 'newsReplyVolum']#number_feature数字特征\ncate_feat = ['bodyType', 'model', 'province']#categlory_feature类别特征\n\nfor i in cate_feat:\n    data[i] = data[i].astype('category')#都转化为类别类型\nfeatures = num_feat + cate_feat#所有特征=数字特征+类别特征\n"

In [107]:
# 给省份分类,依据https://www.daas-auto.com/newsDe/892.html
province1=['广东','江苏','山东','浙江','河南']
province2=['河北', '四川', '北京']
province3=['上海', '湖北', '湖南', '安徽']
province4=['辽宁', '云南', '陕西', '福建', '贵州','广西','山西','江西','重庆']
province5=['吉林', '黑龙江', '天津', '内蒙古', '新疆', '甘肃']
province6=['海南','宁夏','青海','西藏']

In [108]:
data['province_rank']='0'
for i in range(len(data)):
    if data['province'][i] in province1:
        data['province_rank'][i]='1'
    elif data['province'][i] in province2:
        data['province_rank'][i]='2'
    elif data['province'][i] in province3:
        data['province_rank'][i]='3'
    elif data['province'][i] in province4:
        data['province_rank'][i]='4'
    elif data['province'][i] in province5:
        data['province_rank'][i]='5'
    elif data['province'][i] in province6:
        data['province_rank'][i]='6'
    else:
        data['province_rank'][i]='0'
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank
0,310000,SUV,0,3c974920a76ac9c1,上海,1,2016,1479.0,11.0,106.0,292.0,3
1,530000,SUV,0,3c974920a76ac9c1,云南,1,2016,1594.0,11.0,106.0,466.0,4
2,150000,SUV,0,3c974920a76ac9c1,内蒙古,1,2016,1479.0,11.0,106.0,257.0,5
3,110000,SUV,0,3c974920a76ac9c1,北京,1,2016,2370.0,11.0,106.0,408.0,2
4,510000,SUV,0,3c974920a76ac9c1,四川,1,2016,3562.0,11.0,106.0,610.0,2


In [109]:
# 时间分类
data['date_block_num']=0
tmp=pd.array([0]*len(data))
cnts=[]
for year in data['regYear'].unique():
    for month in data[data['regYear']==year]['regMonth'].unique():
        cnts.append(str(year)+'_'+str(month))
for i in range(len(data)):
    data['date_block_num'][i]=cnts.index(str(data['regYear'][i])+'_'+str(data['regMonth'][i]))+1
data.sample(10)

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num
2123,360000,Sedan,0,a432c483b5beb856,江西,2,2016,458.0,103.0,1380.0,227.0,4,2
17368,320000,Sedan,0,7cf283430b3b5e38,江苏,2,2017,8710.0,590.0,3974.0,2401.0,1,14
17677,360000,Sedan,0,02aab221aabc03b9,江西,2,2017,267.0,483.0,2156.0,146.0,4,14
28281,360000,SUV,0,a207df29ec9583f0,江西,10,2017,1292.0,256.0,717.0,164.0,4,22
95,140000,MPV,0,17bc272c93f19d56,山西,1,2016,5520.0,0.0,0.0,2166.0,4,1
422,510000,SUV,0,b25c4e2e3856af22,四川,1,2016,5442.0,70.0,1177.0,1631.0,2,1
885,340000,MPV,0,7aab7fca2470987e,安徽,1,2016,214.0,6.0,2791.0,66.0,3,1
21280,370000,Sedan,0,7023efdab9cedc03,山东,5,2017,3610.0,173.0,5946.0,374.0,1,17
5401,360000,SUV,0,2a2ab41f8f6ff1cb,江西,5,2016,2093.0,54.0,3900.0,761.0,4,5
32578,210000,,899,7aab7fca2470987e,辽宁,1,2018,,,,,4,25


In [171]:
data.date_block_num.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

# 初步结果

In [8]:
data.columns

Index(['adcode', 'bodyType', 'id', 'model', 'province', 'regMonth', 'regYear',
       'popularity', 'carCommentVolum', 'newsReplyVolum', 'label',
       'province_rank', 'date_block_num'],
      dtype='object')

In [9]:
cate_feat=['adcode','bodyType', 'model', 'province','province_rank']
#data['province_rank'].astype('int')
#print(1)
for i in cate_feat:
    data[i] = data[i].astype('category')#都转化为类别类型

In [10]:
X_train = data[data.date_block_num <= 20].drop(['label'], axis=1)
Y_train = data[data.date_block_num <= 20]['label']
X_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)].drop(['label'], axis=1)
Y_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)]['label']
X_test = data[data.date_block_num >= 25].drop(['label'], axis=1)

In [129]:
ts = time.time()

model = lgb.LGBMRegressor(objective='regression', 
                    metric='rmse',
                    num_leaves=4,
                    learning_rate=0.05, 
                    n_estimators=5000,
                    max_bin=400, 
                    bagging_fraction=0.75,
                    bagging_freq=5, 
                    bagging_seed=7,
                    feature_fraction=0.2,
                    feature_fraction_seed=7,
                    verbose=-1
                    #min_data_in_leaf=2,
                    #min_sum_hessian_in_leaf=11
                         )

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    categorical_feature=cate_feat,
    verbose=True, 
    early_stopping_rounds = 50)

time.time() - ts

[1]	training's rmse: 1.02201	valid_1's rmse: 1.09981
Training until validation scores don't improve for 50 rounds.
[2]	training's rmse: 1.00567	valid_1's rmse: 1.08281
[3]	training's rmse: 0.990541	valid_1's rmse: 1.06755
[4]	training's rmse: 0.978822	valid_1's rmse: 1.05665
[5]	training's rmse: 0.96678	valid_1's rmse: 1.04533
[6]	training's rmse: 0.953135	valid_1's rmse: 1.03155
[7]	training's rmse: 0.947768	valid_1's rmse: 1.02625
[8]	training's rmse: 0.931204	valid_1's rmse: 1.00908
[9]	training's rmse: 0.914899	valid_1's rmse: 0.992673
[10]	training's rmse: 0.91031	valid_1's rmse: 0.988124
[11]	training's rmse: 0.899568	valid_1's rmse: 0.97718
[12]	training's rmse: 0.896003	valid_1's rmse: 0.975142
[13]	training's rmse: 0.883613	valid_1's rmse: 0.962281
[14]	training's rmse: 0.873918	valid_1's rmse: 0.952465
[15]	training's rmse: 0.870923	valid_1's rmse: 0.95038
[16]	training's rmse: 0.859624	valid_1's rmse: 0.938512
[17]	training's rmse: 0.85108	valid_1's rmse: 0.929143
[18]	train

[290]	training's rmse: 0.52267	valid_1's rmse: 0.596198
[291]	training's rmse: 0.522631	valid_1's rmse: 0.595912
[292]	training's rmse: 0.522391	valid_1's rmse: 0.59556
[293]	training's rmse: 0.522292	valid_1's rmse: 0.595442
[294]	training's rmse: 0.522258	valid_1's rmse: 0.595436
[295]	training's rmse: 0.522036	valid_1's rmse: 0.595103
[296]	training's rmse: 0.521838	valid_1's rmse: 0.594896
[297]	training's rmse: 0.521628	valid_1's rmse: 0.59457
[298]	training's rmse: 0.521598	valid_1's rmse: 0.594518
[299]	training's rmse: 0.521523	valid_1's rmse: 0.594388
[300]	training's rmse: 0.521328	valid_1's rmse: 0.594092
[301]	training's rmse: 0.521261	valid_1's rmse: 0.593954
[302]	training's rmse: 0.521228	valid_1's rmse: 0.593748
[303]	training's rmse: 0.521201	valid_1's rmse: 0.593734
[304]	training's rmse: 0.520973	valid_1's rmse: 0.593463
[305]	training's rmse: 0.520771	valid_1's rmse: 0.593279
[306]	training's rmse: 0.520694	valid_1's rmse: 0.593088
[307]	training's rmse: 0.520674	va

[578]	training's rmse: 0.491737	valid_1's rmse: 0.564771
[579]	training's rmse: 0.491617	valid_1's rmse: 0.564622
[580]	training's rmse: 0.491583	valid_1's rmse: 0.564649
[581]	training's rmse: 0.491507	valid_1's rmse: 0.564625
[582]	training's rmse: 0.491368	valid_1's rmse: 0.56443
[583]	training's rmse: 0.490842	valid_1's rmse: 0.563864
[584]	training's rmse: 0.490736	valid_1's rmse: 0.563723
[585]	training's rmse: 0.490691	valid_1's rmse: 0.56357
[586]	training's rmse: 0.490664	valid_1's rmse: 0.563539
[587]	training's rmse: 0.490634	valid_1's rmse: 0.56352
[588]	training's rmse: 0.490335	valid_1's rmse: 0.562746
[589]	training's rmse: 0.490086	valid_1's rmse: 0.562107
[590]	training's rmse: 0.489889	valid_1's rmse: 0.561879
[591]	training's rmse: 0.489872	valid_1's rmse: 0.561854
[592]	training's rmse: 0.489828	valid_1's rmse: 0.561901
[593]	training's rmse: 0.489813	valid_1's rmse: 0.561961
[594]	training's rmse: 0.489803	valid_1's rmse: 0.561984
[595]	training's rmse: 0.489793	va

[866]	training's rmse: 0.468803	valid_1's rmse: 0.547659
[867]	training's rmse: 0.468781	valid_1's rmse: 0.547659
[868]	training's rmse: 0.468747	valid_1's rmse: 0.547633
[869]	training's rmse: 0.468708	valid_1's rmse: 0.547642
[870]	training's rmse: 0.468665	valid_1's rmse: 0.547628
[871]	training's rmse: 0.4686	valid_1's rmse: 0.547667
[872]	training's rmse: 0.468538	valid_1's rmse: 0.547678
[873]	training's rmse: 0.46843	valid_1's rmse: 0.547681
[874]	training's rmse: 0.468402	valid_1's rmse: 0.547709
[875]	training's rmse: 0.468348	valid_1's rmse: 0.547717
[876]	training's rmse: 0.468329	valid_1's rmse: 0.547625
[877]	training's rmse: 0.468313	valid_1's rmse: 0.547602
[878]	training's rmse: 0.468291	valid_1's rmse: 0.547567
[879]	training's rmse: 0.468277	valid_1's rmse: 0.547529
[880]	training's rmse: 0.468266	valid_1's rmse: 0.547502
[881]	training's rmse: 0.468184	valid_1's rmse: 0.547449
[882]	training's rmse: 0.468169	valid_1's rmse: 0.547533
[883]	training's rmse: 0.468123	va

[1151]	training's rmse: 0.451906	valid_1's rmse: 0.535898
[1152]	training's rmse: 0.451889	valid_1's rmse: 0.535873
[1153]	training's rmse: 0.451874	valid_1's rmse: 0.535564
[1154]	training's rmse: 0.451788	valid_1's rmse: 0.53543
[1155]	training's rmse: 0.451762	valid_1's rmse: 0.535379
[1156]	training's rmse: 0.451742	valid_1's rmse: 0.535315
[1157]	training's rmse: 0.451432	valid_1's rmse: 0.535107
[1158]	training's rmse: 0.451412	valid_1's rmse: 0.535095
[1159]	training's rmse: 0.451399	valid_1's rmse: 0.535083
[1160]	training's rmse: 0.451373	valid_1's rmse: 0.535031
[1161]	training's rmse: 0.45137	valid_1's rmse: 0.535002
[1162]	training's rmse: 0.45108	valid_1's rmse: 0.534881
[1163]	training's rmse: 0.451073	valid_1's rmse: 0.534858
[1164]	training's rmse: 0.450679	valid_1's rmse: 0.534539
[1165]	training's rmse: 0.45042	valid_1's rmse: 0.534434
[1166]	training's rmse: 0.45036	valid_1's rmse: 0.534366
[1167]	training's rmse: 0.450346	valid_1's rmse: 0.534363
[1168]	training's r

[1293]	training's rmse: 0.44535	valid_1's rmse: 0.53169
[1294]	training's rmse: 0.445346	valid_1's rmse: 0.531752
[1295]	training's rmse: 0.445313	valid_1's rmse: 0.531766
[1296]	training's rmse: 0.445303	valid_1's rmse: 0.531704
[1297]	training's rmse: 0.4453	valid_1's rmse: 0.531707
[1298]	training's rmse: 0.445214	valid_1's rmse: 0.531635
[1299]	training's rmse: 0.445161	valid_1's rmse: 0.531633
[1300]	training's rmse: 0.445158	valid_1's rmse: 0.531635
[1301]	training's rmse: 0.445121	valid_1's rmse: 0.53142
[1302]	training's rmse: 0.445119	valid_1's rmse: 0.531435
[1303]	training's rmse: 0.4451	valid_1's rmse: 0.531449
[1304]	training's rmse: 0.444901	valid_1's rmse: 0.531347
[1305]	training's rmse: 0.44489	valid_1's rmse: 0.531344
[1306]	training's rmse: 0.444827	valid_1's rmse: 0.531268
[1307]	training's rmse: 0.444771	valid_1's rmse: 0.531199
[1308]	training's rmse: 0.444757	valid_1's rmse: 0.531201
[1309]	training's rmse: 0.444519	valid_1's rmse: 0.530924
[1310]	training's rmse

[1575]	training's rmse: 0.432669	valid_1's rmse: 0.525001
[1576]	training's rmse: 0.432653	valid_1's rmse: 0.524975
[1577]	training's rmse: 0.432636	valid_1's rmse: 0.524943
[1578]	training's rmse: 0.432571	valid_1's rmse: 0.524829
[1579]	training's rmse: 0.432542	valid_1's rmse: 0.524808
[1580]	training's rmse: 0.432049	valid_1's rmse: 0.524371
[1581]	training's rmse: 0.432044	valid_1's rmse: 0.524382
[1582]	training's rmse: 0.432012	valid_1's rmse: 0.524342
[1583]	training's rmse: 0.431968	valid_1's rmse: 0.524376
[1584]	training's rmse: 0.431959	valid_1's rmse: 0.524378
[1585]	training's rmse: 0.431804	valid_1's rmse: 0.52429
[1586]	training's rmse: 0.431778	valid_1's rmse: 0.524289
[1587]	training's rmse: 0.431778	valid_1's rmse: 0.524334
[1588]	training's rmse: 0.431776	valid_1's rmse: 0.52438
[1589]	training's rmse: 0.431731	valid_1's rmse: 0.524436
[1590]	training's rmse: 0.431722	valid_1's rmse: 0.524389
[1591]	training's rmse: 0.431672	valid_1's rmse: 0.524343
[1592]	training'

[1857]	training's rmse: 0.420158	valid_1's rmse: 0.516234
[1858]	training's rmse: 0.420139	valid_1's rmse: 0.51621
[1859]	training's rmse: 0.420113	valid_1's rmse: 0.516235
[1860]	training's rmse: 0.4201	valid_1's rmse: 0.51624
[1861]	training's rmse: 0.420099	valid_1's rmse: 0.516271
[1862]	training's rmse: 0.420091	valid_1's rmse: 0.516257
[1863]	training's rmse: 0.420091	valid_1's rmse: 0.516231
[1864]	training's rmse: 0.419928	valid_1's rmse: 0.516045
[1865]	training's rmse: 0.419906	valid_1's rmse: 0.516081
[1866]	training's rmse: 0.419887	valid_1's rmse: 0.516095
[1867]	training's rmse: 0.419839	valid_1's rmse: 0.516107
[1868]	training's rmse: 0.419836	valid_1's rmse: 0.516103
[1869]	training's rmse: 0.419832	valid_1's rmse: 0.51607
[1870]	training's rmse: 0.41974	valid_1's rmse: 0.516084
[1871]	training's rmse: 0.419739	valid_1's rmse: 0.516084
[1872]	training's rmse: 0.419739	valid_1's rmse: 0.516085
[1873]	training's rmse: 0.419729	valid_1's rmse: 0.516129
[1874]	training's rm

[2139]	training's rmse: 0.410707	valid_1's rmse: 0.512079
[2140]	training's rmse: 0.41069	valid_1's rmse: 0.512042
[2141]	training's rmse: 0.410688	valid_1's rmse: 0.512034
Early stopping, best iteration is:
[2091]	training's rmse: 0.41181	valid_1's rmse: 0.512012


4.696442365646362

In [130]:
Y_test = np.expm1(model.predict(X_test))

In [131]:
submission = pd.DataFrame({
    "id": submit_example['id'], 
    "forecastVolum": Y_test.round().astype(int)
})
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: 10 if x < 0 else x)#<0的结果变为0
submission.to_csv('rough_feature_engineer_log_replace.csv', index=False)

# 进一步构造特征

In [110]:
# adcode与省份一一对应，删去province
data.drop('province',axis=1,inplace=True)

In [111]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num
0,310000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,292.0,3,1
1,530000,SUV,0,3c974920a76ac9c1,1,2016,1594.0,11.0,106.0,466.0,4,1
2,150000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,257.0,5,1
3,110000,SUV,0,3c974920a76ac9c1,1,2016,2370.0,11.0,106.0,408.0,2,1
4,510000,SUV,0,3c974920a76ac9c1,1,2016,3562.0,11.0,106.0,610.0,2,1


In [112]:
data.bodyType.unique()

array(['SUV', 'Sedan', 'MPV', 'Hatchback', nan], dtype=object)

In [113]:
SUV_model=data[data.bodyType=='SUV'].model.unique()

In [114]:
Sedan_model=data[data.bodyType=='Sedan'].model.unique()

In [115]:
MPV_model=data[data.bodyType=='MPV'].model.unique()

In [116]:
Hatchback_model=data[data.bodyType=='Hatchback'].model.unique()

上述型号没有交集，将测试集中的NaN补上，逆转上述字典，得到model->bodyType

In [117]:
model2body=dict([(x,'SUV') for x in SUV_model])
model2body=dict(dict([(x,'Sedan') for x in Sedan_model]),**model2body)
model2body=dict(dict([(x,'MPV') for x in MPV_model]),**model2body)
model2body=dict(dict([(x,'Hatchback') for x in Hatchback_model]),**model2body)

In [118]:
for i in data.index:
    data.loc[i,'bodyType']=model2body.get(data.iloc[i]['model'])

In [119]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','adcode','model',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','adcode','model', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','adcode','model'], how='left')
    return df

In [120]:
data.sample()

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num
36224,130000,Hatchback,4611,b4be3a4917289c82,4,2018,,,,,2,28


In [121]:
data.to_csv('data_before_mean.csv',index=False)

求均值

In [148]:
ts = time.time()
data = lag_feature(data, [4,5,6], 'label')
for i in data[data.id!=0].index:
    data.loc[i,'label']=(data.loc[i,'label_lag_4']*0.6+data.loc[i,'label_lag_5']*0.3+data.loc[i,'label_lag_6']*0.1)
data = lag_feature(data, [1,2,3], 'label')
time.time() - ts

2.492337226867676

In [149]:
# popularity
ts = time.time()
data = lag_feature(data, [4,5,6], 'popularity')
for i in data[data.id!=0].index:
    data.loc[i,'popularity']=(data.loc[i,'popularity_lag_4']*0.6+data.loc[i,'popularity_lag_5']*0.3+data.loc[i,'popularity_lag_6']*0.1)
data = lag_feature(data, [1,2,3], 'popularity')
time.time() - ts

2.490342855453491

In [150]:
# carCommentVolum
ts = time.time()
data = lag_feature(data, [4,5,6], 'carCommentVolum')
for i in data[data.id!=0].index:
    data.loc[i,'carCommentVolum']=(data.loc[i,'carCommentVolum_lag_4']*0.6+data.loc[i,'carCommentVolum_lag_5']*0.3+data.loc[i,'carCommentVolum_lag_6']*0.1)
data = lag_feature(data, [1,2,3], 'carCommentVolum')
time.time() - ts

2.5361924171447754

In [157]:
# newsReplyVolum
ts = time.time()
# 填补测试集
data = lag_feature(data, [4,5,6], 'newsReplyVolum')
for i in data[data.id!=0].index:
    data.loc[i,'newsReplyVolum']=(data.loc[i,'newsReplyVolum_lag_4']*0.6+data.loc[i,'newsReplyVolum_lag_5']*0.3+data.loc[i,'newsReplyVolum_lag_6']*0.1)
# 在所有数据上取均值
data = lag_feature(data, [1,2,3], 'newsReplyVolum')
time.time() - ts

2.506273031234741

In [160]:
ts = time.time()
month1_index=data[data['date_block_num']==1].index
month2_index=data[data['date_block_num']==2].index
month3_index=data[data['date_block_num']==3].index
for i in month1_index:
    data.loc[i,'label_last_3']=data.loc[i,'label']
    data.loc[i,'popularity_last_3']=data.loc[i,'popularity']
    data.loc[i,'carCommentVolum_last_3']=data.loc[i,'carCommentVolum']
    data.loc[i,'newsReplyVolum_last_3']=data.loc[i,'newsReplyVolum']
for i in month2_index:
    data.loc[i,'label_last_3']=(data.loc[i,'label']+data.loc[i,'label_lag_1'])/2.0
    data.loc[i,'popularity_last_3']=(data.loc[i,'popularity']+data.loc[i,'popularity_lag_1'])/2.0
    data.loc[i,'carCommentVolum_last_3']=(data.loc[i,'carCommentVolum']+data.loc[i,'carCommentVolum_lag_1'])/2.0
    data.loc[i,'newsReplyVolum_last_3']=(data.loc[i,'newsReplyVolum']+data.loc[i,'newsReplyVolum_lag_1'])/2.0
for i in month2_index:
    data.loc[i,'label_last_3']=(data.loc[i,'label']+data.loc[i,'label_lag_1']+data.loc[i,'label_lag_2'])/3.0
    data.loc[i,'popularity_last_3']=(data.loc[i,'popularity']+data.loc[i,'popularity_lag_1']+data.loc[i,'popularity_lag_2'])/3.0
    data.loc[i,'carCommentVolum_last_3']=(data.loc[i,'carCommentVolum']+data.loc[i,'carCommentVolum_lag_1']+data.loc[i,'carCommentVolum_lag_2'])/3.0
    data.loc[i,'newsReplyVolum_last_3']=(data.loc[i,'newsReplyVolum']+data.loc[i,'newsReplyVolum_lag_1']+data.loc[i,'newsReplyVolum_lag_2'])/3.0
other_index=data[(data.date_block_num!=1)&(data.date_block_num!=2)&(data.date_block_num!=3)].index
for i in other_index:
    data.loc[i,'label_last_3']=(data.loc[i,'label_lag_3']+data.loc[i,'label_lag_1']+data.loc[i,'label_lag_2'])/3.0
    data.loc[i,'popularity_last_3']=(data.loc[i,'popularity_lag_3']+data.loc[i,'popularity_lag_1']+data.loc[i,'popularity_lag_2'])/3.0
    data.loc[i,'carCommentVolum_last_3']=(data.loc[i,'carCommentVolum_lag_3']+data.loc[i,'carCommentVolum_lag_1']+data.loc[i,'carCommentVolum_lag_2'])/3.0
    data.loc[i,'newsReplyVolum_last_3']=(data.loc[i,'newsReplyVolum_lag_3']+data.loc[i,'newsReplyVolum_lag_1']+data.loc[i,'newsReplyVolum_lag_2'])/3.0
time.time() - ts

65.75620937347412

In [164]:
data.drop(['label_lag_4', 'label_lag_5',
       'label_lag_6', 'label_lag_1', 'label_lag_2', 'label_lag_3',
       'popularity_lag_4', 'popularity_lag_5', 'popularity_lag_6',
       'popularity_lag_1', 'popularity_lag_2', 'popularity_lag_3',
       'carCommentVolum_lag_4', 'carCommentVolum_lag_5',
       'carCommentVolum_lag_6', 'carCommentVolum_lag_1',
       'carCommentVolum_lag_2', 'carCommentVolum_lag_3',
       'newsReplyVolum_lag_4', 'newsReplyVolum_lag_5', 'newsReplyVolum_lag_6',
       'newsReplyVolum_lag_1', 'newsReplyVolum_lag_2', 'newsReplyVolum_lag_3'],axis=1,inplace=True)

In [165]:
data.sample(30)

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num,label_last_3,popularity_last_3,carCommentVolum_last_3,newsReplyVolum_last_3
18643,450000,Sedan,0,7023efdab9cedc03,3,2017,770.0,257.0,173.0,47.0,4,15,77.0,723.333333,187.666667,1872.0
16599,360000,Sedan,0,dff803b4024d261d,1,2017,217.0,263.0,65.0,264.0,4,13,214.0,200.666667,274.0,405.0
10586,510000,Sedan,0,3d7554f1f56dd664,9,2016,5205.0,198.0,4708.0,1227.0,2,9,699.333333,3816.333333,130.666667,4474.666667
24445,110000,MPV,0,79de4e4b24c35b04,7,2017,3295.0,148.0,4037.0,116.0,2,19,129.0,3450.666667,277.333333,2433.0
25590,510000,Sedan,0,02aab221aabc03b9,8,2017,4011.0,843.0,2982.0,576.0,2,20,589.666667,1707.333333,496.666667,1085.666667
16325,530000,SUV,0,346393c2c6305fb1,1,2017,177.0,86.0,695.0,95.0,4,13,147.333333,199.333333,105.0,31.333333
22649,360000,Sedan,0,7cf283430b3b5e38,6,2017,2584.0,1181.0,2898.0,1662.0,4,18,1741.666667,2688.0,967.0,2101.0
26025,230000,Sedan,0,97f15de12cfabbd5,8,2017,2137.0,640.0,2697.0,522.0,5,20,429.0,1903.666667,415.666667,1697.333333
15095,110000,Hatchback,0,b4be3a4917289c82,12,2016,6803.0,76.0,125.0,226.0,2,12,220.666667,6630.0,59.333333,2377.333333
5793,140000,Sedan,0,02aab221aabc03b9,5,2016,152.0,283.0,933.0,114.0,4,5,87.666667,154.333333,253.333333,2554.0


# 训练

In [168]:
cate_feat=['adcode','bodyType', 'model','province_rank']
#data['province_rank'].astype('int')
#print(1)
for i in cate_feat:
    data[i] = data[i].astype('category')#都转化为类别类型

In [169]:
X_train = data[data.date_block_num <= 20].drop(['label'], axis=1)
Y_train = data[data.date_block_num <= 20]['label']
X_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)].drop(['label'], axis=1)
Y_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)]['label']
X_test = data[data.date_block_num >= 25].drop(['label'], axis=1)

In [170]:
ts = time.time()

model = lgb.LGBMRegressor(objective='regression', 
                    metric='rmse',
                    num_leaves=4,
                    learning_rate=0.01, 
                    n_estimators=5000,
                    max_bin=400, 
                    bagging_fraction=0.75,
                    bagging_freq=5, 
                    bagging_seed=7,
                    feature_fraction=0.2,
                    feature_fraction_seed=7,
                    verbose=2
                    #min_data_in_leaf=2,
                    #min_sum_hessian_in_leaf=11
                         )

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    categorical_feature=cate_feat,
    verbose=True, 
    early_stopping_rounds = 50)

time.time() - ts

[1]	training's rmse: 745.725	valid_1's rmse: 921.781
Training until validation scores don't improve for 50 rounds.
[2]	training's rmse: 744.366	valid_1's rmse: 920.372
[3]	training's rmse: 744.088	valid_1's rmse: 920.083
[4]	training's rmse: 743.767	valid_1's rmse: 919.54
[5]	training's rmse: 738.576	valid_1's rmse: 913.706
[6]	training's rmse: 737.288	valid_1's rmse: 912.002
[7]	training's rmse: 735.617	valid_1's rmse: 910.41
[8]	training's rmse: 734.216	valid_1's rmse: 909.151
[9]	training's rmse: 733.951	valid_1's rmse: 908.959
[10]	training's rmse: 732.795	valid_1's rmse: 907.891
[11]	training's rmse: 730.956	valid_1's rmse: 906.035
[12]	training's rmse: 729.214	valid_1's rmse: 903.45
[13]	training's rmse: 727.536	valid_1's rmse: 901.662
[14]	training's rmse: 726.413	valid_1's rmse: 900.84
[15]	training's rmse: 724.762	valid_1's rmse: 899.079
[16]	training's rmse: 723.143	valid_1's rmse: 897.35
[17]	training's rmse: 722.535	valid_1's rmse: 896.924
[18]	training's rmse: 721.392	vali

[151]	training's rmse: 577.856	valid_1's rmse: 742.591
[152]	training's rmse: 577.159	valid_1's rmse: 741.799
[153]	training's rmse: 573.822	valid_1's rmse: 737.815
[154]	training's rmse: 573.628	valid_1's rmse: 737.709
[155]	training's rmse: 573.485	valid_1's rmse: 737.392
[156]	training's rmse: 573.3	valid_1's rmse: 737.291
[157]	training's rmse: 570.055	valid_1's rmse: 733.457
[158]	training's rmse: 569.963	valid_1's rmse: 733.561
[159]	training's rmse: 569.849	valid_1's rmse: 733.381
[160]	training's rmse: 569.667	valid_1's rmse: 733.28
[161]	training's rmse: 566.515	valid_1's rmse: 729.587
[162]	training's rmse: 566.335	valid_1's rmse: 729.492
[163]	training's rmse: 565.568	valid_1's rmse: 728.685
[164]	training's rmse: 565.352	valid_1's rmse: 728.506
[165]	training's rmse: 564.772	valid_1's rmse: 727.888
[166]	training's rmse: 564.077	valid_1's rmse: 727.127
[167]	training's rmse: 563.227	valid_1's rmse: 726.159
[168]	training's rmse: 563.136	valid_1's rmse: 726.201
[169]	trainin

[300]	training's rmse: 456.409	valid_1's rmse: 597.014
[301]	training's rmse: 456.284	valid_1's rmse: 597.055
[302]	training's rmse: 456.216	valid_1's rmse: 596.996
[303]	training's rmse: 456.135	valid_1's rmse: 596.899
[304]	training's rmse: 455.914	valid_1's rmse: 596.604
[305]	training's rmse: 455.759	valid_1's rmse: 596.402
[306]	training's rmse: 455.553	valid_1's rmse: 596.117
[307]	training's rmse: 455.13	valid_1's rmse: 595.599
[308]	training's rmse: 454.942	valid_1's rmse: 595.626
[309]	training's rmse: 454.718	valid_1's rmse: 595.373
[310]	training's rmse: 452.875	valid_1's rmse: 592.941
[311]	training's rmse: 452.754	valid_1's rmse: 592.982
[312]	training's rmse: 451.083	valid_1's rmse: 590.727
[313]	training's rmse: 451.016	valid_1's rmse: 590.811
[314]	training's rmse: 450.759	valid_1's rmse: 590.503
[315]	training's rmse: 450.364	valid_1's rmse: 590.023
[316]	training's rmse: 450.247	valid_1's rmse: 590.055
[317]	training's rmse: 449.82	valid_1's rmse: 589.224
[318]	traini

[449]	training's rmse: 409.281	valid_1's rmse: 534.712
[450]	training's rmse: 409.222	valid_1's rmse: 534.611
[451]	training's rmse: 409.136	valid_1's rmse: 534.386
[452]	training's rmse: 409.032	valid_1's rmse: 534.257
[453]	training's rmse: 408.957	valid_1's rmse: 534.123
[454]	training's rmse: 408.921	valid_1's rmse: 534.085
[455]	training's rmse: 408.894	valid_1's rmse: 534.049
[456]	training's rmse: 408.634	valid_1's rmse: 533.703
[457]	training's rmse: 408.584	valid_1's rmse: 533.763
[458]	training's rmse: 407.385	valid_1's rmse: 532.014
[459]	training's rmse: 407.132	valid_1's rmse: 531.679
[460]	training's rmse: 407.039	valid_1's rmse: 531.481
[461]	training's rmse: 406.964	valid_1's rmse: 531.369
[462]	training's rmse: 406.913	valid_1's rmse: 531.31
[463]	training's rmse: 406.804	valid_1's rmse: 531.154
[464]	training's rmse: 406.7	valid_1's rmse: 531.144
[465]	training's rmse: 406.643	valid_1's rmse: 531.223
[466]	training's rmse: 405.454	valid_1's rmse: 529.493
[467]	trainin

[598]	training's rmse: 375.048	valid_1's rmse: 484.532
[599]	training's rmse: 374.217	valid_1's rmse: 483.902
[600]	training's rmse: 374.193	valid_1's rmse: 483.879
[601]	training's rmse: 374.11	valid_1's rmse: 483.878
[602]	training's rmse: 374.075	valid_1's rmse: 483.958
[603]	training's rmse: 374.052	valid_1's rmse: 483.925
[604]	training's rmse: 374.025	valid_1's rmse: 483.863
[605]	training's rmse: 373.985	valid_1's rmse: 483.795
[606]	training's rmse: 373.945	valid_1's rmse: 483.676
[607]	training's rmse: 373.622	valid_1's rmse: 483.319
[608]	training's rmse: 373.556	valid_1's rmse: 483.297
[609]	training's rmse: 373.535	valid_1's rmse: 483.27
[610]	training's rmse: 373.484	valid_1's rmse: 483.194
[611]	training's rmse: 373.454	valid_1's rmse: 483.137
[612]	training's rmse: 373.299	valid_1's rmse: 482.954
[613]	training's rmse: 373.232	valid_1's rmse: 482.808
[614]	training's rmse: 373.17	valid_1's rmse: 482.79
[615]	training's rmse: 373.136	valid_1's rmse: 482.87
[616]	training'

[747]	training's rmse: 353.715	valid_1's rmse: 455.699
[748]	training's rmse: 353.686	valid_1's rmse: 455.617
[749]	training's rmse: 353.669	valid_1's rmse: 455.618
[750]	training's rmse: 353.6	valid_1's rmse: 455.596
[751]	training's rmse: 353.328	valid_1's rmse: 455.207
[752]	training's rmse: 353.312	valid_1's rmse: 455.182
[753]	training's rmse: 353.285	valid_1's rmse: 455.117
[754]	training's rmse: 352.712	valid_1's rmse: 454.175
[755]	training's rmse: 352.662	valid_1's rmse: 454.222
[756]	training's rmse: 352.628	valid_1's rmse: 454.16
[757]	training's rmse: 352.61	valid_1's rmse: 454.125
[758]	training's rmse: 352.583	valid_1's rmse: 454.166
[759]	training's rmse: 352.553	valid_1's rmse: 454.062
[760]	training's rmse: 352.541	valid_1's rmse: 454.062
[761]	training's rmse: 351.938	valid_1's rmse: 452.96
[762]	training's rmse: 351.914	valid_1's rmse: 452.893
[763]	training's rmse: 351.342	valid_1's rmse: 451.925
[764]	training's rmse: 350.781	valid_1's rmse: 450.912
[765]	training'

[896]	training's rmse: 337.465	valid_1's rmse: 431.685
[897]	training's rmse: 337.413	valid_1's rmse: 431.668
[898]	training's rmse: 337.025	valid_1's rmse: 430.906
[899]	training's rmse: 336.978	valid_1's rmse: 430.97
[900]	training's rmse: 336.955	valid_1's rmse: 430.948
[901]	training's rmse: 336.944	valid_1's rmse: 430.9
[902]	training's rmse: 336.925	valid_1's rmse: 430.901
[903]	training's rmse: 336.9	valid_1's rmse: 430.896
[904]	training's rmse: 336.836	valid_1's rmse: 430.821
[905]	training's rmse: 336.817	valid_1's rmse: 430.806
[906]	training's rmse: 336.796	valid_1's rmse: 430.8
[907]	training's rmse: 336.774	valid_1's rmse: 430.719
[908]	training's rmse: 336.744	valid_1's rmse: 430.623
[909]	training's rmse: 336.714	valid_1's rmse: 430.493
[910]	training's rmse: 336.192	valid_1's rmse: 429.381
[911]	training's rmse: 336.178	valid_1's rmse: 429.374
[912]	training's rmse: 335.781	valid_1's rmse: 428.585
[913]	training's rmse: 335.766	valid_1's rmse: 428.561
[914]	training's 

[1044]	training's rmse: 324.076	valid_1's rmse: 410.673
[1045]	training's rmse: 324.055	valid_1's rmse: 410.645
[1046]	training's rmse: 324.048	valid_1's rmse: 410.625
[1047]	training's rmse: 323.999	valid_1's rmse: 410.588
[1048]	training's rmse: 323.955	valid_1's rmse: 410.637
[1049]	training's rmse: 323.907	valid_1's rmse: 410.601
[1050]	training's rmse: 323.877	valid_1's rmse: 410.584
[1051]	training's rmse: 323.832	valid_1's rmse: 410.543
[1052]	training's rmse: 323.827	valid_1's rmse: 410.557
[1053]	training's rmse: 323.817	valid_1's rmse: 410.553
[1054]	training's rmse: 323.798	valid_1's rmse: 410.54
[1055]	training's rmse: 323.783	valid_1's rmse: 410.542
[1056]	training's rmse: 323.774	valid_1's rmse: 410.519
[1057]	training's rmse: 323.737	valid_1's rmse: 410.568
[1058]	training's rmse: 323.719	valid_1's rmse: 410.558
[1059]	training's rmse: 323.703	valid_1's rmse: 410.566
[1060]	training's rmse: 323.649	valid_1's rmse: 410.548
[1061]	training's rmse: 323.638	valid_1's rmse: 4

[1190]	training's rmse: 316.657	valid_1's rmse: 401.338
[1191]	training's rmse: 316.646	valid_1's rmse: 401.343
[1192]	training's rmse: 316.619	valid_1's rmse: 401.23
[1193]	training's rmse: 316.607	valid_1's rmse: 401.23
[1194]	training's rmse: 316.361	valid_1's rmse: 400.757
[1195]	training's rmse: 316.347	valid_1's rmse: 400.763
[1196]	training's rmse: 316.111	valid_1's rmse: 400.288
[1197]	training's rmse: 315.889	valid_1's rmse: 399.798
[1198]	training's rmse: 315.852	valid_1's rmse: 399.775
[1199]	training's rmse: 315.822	valid_1's rmse: 399.743
[1200]	training's rmse: 315.797	valid_1's rmse: 399.775
[1201]	training's rmse: 315.787	valid_1's rmse: 399.777
[1202]	training's rmse: 315.764	valid_1's rmse: 399.75
[1203]	training's rmse: 315.703	valid_1's rmse: 399.711
[1204]	training's rmse: 315.686	valid_1's rmse: 399.718
[1205]	training's rmse: 315.674	valid_1's rmse: 399.713
[1206]	training's rmse: 315.623	valid_1's rmse: 399.699
[1207]	training's rmse: 315.616	valid_1's rmse: 399

[1337]	training's rmse: 308.576	valid_1's rmse: 389.434
[1338]	training's rmse: 308.551	valid_1's rmse: 389.406
[1339]	training's rmse: 308.52	valid_1's rmse: 389.45
[1340]	training's rmse: 308.506	valid_1's rmse: 389.453
[1341]	training's rmse: 308.504	valid_1's rmse: 389.456
[1342]	training's rmse: 308.474	valid_1's rmse: 389.409
[1343]	training's rmse: 308.458	valid_1's rmse: 389.4
[1344]	training's rmse: 308.048	valid_1's rmse: 388.541
[1345]	training's rmse: 307.845	valid_1's rmse: 388.094
[1346]	training's rmse: 307.835	valid_1's rmse: 388.069
[1347]	training's rmse: 307.829	valid_1's rmse: 388.065
[1348]	training's rmse: 307.802	valid_1's rmse: 388.046
[1349]	training's rmse: 307.793	valid_1's rmse: 388.03
[1350]	training's rmse: 307.78	valid_1's rmse: 388.045
[1351]	training's rmse: 307.778	valid_1's rmse: 388.047
[1352]	training's rmse: 307.751	valid_1's rmse: 388.061
[1353]	training's rmse: 307.74	valid_1's rmse: 388.07
[1354]	training's rmse: 307.53	valid_1's rmse: 387.691
[

[1484]	training's rmse: 299.807	valid_1's rmse: 376.15
[1485]	training's rmse: 299.786	valid_1's rmse: 376.159
[1486]	training's rmse: 299.774	valid_1's rmse: 376.154
[1487]	training's rmse: 299.619	valid_1's rmse: 375.781
[1488]	training's rmse: 299.588	valid_1's rmse: 375.796
[1489]	training's rmse: 299.576	valid_1's rmse: 375.787
[1490]	training's rmse: 299.423	valid_1's rmse: 375.423
[1491]	training's rmse: 299.405	valid_1's rmse: 375.437
[1492]	training's rmse: 299.384	valid_1's rmse: 375.46
[1493]	training's rmse: 299.24	valid_1's rmse: 375.12
[1494]	training's rmse: 299.219	valid_1's rmse: 375.08
[1495]	training's rmse: 299.207	valid_1's rmse: 375.083
[1496]	training's rmse: 299.194	valid_1's rmse: 375.08
[1497]	training's rmse: 299.18	valid_1's rmse: 375.045
[1498]	training's rmse: 299.16	valid_1's rmse: 375.015
[1499]	training's rmse: 299.152	valid_1's rmse: 375.013
[1500]	training's rmse: 299.133	valid_1's rmse: 374.983
[1501]	training's rmse: 299.112	valid_1's rmse: 374.974


[1631]	training's rmse: 293.514	valid_1's rmse: 366.701
[1632]	training's rmse: 293.488	valid_1's rmse: 366.695
[1633]	training's rmse: 293.478	valid_1's rmse: 366.69
[1634]	training's rmse: 293.448	valid_1's rmse: 366.698
[1635]	training's rmse: 293.422	valid_1's rmse: 366.692
[1636]	training's rmse: 293.408	valid_1's rmse: 366.674
[1637]	training's rmse: 293.39	valid_1's rmse: 366.65
[1638]	training's rmse: 293.389	valid_1's rmse: 366.653
[1639]	training's rmse: 293.252	valid_1's rmse: 366.372
[1640]	training's rmse: 293.245	valid_1's rmse: 366.368
[1641]	training's rmse: 293.194	valid_1's rmse: 366.211
[1642]	training's rmse: 293.182	valid_1's rmse: 366.209
[1643]	training's rmse: 293.157	valid_1's rmse: 366.223
[1644]	training's rmse: 293.023	valid_1's rmse: 365.919
[1645]	training's rmse: 292.996	valid_1's rmse: 365.92
[1646]	training's rmse: 292.986	valid_1's rmse: 365.9
[1647]	training's rmse: 292.965	valid_1's rmse: 365.936
[1648]	training's rmse: 292.756	valid_1's rmse: 365.53

[1777]	training's rmse: 288.519	valid_1's rmse: 360.594
[1778]	training's rmse: 288.511	valid_1's rmse: 360.595
[1779]	training's rmse: 288.502	valid_1's rmse: 360.589
[1780]	training's rmse: 288.474	valid_1's rmse: 360.542
[1781]	training's rmse: 288.459	valid_1's rmse: 360.542
[1782]	training's rmse: 288.453	valid_1's rmse: 360.533
[1783]	training's rmse: 288.443	valid_1's rmse: 360.527
[1784]	training's rmse: 288.432	valid_1's rmse: 360.524
[1785]	training's rmse: 288.408	valid_1's rmse: 360.53
[1786]	training's rmse: 288.399	valid_1's rmse: 360.532
[1787]	training's rmse: 288.389	valid_1's rmse: 360.534
[1788]	training's rmse: 288.364	valid_1's rmse: 360.543
[1789]	training's rmse: 288.253	valid_1's rmse: 360.287
[1790]	training's rmse: 288.252	valid_1's rmse: 360.292
[1791]	training's rmse: 288.216	valid_1's rmse: 360.286
[1792]	training's rmse: 288.191	valid_1's rmse: 360.294
[1793]	training's rmse: 288.18	valid_1's rmse: 360.306
[1794]	training's rmse: 288.169	valid_1's rmse: 36

[1923]	training's rmse: 283.827	valid_1's rmse: 354.637
[1924]	training's rmse: 283.816	valid_1's rmse: 354.626
[1925]	training's rmse: 283.802	valid_1's rmse: 354.631
[1926]	training's rmse: 283.801	valid_1's rmse: 354.628
[1927]	training's rmse: 283.781	valid_1's rmse: 354.624
[1928]	training's rmse: 283.765	valid_1's rmse: 354.619
[1929]	training's rmse: 283.758	valid_1's rmse: 354.611
[1930]	training's rmse: 283.748	valid_1's rmse: 354.612
[1931]	training's rmse: 283.737	valid_1's rmse: 354.599
[1932]	training's rmse: 283.729	valid_1's rmse: 354.584
[1933]	training's rmse: 283.724	valid_1's rmse: 354.578
[1934]	training's rmse: 283.705	valid_1's rmse: 354.539
[1935]	training's rmse: 283.674	valid_1's rmse: 354.469
[1936]	training's rmse: 283.578	valid_1's rmse: 354.211
[1937]	training's rmse: 283.545	valid_1's rmse: 354.21
[1938]	training's rmse: 283.538	valid_1's rmse: 354.217
[1939]	training's rmse: 283.44	valid_1's rmse: 353.953
[1940]	training's rmse: 283.334	valid_1's rmse: 35

[2069]	training's rmse: 279.623	valid_1's rmse: 348.755
[2070]	training's rmse: 279.609	valid_1's rmse: 348.754
[2071]	training's rmse: 279.566	valid_1's rmse: 348.757
[2072]	training's rmse: 279.525	valid_1's rmse: 348.759
[2073]	training's rmse: 279.515	valid_1's rmse: 348.768
[2074]	training's rmse: 279.505	valid_1's rmse: 348.767
[2075]	training's rmse: 279.414	valid_1's rmse: 348.523
[2076]	training's rmse: 279.207	valid_1's rmse: 348.22
[2077]	training's rmse: 279.185	valid_1's rmse: 348.232
[2078]	training's rmse: 279.175	valid_1's rmse: 348.232
[2079]	training's rmse: 279.169	valid_1's rmse: 348.239
[2080]	training's rmse: 279.145	valid_1's rmse: 348.202
[2081]	training's rmse: 279.133	valid_1's rmse: 348.183
[2082]	training's rmse: 279.111	valid_1's rmse: 348.194
[2083]	training's rmse: 279.098	valid_1's rmse: 348.196
[2084]	training's rmse: 279.089	valid_1's rmse: 348.188
[2085]	training's rmse: 278.894	valid_1's rmse: 347.771
[2086]	training's rmse: 278.888	valid_1's rmse: 3

[2215]	training's rmse: 275.584	valid_1's rmse: 343.61
[2216]	training's rmse: 275.561	valid_1's rmse: 343.618
[2217]	training's rmse: 275.495	valid_1's rmse: 343.437
[2218]	training's rmse: 275.494	valid_1's rmse: 343.437
[2219]	training's rmse: 275.459	valid_1's rmse: 343.433
[2220]	training's rmse: 275.437	valid_1's rmse: 343.441
[2221]	training's rmse: 275.381	valid_1's rmse: 343.4
[2222]	training's rmse: 275.36	valid_1's rmse: 343.41
[2223]	training's rmse: 275.339	valid_1's rmse: 343.42
[2224]	training's rmse: 275.333	valid_1's rmse: 343.409
[2225]	training's rmse: 275.328	valid_1's rmse: 343.412
[2226]	training's rmse: 275.302	valid_1's rmse: 343.408
[2227]	training's rmse: 275.293	valid_1's rmse: 343.412
[2228]	training's rmse: 275.27	valid_1's rmse: 343.409
[2229]	training's rmse: 275.258	valid_1's rmse: 343.413
[2230]	training's rmse: 275.227	valid_1's rmse: 343.43
[2231]	training's rmse: 275.218	valid_1's rmse: 343.43
[2232]	training's rmse: 275.209	valid_1's rmse: 343.431
[

[2361]	training's rmse: 271.783	valid_1's rmse: 338.394
[2362]	training's rmse: 271.741	valid_1's rmse: 338.411
[2363]	training's rmse: 271.729	valid_1's rmse: 338.411
[2364]	training's rmse: 271.716	valid_1's rmse: 338.422
[2365]	training's rmse: 271.661	valid_1's rmse: 338.279
[2366]	training's rmse: 271.646	valid_1's rmse: 338.287
[2367]	training's rmse: 271.626	valid_1's rmse: 338.279
[2368]	training's rmse: 271.557	valid_1's rmse: 338.104
[2369]	training's rmse: 271.544	valid_1's rmse: 338.126
[2370]	training's rmse: 271.526	valid_1's rmse: 338.128
[2371]	training's rmse: 271.516	valid_1's rmse: 338.134
[2372]	training's rmse: 271.49	valid_1's rmse: 338.188
[2373]	training's rmse: 271.466	valid_1's rmse: 338.186
[2374]	training's rmse: 271.46	valid_1's rmse: 338.189
[2375]	training's rmse: 271.452	valid_1's rmse: 338.195
[2376]	training's rmse: 271.451	valid_1's rmse: 338.195
[2377]	training's rmse: 271.444	valid_1's rmse: 338.205
[2378]	training's rmse: 271.434	valid_1's rmse: 33

[2507]	training's rmse: 268.978	valid_1's rmse: 335.484
[2508]	training's rmse: 268.975	valid_1's rmse: 335.488
[2509]	training's rmse: 268.96	valid_1's rmse: 335.487
[2510]	training's rmse: 268.948	valid_1's rmse: 335.479
[2511]	training's rmse: 268.921	valid_1's rmse: 335.466
[2512]	training's rmse: 268.913	valid_1's rmse: 335.477
[2513]	training's rmse: 268.902	valid_1's rmse: 335.48
[2514]	training's rmse: 268.886	valid_1's rmse: 335.435
[2515]	training's rmse: 268.879	valid_1's rmse: 335.44
[2516]	training's rmse: 268.872	valid_1's rmse: 335.422
[2517]	training's rmse: 268.86	valid_1's rmse: 335.423
[2518]	training's rmse: 268.797	valid_1's rmse: 335.267
[2519]	training's rmse: 268.786	valid_1's rmse: 335.272
[2520]	training's rmse: 268.771	valid_1's rmse: 335.269
[2521]	training's rmse: 268.762	valid_1's rmse: 335.275
[2522]	training's rmse: 268.753	valid_1's rmse: 335.271
[2523]	training's rmse: 268.745	valid_1's rmse: 335.279
[2524]	training's rmse: 268.732	valid_1's rmse: 335.

[2653]	training's rmse: 265.939	valid_1's rmse: 331.224
[2654]	training's rmse: 265.926	valid_1's rmse: 331.214
[2655]	training's rmse: 265.921	valid_1's rmse: 331.213
[2656]	training's rmse: 265.905	valid_1's rmse: 331.221
[2657]	training's rmse: 265.899	valid_1's rmse: 331.225
[2658]	training's rmse: 265.889	valid_1's rmse: 331.209
[2659]	training's rmse: 265.881	valid_1's rmse: 331.213
[2660]	training's rmse: 265.871	valid_1's rmse: 331.23
[2661]	training's rmse: 265.861	valid_1's rmse: 331.231
[2662]	training's rmse: 265.86	valid_1's rmse: 331.231
[2663]	training's rmse: 265.842	valid_1's rmse: 331.24
[2664]	training's rmse: 265.829	valid_1's rmse: 331.247
[2665]	training's rmse: 265.822	valid_1's rmse: 331.248
[2666]	training's rmse: 265.809	valid_1's rmse: 331.251
[2667]	training's rmse: 265.804	valid_1's rmse: 331.246
[2668]	training's rmse: 265.797	valid_1's rmse: 331.249
[2669]	training's rmse: 265.74	valid_1's rmse: 331.096
[2670]	training's rmse: 265.735	valid_1's rmse: 331.

[2799]	training's rmse: 263.279	valid_1's rmse: 328.347
[2800]	training's rmse: 263.273	valid_1's rmse: 328.346
[2801]	training's rmse: 263.231	valid_1's rmse: 328.338
[2802]	training's rmse: 263.189	valid_1's rmse: 328.243
[2803]	training's rmse: 263.182	valid_1's rmse: 328.245
[2804]	training's rmse: 263.175	valid_1's rmse: 328.248
[2805]	training's rmse: 263.161	valid_1's rmse: 328.206
[2806]	training's rmse: 263.055	valid_1's rmse: 328.049
[2807]	training's rmse: 263.045	valid_1's rmse: 328.045
[2808]	training's rmse: 263.03	valid_1's rmse: 328.06
[2809]	training's rmse: 263.025	valid_1's rmse: 328.064
[2810]	training's rmse: 263.02	valid_1's rmse: 328.067
[2811]	training's rmse: 263.012	valid_1's rmse: 328.074
[2812]	training's rmse: 263.004	valid_1's rmse: 328.08
[2813]	training's rmse: 262.995	valid_1's rmse: 328.086
[2814]	training's rmse: 262.99	valid_1's rmse: 328.102
[2815]	training's rmse: 262.982	valid_1's rmse: 328.108
[2816]	training's rmse: 262.971	valid_1's rmse: 328.0

[2945]	training's rmse: 260.332	valid_1's rmse: 324.931
[2946]	training's rmse: 260.326	valid_1's rmse: 324.926
[2947]	training's rmse: 260.319	valid_1's rmse: 324.935
[2948]	training's rmse: 260.315	valid_1's rmse: 324.937
[2949]	training's rmse: 260.308	valid_1's rmse: 324.913
[2950]	training's rmse: 260.295	valid_1's rmse: 324.927
[2951]	training's rmse: 260.286	valid_1's rmse: 324.926
[2952]	training's rmse: 260.245	valid_1's rmse: 324.802
[2953]	training's rmse: 260.236	valid_1's rmse: 324.8
[2954]	training's rmse: 260.226	valid_1's rmse: 324.803
[2955]	training's rmse: 260.22	valid_1's rmse: 324.802
[2956]	training's rmse: 260.215	valid_1's rmse: 324.813
[2957]	training's rmse: 260.117	valid_1's rmse: 324.646
[2958]	training's rmse: 260.109	valid_1's rmse: 324.648
[2959]	training's rmse: 260.102	valid_1's rmse: 324.645
[2960]	training's rmse: 260.097	valid_1's rmse: 324.656
[2961]	training's rmse: 260.09	valid_1's rmse: 324.659
[2962]	training's rmse: 259.945	valid_1's rmse: 324.

[3092]	training's rmse: 258.054	valid_1's rmse: 322.659
[3093]	training's rmse: 258.046	valid_1's rmse: 322.68
[3094]	training's rmse: 258.038	valid_1's rmse: 322.668
[3095]	training's rmse: 258.033	valid_1's rmse: 322.671
[3096]	training's rmse: 257.902	valid_1's rmse: 322.376
[3097]	training's rmse: 257.888	valid_1's rmse: 322.38
[3098]	training's rmse: 257.88	valid_1's rmse: 322.397
[3099]	training's rmse: 257.871	valid_1's rmse: 322.404
[3100]	training's rmse: 257.867	valid_1's rmse: 322.409
[3101]	training's rmse: 257.848	valid_1's rmse: 322.409
[3102]	training's rmse: 257.846	valid_1's rmse: 322.408
[3103]	training's rmse: 257.803	valid_1's rmse: 322.272
[3104]	training's rmse: 257.801	valid_1's rmse: 322.272
[3105]	training's rmse: 257.792	valid_1's rmse: 322.276
[3106]	training's rmse: 257.783	valid_1's rmse: 322.281
[3107]	training's rmse: 257.778	valid_1's rmse: 322.294
[3108]	training's rmse: 257.74	valid_1's rmse: 322.194
[3109]	training's rmse: 257.734	valid_1's rmse: 322.

[3238]	training's rmse: 255.577	valid_1's rmse: 319.805
[3239]	training's rmse: 255.573	valid_1's rmse: 319.803
[3240]	training's rmse: 255.567	valid_1's rmse: 319.8
[3241]	training's rmse: 255.563	valid_1's rmse: 319.8
[3242]	training's rmse: 255.559	valid_1's rmse: 319.802
[3243]	training's rmse: 255.554	valid_1's rmse: 319.807
[3244]	training's rmse: 255.548	valid_1's rmse: 319.815
[3245]	training's rmse: 255.545	valid_1's rmse: 319.824
[3246]	training's rmse: 255.54	valid_1's rmse: 319.823
[3247]	training's rmse: 255.526	valid_1's rmse: 319.845
[3248]	training's rmse: 255.525	valid_1's rmse: 319.847
[3249]	training's rmse: 255.51	valid_1's rmse: 319.851
[3250]	training's rmse: 255.507	valid_1's rmse: 319.853
[3251]	training's rmse: 255.501	valid_1's rmse: 319.849
[3252]	training's rmse: 255.471	valid_1's rmse: 319.871
[3253]	training's rmse: 255.459	valid_1's rmse: 319.869
[3254]	training's rmse: 255.448	valid_1's rmse: 319.87
[3255]	training's rmse: 255.442	valid_1's rmse: 319.866

[3385]	training's rmse: 253.286	valid_1's rmse: 316.742
[3386]	training's rmse: 253.286	valid_1's rmse: 316.741
[3387]	training's rmse: 253.277	valid_1's rmse: 316.745
[3388]	training's rmse: 253.277	valid_1's rmse: 316.743
[3389]	training's rmse: 253.274	valid_1's rmse: 316.746
[3390]	training's rmse: 253.27	valid_1's rmse: 316.745
[3391]	training's rmse: 253.241	valid_1's rmse: 316.761
[3392]	training's rmse: 253.233	valid_1's rmse: 316.785
[3393]	training's rmse: 253.214	valid_1's rmse: 316.786
[3394]	training's rmse: 253.205	valid_1's rmse: 316.804
[3395]	training's rmse: 253.198	valid_1's rmse: 316.823
[3396]	training's rmse: 253.19	valid_1's rmse: 316.82
[3397]	training's rmse: 253.189	valid_1's rmse: 316.821
[3398]	training's rmse: 253.161	valid_1's rmse: 316.743
[3399]	training's rmse: 253.066	valid_1's rmse: 316.563
[3400]	training's rmse: 253.062	valid_1's rmse: 316.558
[3401]	training's rmse: 253.053	valid_1's rmse: 316.562
[3402]	training's rmse: 253.044	valid_1's rmse: 316

[3532]	training's rmse: 251.053	valid_1's rmse: 314.171
[3533]	training's rmse: 251.048	valid_1's rmse: 314.182
[3534]	training's rmse: 251.039	valid_1's rmse: 314.154
[3535]	training's rmse: 251.002	valid_1's rmse: 314.051
[3536]	training's rmse: 250.997	valid_1's rmse: 314.044
[3537]	training's rmse: 250.989	valid_1's rmse: 314.07
[3538]	training's rmse: 250.983	valid_1's rmse: 314.06
[3539]	training's rmse: 250.982	valid_1's rmse: 314.06
[3540]	training's rmse: 250.976	valid_1's rmse: 314.065
[3541]	training's rmse: 250.972	valid_1's rmse: 314.058
[3542]	training's rmse: 250.965	valid_1's rmse: 314.048
[3543]	training's rmse: 250.958	valid_1's rmse: 314.049
[3544]	training's rmse: 250.926	valid_1's rmse: 314.072
[3545]	training's rmse: 250.924	valid_1's rmse: 314.073
[3546]	training's rmse: 250.919	valid_1's rmse: 314.078
[3547]	training's rmse: 250.89	valid_1's rmse: 313.985
[3548]	training's rmse: 250.863	valid_1's rmse: 313.951
[3549]	training's rmse: 250.753	valid_1's rmse: 313.

[3678]	training's rmse: 249.102	valid_1's rmse: 312.26
[3679]	training's rmse: 249.096	valid_1's rmse: 312.253
[3680]	training's rmse: 249.091	valid_1's rmse: 312.255
[3681]	training's rmse: 249.085	valid_1's rmse: 312.243
[3682]	training's rmse: 249.084	valid_1's rmse: 312.241
[3683]	training's rmse: 249.08	valid_1's rmse: 312.241
[3684]	training's rmse: 249.076	valid_1's rmse: 312.22
[3685]	training's rmse: 249.066	valid_1's rmse: 312.228
[3686]	training's rmse: 249.059	valid_1's rmse: 312.229
[3687]	training's rmse: 249.046	valid_1's rmse: 312.242
[3688]	training's rmse: 248.97	valid_1's rmse: 311.857
[3689]	training's rmse: 248.96	valid_1's rmse: 311.858
[3690]	training's rmse: 248.948	valid_1's rmse: 311.86
[3691]	training's rmse: 248.937	valid_1's rmse: 311.856
[3692]	training's rmse: 248.927	valid_1's rmse: 311.86
[3693]	training's rmse: 248.924	valid_1's rmse: 311.865
[3694]	training's rmse: 248.919	valid_1's rmse: 311.87
[3695]	training's rmse: 248.914	valid_1's rmse: 311.863


[3825]	training's rmse: 246.889	valid_1's rmse: 309.155
[3826]	training's rmse: 246.855	valid_1's rmse: 309.074
[3827]	training's rmse: 246.85	valid_1's rmse: 309.073
[3828]	training's rmse: 246.843	valid_1's rmse: 309.075
[3829]	training's rmse: 246.812	valid_1's rmse: 308.988
[3830]	training's rmse: 246.809	valid_1's rmse: 308.993
[3831]	training's rmse: 246.802	valid_1's rmse: 308.993
[3832]	training's rmse: 246.796	valid_1's rmse: 308.993
[3833]	training's rmse: 246.794	valid_1's rmse: 308.99
[3834]	training's rmse: 246.788	valid_1's rmse: 308.992
[3835]	training's rmse: 246.778	valid_1's rmse: 308.991
[3836]	training's rmse: 246.751	valid_1's rmse: 308.924
[3837]	training's rmse: 246.672	valid_1's rmse: 308.737
[3838]	training's rmse: 246.645	valid_1's rmse: 308.756
[3839]	training's rmse: 246.641	valid_1's rmse: 308.762
[3840]	training's rmse: 246.636	valid_1's rmse: 308.761
[3841]	training's rmse: 246.629	valid_1's rmse: 308.753
[3842]	training's rmse: 246.618	valid_1's rmse: 30

[3972]	training's rmse: 244.871	valid_1's rmse: 306.877
[3973]	training's rmse: 244.846	valid_1's rmse: 306.773
[3974]	training's rmse: 244.842	valid_1's rmse: 306.768
[3975]	training's rmse: 244.836	valid_1's rmse: 306.765
[3976]	training's rmse: 244.83	valid_1's rmse: 306.766
[3977]	training's rmse: 244.827	valid_1's rmse: 306.763
[3978]	training's rmse: 244.75	valid_1's rmse: 306.638
[3979]	training's rmse: 244.729	valid_1's rmse: 306.617
[3980]	training's rmse: 244.721	valid_1's rmse: 306.615
[3981]	training's rmse: 244.717	valid_1's rmse: 306.617
[3982]	training's rmse: 244.715	valid_1's rmse: 306.62
[3983]	training's rmse: 244.705	valid_1's rmse: 306.623
[3984]	training's rmse: 244.664	valid_1's rmse: 306.511
[3985]	training's rmse: 244.653	valid_1's rmse: 306.517
[3986]	training's rmse: 244.584	valid_1's rmse: 306.421
[3987]	training's rmse: 244.582	valid_1's rmse: 306.423
[3988]	training's rmse: 244.578	valid_1's rmse: 306.431
[3989]	training's rmse: 244.57	valid_1's rmse: 306.

9.740958452224731

In [46]:
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='rmse',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)#定义LightGBM模型
model.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)], early_stopping_rounds=100,
        eval_metric='rmse',
        # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
        categorical_feature=cate_feat,
        #sample_weight=data.loc[train_idx]['sample_weight'],
        verbose=100)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 855.934
[200]	valid_0's rmse: 808.956
[300]	valid_0's rmse: 775.974
[400]	valid_0's rmse: 747.833
[500]	valid_0's rmse: 723.967
[600]	valid_0's rmse: 708.145
[700]	valid_0's rmse: 697.435
[800]	valid_0's rmse: 687.287
[900]	valid_0's rmse: 677.249
[1000]	valid_0's rmse: 670.726
[1100]	valid_0's rmse: 664.16
[1200]	valid_0's rmse: 659.023
[1300]	valid_0's rmse: 654.743
[1400]	valid_0's rmse: 650.377
[1500]	valid_0's rmse: 647.241
[1600]	valid_0's rmse: 643.659
[1700]	valid_0's rmse: 641.159
[1800]	valid_0's rmse: 639.319
[1900]	valid_0's rmse: 637.346
[2000]	valid_0's rmse: 635.804
[2100]	valid_0's rmse: 633.931
[2200]	valid_0's rmse: 632.324
[2300]	valid_0's rmse: 630.803
[2400]	valid_0's rmse: 629.636
[2500]	valid_0's rmse: 628.666
[2600]	valid_0's rmse: 627.659
[2700]	valid_0's rmse: 627.142
[2800]	valid_0's rmse: 626.554
[2900]	valid_0's rmse: 625.456
[3000]	valid_0's rmse: 624.923
[3100]	valid_0's 

LGBMRegressor(bagging_fraction=0.75, bagging_freq=5, bagging_seed=7,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.2, feature_fraction_seed=7,
              importance_type='split', learning_rate=0.01, max_bin=400,
              max_depth=-1, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
              n_jobs=-1, num_leaves=4, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
              verbose=2)

In [47]:
Y_test = model.predict(X_test)
submission = pd.DataFrame({
    "id": submit_example['id'], 
    "forecastVolum": Y_test.round().astype(int)
})
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: 0 if x < 0 else x)#<0的结果变为0
submission.to_csv('rough_feature_engineer_popularity.csv', index=False)

In [174]:
from sklearn.metrics import mean_squared_error as mse
def get_predict_w(model, data, label='label', feature=[], cate_feature=[], random_state=2018, n_splits=5,
                  model_type='lgb'):
    if 'sample_weight' not in data.keys():
        data['sample_weight'] = 1
    model.random_state = random_state
    predict_label = 'predict_' + label#设定需要预测量的标签
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)#K折验证
    data[predict_label] = 0#设定预测量的初始值
    test_index = data[label].isnull()#(data[label].isnull()) | (data[label] == -1)#获得测试集索引，测试集的label都为NaN，所以利用data[label].isnull()实现索引
    train_data = data[~test_index].reset_index(drop=True)#获得训练集数据
    test_data = data[test_index]#获得测试集

    for train_idx, val_idx in kfold.split(train_data):#对训练集的K折分别训练
        model.random_state = model.random_state + 1
        #划分训练集
        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label]
        #划分测试集
        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label]
        if model_type == 'lgb':#LightGBM
            try:
                model.fit(train_x, train_y, eval_set=[(train_x, train_y),(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
            except:
                model.fit(train_x, train_y, eval_set=[(train_x, train_y),(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          # categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
        elif model_type == 'ctb':#CatBoost
            model.fit(train_x, train_y, eval_set=[(train_x, train_y),(test_x, test_y)], early_stopping_rounds=100,
                      # eval_metric='mae',
                      # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                      cat_features=cate_feature,
                      sample_weight=train_data.loc[train_idx]['sample_weight'],
                      verbose=100)
        train_data.loc[val_idx, predict_label] = model.predict(test_x)#预测交叉验证中的验证集，获得最终结果，用于后续loss结果评估
        if len(test_data) != 0:
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])#叠加
    test_data[predict_label] = test_data[predict_label] / n_splits#平均获得最终结果
    print(mse(train_data[label], train_data[predict_label]) * 5, train_data[predict_label].mean(),
          test_data[predict_label].mean())

    return pd.concat([train_data, test_data], sort=True, ignore_index=True), predict_label

In [175]:
features=['adcode', 'bodyType', 'model', 'regMonth', 'regYear',
       'popularity', 'carCommentVolum', 'newsReplyVolum', 'label',
       'province_rank', 'label_last_3', 'popularity_last_3',
       'carCommentVolum_last_3', 'newsReplyVolum_last_3']
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)#定义LightGBM模型
data, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)#训练获得结果data和label

data['lgb'] = data[predict_label]#重命名结果标签

data['forecastVolum'] = data['lgb'].apply(lambda x: 0 if x < 0 else x)#<0的结果变为0
data[data.label.isnull()][['id', 'forecastVolum']].round().astype(int).to_csv('sales_feature_engineer_before.csv', index=False)#输出

Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 21.2646	valid_1's l1: 22.2191
[200]	training's l1: 12.4492	valid_1's l1: 14.3151
[300]	training's l1: 9.91448	valid_1's l1: 12.2696
[400]	training's l1: 8.69165	valid_1's l1: 11.4142
[500]	training's l1: 7.87017	valid_1's l1: 10.9206
[600]	training's l1: 7.27904	valid_1's l1: 10.7228
[700]	training's l1: 6.8195	valid_1's l1: 10.5077
[800]	training's l1: 6.43724	valid_1's l1: 10.3511
[900]	training's l1: 6.09363	valid_1's l1: 10.245
[1000]	training's l1: 5.80088	valid_1's l1: 10.1306
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 5.80088	valid_1's l1: 10.1306
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 26.688	valid_1's l1: 28.3207
[200]	training's l1: 15.1622	valid_1's l1: 17.3678
[300]	training's l1: 11.6652	valid_1's l1: 14.3778
[400]	training's l1: 9.96512	valid_1's l1: 13.1876
[500]	training's l1: 8.97754	valid_1's l1: 12.6352
[600]	train

In [177]:
yulao1=pd.read_csv('ccf_car_sales.csv')
yulao2=pd.read_csv('ccf_car_sales_lgb.csv')
submission=pd.read_csv('sales_feature_engineer_before.csv')
#submission['forecastVolum']=((submission['forecastVolum']+yulao1['forecastVolum']+yulao2['forecastVolum'])/3.0).round().astype(int)
submission['forecastVolum']=((submission['forecastVolum']+yulao1['forecastVolum'])/2.0)#.round().astype(int)
submission.to_csv('stacking.csv', index=False)

In [67]:
submission=data[data.label.isnull()][['id', 'forecastVolum']]
submission['forecastVolum']

31680     26.461623
31681     26.450508
31682     26.477490
31683     26.830697
31684     29.571585
31685     26.391545
31686     31.087334
31687     26.470541
31688     36.728055
31689     26.467706
31690     33.460401
31691     26.517357
31692     29.705451
31693     29.947559
31694     33.262316
31695     27.590337
31696     27.208895
31697     26.517478
31698     27.636138
31699     26.441806
31700     26.519591
31701     26.463586
31702     29.714904
31703     30.417134
31704     29.793902
31705     33.797794
31706     33.770110
31707     35.297797
31708     50.179980
31709     29.939235
31710    408.531911
31711     35.670360
31712     39.191038
31713     31.370488
31714     38.000381
31715     34.340591
31716     36.823359
31717     35.640947
31718     35.300148
31719     35.147526
31720     35.352189
31721     28.990860
31722     33.910166
31723     35.341533
31724     26.406145
31725     26.417899
31726     26.416789
31727     26.438290
31728     29.253139
31729     26.386574
