In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score
from mlxtend.regressor import StackingCVRegressor
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBRegressor
from pandas.tseries.offsets import *
from xiao_utils import months_among, f, f1

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
%matplotlib inline

## 读取数据&数据集说明
1. [训练集]历史销量数据：train_sales_data_v1.csv

|字段名称|	字段类型	|字段说明
|--|--|--|
|province	|String|	省份|
|adcode	int	省份编码
|model|	String|	车型编码|
|bodyType|	String|	车身类型|
|regYear|	int|	年|
|regMonth|	int|	月|
|salesVolume|	int|	销量|

2. [训练集]车型搜索数据：train_search_data_v1.csv

|字段名称|	字段类型	|字段说明
|--|--|--|
|province|	String|	省份|
|adcode|	int|	省份编码|
|model|	String|	车型编码|
|regYear|	int|	年|
|regMonth|	int|	月|
|popularity|	int|	搜索量|

3. [训练集]汽车垂直媒体新闻评论数据和车型评论数据：train_user_reply_data_v1.csv

该数据集包含了垂直媒体中，各车型的每月（不分地域）论坛发帖数据、每月新闻评论数据、车型下的评论数据三部分，这三个数据没有任何包含关系。

|字段名称|	字段类型	|字段说明
|--|--|--|
|model|	String|	车型编码|
|regYear|	int|	年|
|regMonth|	int|	月|
|newsReplyVolum|	int|	对车型相关新闻文章的评论数量|
|carCommentVolum|	int|	对车型的评价数量|

4. [评测集]2018年1月至4月的各车型各省份销量预测：evaluation_public.csv

|字段名称|	字段类型	|字段说明|
|--|--|--|
|id|	int|	数据的唯一标识，不可更改|
|province|	String|	省份|
|adcode|	int|	省份编码改|
|model|	String|	车型编码|
|bodyType|	String|	车身类型|
|regYear|	int|	年|
|regMonth|	int|	月|
|forecastVolum|	int|	预测销量，参赛队伍使用建立的模型得出的销量预测结果|

In [2]:
path  = './ccf_car/'

train_sales  = pd.read_csv(path+'train_sales_data.csv')#历史销量数据
train_search = pd.read_csv(path+'train_search_data.csv')#车型搜索数据
train_user   = pd.read_csv(path+'train_user_reply_data.csv')#汽车垂直媒体新闻评论数据和车型评论数据

evaluation_public = pd.read_csv(path+'evaluation_public.csv')#2018年1月至4月的各车型各省份销量预测
submit_example    = pd.read_csv(path+'submit_example.csv')

In [3]:
data = pd.concat([train_sales, evaluation_public], ignore_index=True)#合并训练集和测试集
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])#将搜索数据与销量数据融合
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])#将评论数据与销量数据、搜索数据融合
data['label'] = data['salesVolume']#训练集的销量->label
data['id'] = data['id'].fillna(0).astype(int)#训练集的数据没有id，全部补零（测试集有id，范围为0~5368），id是最后需要提交的两列数据之一（另一个是forecastVolum）
del data['salesVolume'], data['forecastVolum']#salesVolume->label,forecastVolum目前全为0没有意义，故删去
'''
num_feat = ['adcode', 'regMonth', 'regYear', 'popularity', 'carCommentVolum', 'newsReplyVolum']#number_feature数字特征
cate_feat = ['bodyType', 'model', 'province']#categlory_feature类别特征

for i in cate_feat:
    data[i] = data[i].astype('category')#都转化为类别类型
features = num_feat + cate_feat#所有特征=数字特征+类别特征
'''

"\nnum_feat = ['adcode', 'regMonth', 'regYear', 'popularity', 'carCommentVolum', 'newsReplyVolum']#number_feature数字特征\ncate_feat = ['bodyType', 'model', 'province']#categlory_feature类别特征\n\nfor i in cate_feat:\n    data[i] = data[i].astype('category')#都转化为类别类型\nfeatures = num_feat + cate_feat#所有特征=数字特征+类别特征\n"

In [4]:
# 给省份分类,依据https://www.daas-auto.com/newsDe/892.html
province1=['广东','江苏','山东','浙江','河南']
province2=['河北', '四川', '北京']
province3=['上海', '湖北', '湖南', '安徽']
province4=['辽宁', '云南', '陕西', '福建', '贵州','广西','山西','江西','重庆']
province5=['吉林', '黑龙江', '天津', '内蒙古', '新疆', '甘肃']
province6=['海南','宁夏','青海','西藏']

In [5]:
'''
data['province_rank']='0'
for i in range(len(data)):
    if data['province'][i] in province1:
        data['province_rank'][i]='1'
    elif data['province'][i] in province2:
        data['province_rank'][i]='2'
    elif data['province'][i] in province3:
        data['province_rank'][i]='3'
    elif data['province'][i] in province4:
        data['province_rank'][i]='4'
    elif data['province'][i] in province5:
        data['province_rank'][i]='5'
    elif data['province'][i] in province6:
        data['province_rank'][i]='6'
    else:
        data['province_rank'][i]='0'
data.head()
'''
data_before=pd.read_csv('./rst/data_yulao_rolling.csv')
data['province_rank']=data_before['province_rank']

In [6]:
# 时间分类
'''
data['date_block_num']=0
tmp=pd.array([0]*len(data))
cnts=[]
for year in data['regYear'].unique():
    for month in data[data['regYear']==year]['regMonth'].unique():
        cnts.append(str(year)+'_'+str(month))
for i in range(len(data)):
    data['date_block_num'][i]=cnts.index(str(data['regYear'][i])+'_'+str(data['regMonth'][i]))+1
data.sample(10)
'''
data['date_block_num'] = (data['regYear'] - 2016) * 12 + data['regMonth']

In [7]:
data.date_block_num.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

In [8]:
holiday=pd.DataFrame()
holiday['date_block_num']=pd.Series(range(1,29))
holiday['workdays_cnt']=pd.Series([20,18,23,20,21,21,21,23,21,18,22,22, 19,19,23,19,21,22,21,23,22,17,22,21, 22,17,20,22,20,22,23,21,18,22,21]).astype('int')
holiday['holiday_cnt']=pd.Series([11,11,8,10,10,9,10,8,9,13,8,9 ,12,9,8,11,10,8,10,8,8,14,8,10, 9,11,9,10,9,10,9,8,9,13,8,10]).astype('int')
holiday.head()

Unnamed: 0,date_block_num,workdays_cnt,holiday_cnt
0,1,20,11
1,2,18,11
2,3,23,8
3,4,20,10
4,5,21,10


In [9]:
data=data.merge(holiday,on='date_block_num')

In [10]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num,workdays_cnt,holiday_cnt
0,310000,SUV,0,3c974920a76ac9c1,上海,1,2016,1479.0,11.0,106.0,292.0,3,1,20,11
1,530000,SUV,0,3c974920a76ac9c1,云南,1,2016,1594.0,11.0,106.0,466.0,3,1,20,11
2,150000,SUV,0,3c974920a76ac9c1,内蒙古,1,2016,1479.0,11.0,106.0,257.0,3,1,20,11
3,110000,SUV,0,3c974920a76ac9c1,北京,1,2016,2370.0,11.0,106.0,408.0,3,1,20,11
4,510000,SUV,0,3c974920a76ac9c1,四川,1,2016,3562.0,11.0,106.0,610.0,3,1,20,11


# 初步结果

In [11]:
data.columns

Index(['adcode', 'bodyType', 'id', 'model', 'province', 'regMonth', 'regYear',
       'popularity', 'carCommentVolum', 'newsReplyVolum', 'label',
       'province_rank', 'date_block_num', 'workdays_cnt', 'holiday_cnt'],
      dtype='object')

In [9]:
cate_feat=['adcode','bodyType', 'model', 'province','province_rank']
#data['province_rank'].astype('int')
#print(1)
for i in cate_feat:
    data[i] = data[i].astype('category')#都转化为类别类型

In [10]:
X_train = data[data.date_block_num <= 20].drop(['label'], axis=1)
Y_train = data[data.date_block_num <= 20]['label']
X_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)].drop(['label'], axis=1)
Y_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)]['label']
X_test = data[data.date_block_num >= 25].drop(['label'], axis=1)

In [129]:
ts = time.time()

model = lgb.LGBMRegressor(objective='regression', 
                    metric='rmse',
                    num_leaves=4,
                    learning_rate=0.05, 
                    n_estimators=5000,
                    max_bin=400, 
                    bagging_fraction=0.75,
                    bagging_freq=5, 
                    bagging_seed=7,
                    feature_fraction=0.2,
                    feature_fraction_seed=7,
                    verbose=-1
                    #min_data_in_leaf=2,
                    #min_sum_hessian_in_leaf=11
                         )

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    categorical_feature=cate_feat,
    verbose=True, 
    early_stopping_rounds = 50)

time.time() - ts

[1]	training's rmse: 1.02201	valid_1's rmse: 1.09981
Training until validation scores don't improve for 50 rounds.
[2]	training's rmse: 1.00567	valid_1's rmse: 1.08281
[3]	training's rmse: 0.990541	valid_1's rmse: 1.06755
[4]	training's rmse: 0.978822	valid_1's rmse: 1.05665
[5]	training's rmse: 0.96678	valid_1's rmse: 1.04533
[6]	training's rmse: 0.953135	valid_1's rmse: 1.03155
[7]	training's rmse: 0.947768	valid_1's rmse: 1.02625
[8]	training's rmse: 0.931204	valid_1's rmse: 1.00908
[9]	training's rmse: 0.914899	valid_1's rmse: 0.992673
[10]	training's rmse: 0.91031	valid_1's rmse: 0.988124
[11]	training's rmse: 0.899568	valid_1's rmse: 0.97718
[12]	training's rmse: 0.896003	valid_1's rmse: 0.975142
[13]	training's rmse: 0.883613	valid_1's rmse: 0.962281
[14]	training's rmse: 0.873918	valid_1's rmse: 0.952465
[15]	training's rmse: 0.870923	valid_1's rmse: 0.95038
[16]	training's rmse: 0.859624	valid_1's rmse: 0.938512
[17]	training's rmse: 0.85108	valid_1's rmse: 0.929143
[18]	train

[290]	training's rmse: 0.52267	valid_1's rmse: 0.596198
[291]	training's rmse: 0.522631	valid_1's rmse: 0.595912
[292]	training's rmse: 0.522391	valid_1's rmse: 0.59556
[293]	training's rmse: 0.522292	valid_1's rmse: 0.595442
[294]	training's rmse: 0.522258	valid_1's rmse: 0.595436
[295]	training's rmse: 0.522036	valid_1's rmse: 0.595103
[296]	training's rmse: 0.521838	valid_1's rmse: 0.594896
[297]	training's rmse: 0.521628	valid_1's rmse: 0.59457
[298]	training's rmse: 0.521598	valid_1's rmse: 0.594518
[299]	training's rmse: 0.521523	valid_1's rmse: 0.594388
[300]	training's rmse: 0.521328	valid_1's rmse: 0.594092
[301]	training's rmse: 0.521261	valid_1's rmse: 0.593954
[302]	training's rmse: 0.521228	valid_1's rmse: 0.593748
[303]	training's rmse: 0.521201	valid_1's rmse: 0.593734
[304]	training's rmse: 0.520973	valid_1's rmse: 0.593463
[305]	training's rmse: 0.520771	valid_1's rmse: 0.593279
[306]	training's rmse: 0.520694	valid_1's rmse: 0.593088
[307]	training's rmse: 0.520674	va

[578]	training's rmse: 0.491737	valid_1's rmse: 0.564771
[579]	training's rmse: 0.491617	valid_1's rmse: 0.564622
[580]	training's rmse: 0.491583	valid_1's rmse: 0.564649
[581]	training's rmse: 0.491507	valid_1's rmse: 0.564625
[582]	training's rmse: 0.491368	valid_1's rmse: 0.56443
[583]	training's rmse: 0.490842	valid_1's rmse: 0.563864
[584]	training's rmse: 0.490736	valid_1's rmse: 0.563723
[585]	training's rmse: 0.490691	valid_1's rmse: 0.56357
[586]	training's rmse: 0.490664	valid_1's rmse: 0.563539
[587]	training's rmse: 0.490634	valid_1's rmse: 0.56352
[588]	training's rmse: 0.490335	valid_1's rmse: 0.562746
[589]	training's rmse: 0.490086	valid_1's rmse: 0.562107
[590]	training's rmse: 0.489889	valid_1's rmse: 0.561879
[591]	training's rmse: 0.489872	valid_1's rmse: 0.561854
[592]	training's rmse: 0.489828	valid_1's rmse: 0.561901
[593]	training's rmse: 0.489813	valid_1's rmse: 0.561961
[594]	training's rmse: 0.489803	valid_1's rmse: 0.561984
[595]	training's rmse: 0.489793	va

[866]	training's rmse: 0.468803	valid_1's rmse: 0.547659
[867]	training's rmse: 0.468781	valid_1's rmse: 0.547659
[868]	training's rmse: 0.468747	valid_1's rmse: 0.547633
[869]	training's rmse: 0.468708	valid_1's rmse: 0.547642
[870]	training's rmse: 0.468665	valid_1's rmse: 0.547628
[871]	training's rmse: 0.4686	valid_1's rmse: 0.547667
[872]	training's rmse: 0.468538	valid_1's rmse: 0.547678
[873]	training's rmse: 0.46843	valid_1's rmse: 0.547681
[874]	training's rmse: 0.468402	valid_1's rmse: 0.547709
[875]	training's rmse: 0.468348	valid_1's rmse: 0.547717
[876]	training's rmse: 0.468329	valid_1's rmse: 0.547625
[877]	training's rmse: 0.468313	valid_1's rmse: 0.547602
[878]	training's rmse: 0.468291	valid_1's rmse: 0.547567
[879]	training's rmse: 0.468277	valid_1's rmse: 0.547529
[880]	training's rmse: 0.468266	valid_1's rmse: 0.547502
[881]	training's rmse: 0.468184	valid_1's rmse: 0.547449
[882]	training's rmse: 0.468169	valid_1's rmse: 0.547533
[883]	training's rmse: 0.468123	va

[1151]	training's rmse: 0.451906	valid_1's rmse: 0.535898
[1152]	training's rmse: 0.451889	valid_1's rmse: 0.535873
[1153]	training's rmse: 0.451874	valid_1's rmse: 0.535564
[1154]	training's rmse: 0.451788	valid_1's rmse: 0.53543
[1155]	training's rmse: 0.451762	valid_1's rmse: 0.535379
[1156]	training's rmse: 0.451742	valid_1's rmse: 0.535315
[1157]	training's rmse: 0.451432	valid_1's rmse: 0.535107
[1158]	training's rmse: 0.451412	valid_1's rmse: 0.535095
[1159]	training's rmse: 0.451399	valid_1's rmse: 0.535083
[1160]	training's rmse: 0.451373	valid_1's rmse: 0.535031
[1161]	training's rmse: 0.45137	valid_1's rmse: 0.535002
[1162]	training's rmse: 0.45108	valid_1's rmse: 0.534881
[1163]	training's rmse: 0.451073	valid_1's rmse: 0.534858
[1164]	training's rmse: 0.450679	valid_1's rmse: 0.534539
[1165]	training's rmse: 0.45042	valid_1's rmse: 0.534434
[1166]	training's rmse: 0.45036	valid_1's rmse: 0.534366
[1167]	training's rmse: 0.450346	valid_1's rmse: 0.534363
[1168]	training's r

[1293]	training's rmse: 0.44535	valid_1's rmse: 0.53169
[1294]	training's rmse: 0.445346	valid_1's rmse: 0.531752
[1295]	training's rmse: 0.445313	valid_1's rmse: 0.531766
[1296]	training's rmse: 0.445303	valid_1's rmse: 0.531704
[1297]	training's rmse: 0.4453	valid_1's rmse: 0.531707
[1298]	training's rmse: 0.445214	valid_1's rmse: 0.531635
[1299]	training's rmse: 0.445161	valid_1's rmse: 0.531633
[1300]	training's rmse: 0.445158	valid_1's rmse: 0.531635
[1301]	training's rmse: 0.445121	valid_1's rmse: 0.53142
[1302]	training's rmse: 0.445119	valid_1's rmse: 0.531435
[1303]	training's rmse: 0.4451	valid_1's rmse: 0.531449
[1304]	training's rmse: 0.444901	valid_1's rmse: 0.531347
[1305]	training's rmse: 0.44489	valid_1's rmse: 0.531344
[1306]	training's rmse: 0.444827	valid_1's rmse: 0.531268
[1307]	training's rmse: 0.444771	valid_1's rmse: 0.531199
[1308]	training's rmse: 0.444757	valid_1's rmse: 0.531201
[1309]	training's rmse: 0.444519	valid_1's rmse: 0.530924
[1310]	training's rmse

[1575]	training's rmse: 0.432669	valid_1's rmse: 0.525001
[1576]	training's rmse: 0.432653	valid_1's rmse: 0.524975
[1577]	training's rmse: 0.432636	valid_1's rmse: 0.524943
[1578]	training's rmse: 0.432571	valid_1's rmse: 0.524829
[1579]	training's rmse: 0.432542	valid_1's rmse: 0.524808
[1580]	training's rmse: 0.432049	valid_1's rmse: 0.524371
[1581]	training's rmse: 0.432044	valid_1's rmse: 0.524382
[1582]	training's rmse: 0.432012	valid_1's rmse: 0.524342
[1583]	training's rmse: 0.431968	valid_1's rmse: 0.524376
[1584]	training's rmse: 0.431959	valid_1's rmse: 0.524378
[1585]	training's rmse: 0.431804	valid_1's rmse: 0.52429
[1586]	training's rmse: 0.431778	valid_1's rmse: 0.524289
[1587]	training's rmse: 0.431778	valid_1's rmse: 0.524334
[1588]	training's rmse: 0.431776	valid_1's rmse: 0.52438
[1589]	training's rmse: 0.431731	valid_1's rmse: 0.524436
[1590]	training's rmse: 0.431722	valid_1's rmse: 0.524389
[1591]	training's rmse: 0.431672	valid_1's rmse: 0.524343
[1592]	training'

[1857]	training's rmse: 0.420158	valid_1's rmse: 0.516234
[1858]	training's rmse: 0.420139	valid_1's rmse: 0.51621
[1859]	training's rmse: 0.420113	valid_1's rmse: 0.516235
[1860]	training's rmse: 0.4201	valid_1's rmse: 0.51624
[1861]	training's rmse: 0.420099	valid_1's rmse: 0.516271
[1862]	training's rmse: 0.420091	valid_1's rmse: 0.516257
[1863]	training's rmse: 0.420091	valid_1's rmse: 0.516231
[1864]	training's rmse: 0.419928	valid_1's rmse: 0.516045
[1865]	training's rmse: 0.419906	valid_1's rmse: 0.516081
[1866]	training's rmse: 0.419887	valid_1's rmse: 0.516095
[1867]	training's rmse: 0.419839	valid_1's rmse: 0.516107
[1868]	training's rmse: 0.419836	valid_1's rmse: 0.516103
[1869]	training's rmse: 0.419832	valid_1's rmse: 0.51607
[1870]	training's rmse: 0.41974	valid_1's rmse: 0.516084
[1871]	training's rmse: 0.419739	valid_1's rmse: 0.516084
[1872]	training's rmse: 0.419739	valid_1's rmse: 0.516085
[1873]	training's rmse: 0.419729	valid_1's rmse: 0.516129
[1874]	training's rm

[2139]	training's rmse: 0.410707	valid_1's rmse: 0.512079
[2140]	training's rmse: 0.41069	valid_1's rmse: 0.512042
[2141]	training's rmse: 0.410688	valid_1's rmse: 0.512034
Early stopping, best iteration is:
[2091]	training's rmse: 0.41181	valid_1's rmse: 0.512012


4.696442365646362

In [130]:
Y_test = np.expm1(model.predict(X_test))

In [131]:
submission = pd.DataFrame({
    "id": submit_example['id'], 
    "forecastVolum": Y_test.round().astype(int)
})
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: 10 if x < 0 else x)#<0的结果变为0
submission.to_csv('rough_feature_engineer_log_replace.csv', index=False)

# 进一步构造特征

In [12]:
# adcode与省份一一对应，删去province
data.drop('province',axis=1,inplace=True)

In [13]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num,workdays_cnt,holiday_cnt
0,310000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,292.0,3,1,20,11
1,530000,SUV,0,3c974920a76ac9c1,1,2016,1594.0,11.0,106.0,466.0,3,1,20,11
2,150000,SUV,0,3c974920a76ac9c1,1,2016,1479.0,11.0,106.0,257.0,3,1,20,11
3,110000,SUV,0,3c974920a76ac9c1,1,2016,2370.0,11.0,106.0,408.0,3,1,20,11
4,510000,SUV,0,3c974920a76ac9c1,1,2016,3562.0,11.0,106.0,610.0,3,1,20,11


In [14]:
data.bodyType.unique()

array(['SUV', 'Sedan', 'MPV', 'Hatchback', nan], dtype=object)

In [15]:
SUV_model=data[data.bodyType=='SUV'].model.unique()

In [16]:
Sedan_model=data[data.bodyType=='Sedan'].model.unique()

In [17]:
MPV_model=data[data.bodyType=='MPV'].model.unique()

In [18]:
Hatchback_model=data[data.bodyType=='Hatchback'].model.unique()

上述型号没有交集，将测试集中的NaN补上，逆转上述字典，得到model->bodyType

In [19]:
model2body=dict([(x,'SUV') for x in SUV_model])
model2body=dict(dict([(x,'Sedan') for x in Sedan_model]),**model2body)
model2body=dict(dict([(x,'MPV') for x in MPV_model]),**model2body)
model2body=dict(dict([(x,'Hatchback') for x in Hatchback_model]),**model2body)

In [21]:
ts=time.time()
test_index=data[data['id']!=0].index
for i in test_index:
    data.loc[i,'bodyType']=model2body.get(data.iloc[i]['model'])
time.time()-ts

8.294971704483032

## 补上测试集的缺失

In [30]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','adcode','model',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','adcode','model', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','adcode','model'], how='left')
    return df

In [29]:
data.sample()

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num,class_id
30320,510000,Sedan,0,c6cd4e0e073f5ac2,11,2017,459.0,363.0,1946.0,307.0,2,23,8366


In [30]:
ts = time.time()
data = lag_feature(data, [4,5,6], 'label')
for i in data[data.id!=0].index:
    data.loc[i,'label']=(data.loc[i,'label_lag_4']*0.6+data.loc[i,'label_lag_5']*0.3+data.loc[i,'label_lag_6']*0.1)
data = lag_feature(data, [1,2,3], 'label')
time.time() - ts

2.4693970680236816

In [31]:
# popularity
ts = time.time()
data = lag_feature(data, [4,5,6], 'popularity')
for i in data[data.id!=0].index:
    data.loc[i,'popularity']=(data.loc[i,'popularity_lag_4']*0.6+data.loc[i,'popularity_lag_5']*0.3+data.loc[i,'popularity_lag_6']*0.1)
data = lag_feature(data, [1,2,3], 'popularity')
time.time() - ts

2.435487985610962

In [32]:
# carCommentVolum
ts = time.time()
data = lag_feature(data, [4,5,6], 'carCommentVolum')
for i in data[data.id!=0].index:
    data.loc[i,'carCommentVolum']=(data.loc[i,'carCommentVolum_lag_4']*0.6+data.loc[i,'carCommentVolum_lag_5']*0.3+data.loc[i,'carCommentVolum_lag_6']*0.1)
data = lag_feature(data, [1,2,3], 'carCommentVolum')
time.time() - ts

2.4614169597625732

In [33]:
# newsReplyVolum
ts = time.time()
# 填补测试集
data = lag_feature(data, [4,5,6], 'newsReplyVolum')
for i in data[data.id!=0].index:
    data.loc[i,'newsReplyVolum']=(data.loc[i,'newsReplyVolum_lag_4']*0.6+data.loc[i,'newsReplyVolum_lag_5']*0.3+data.loc[i,'newsReplyVolum_lag_6']*0.1)
# 在所有数据上取均值
data = lag_feature(data, [1,2,3], 'newsReplyVolum')
time.time() - ts

2.4833602905273438

In [23]:
ts = time.time()
month1_index=data[data['date_block_num']==1].index
month2_index=data[data['date_block_num']==2].index
month3_index=data[data['date_block_num']==3].index
for i in month1_index:
    data.loc[i,'label_last_3']=data.loc[i,'label']
    data.loc[i,'popularity_last_3']=data.loc[i,'popularity']
    data.loc[i,'carCommentVolum_last_3']=data.loc[i,'carCommentVolum']
    data.loc[i,'newsReplyVolum_last_3']=data.loc[i,'newsReplyVolum']
for i in month2_index:
    data.loc[i,'label_last_3']=(data.loc[i,'label']+data.loc[i,'label_lag_1'])/2.0
    data.loc[i,'popularity_last_3']=(data.loc[i,'popularity']+data.loc[i,'popularity_lag_1'])/2.0
    data.loc[i,'carCommentVolum_last_3']=(data.loc[i,'carCommentVolum']+data.loc[i,'carCommentVolum_lag_1'])/2.0
    data.loc[i,'newsReplyVolum_last_3']=(data.loc[i,'newsReplyVolum']+data.loc[i,'newsReplyVolum_lag_1'])/2.0
for i in month2_index:
    data.loc[i,'label_last_3']=(data.loc[i,'label']+data.loc[i,'label_lag_1']+data.loc[i,'label_lag_2'])/3.0
    data.loc[i,'popularity_last_3']=(data.loc[i,'popularity']+data.loc[i,'popularity_lag_1']+data.loc[i,'popularity_lag_2'])/3.0
    data.loc[i,'carCommentVolum_last_3']=(data.loc[i,'carCommentVolum']+data.loc[i,'carCommentVolum_lag_1']+data.loc[i,'carCommentVolum_lag_2'])/3.0
    data.loc[i,'newsReplyVolum_last_3']=(data.loc[i,'newsReplyVolum']+data.loc[i,'newsReplyVolum_lag_1']+data.loc[i,'newsReplyVolum_lag_2'])/3.0
other_index=data[(data.date_block_num!=1)&(data.date_block_num!=2)&(data.date_block_num!=3)].index
for i in other_index:
    data.loc[i,'label_last_3']=(data.loc[i,'label_lag_3']+data.loc[i,'label_lag_1']+data.loc[i,'label_lag_2'])/3.0
    data.loc[i,'popularity_last_3']=(data.loc[i,'popularity_lag_3']+data.loc[i,'popularity_lag_1']+data.loc[i,'popularity_lag_2'])/3.0
    data.loc[i,'carCommentVolum_last_3']=(data.loc[i,'carCommentVolum_lag_3']+data.loc[i,'carCommentVolum_lag_1']+data.loc[i,'carCommentVolum_lag_2'])/3.0
    data.loc[i,'newsReplyVolum_last_3']=(data.loc[i,'newsReplyVolum_lag_3']+data.loc[i,'newsReplyVolum_lag_1']+data.loc[i,'newsReplyVolum_lag_2'])/3.0
time.time() - ts

65.83599662780762

In [24]:
data.drop(['label_lag_4', 'label_lag_5',
       'label_lag_6', 'label_lag_1', 'label_lag_2', 'label_lag_3',
       'popularity_lag_4', 'popularity_lag_5', 'popularity_lag_6',
       'popularity_lag_1', 'popularity_lag_2', 'popularity_lag_3',
       'carCommentVolum_lag_4', 'carCommentVolum_lag_5',
       'carCommentVolum_lag_6', 'carCommentVolum_lag_1',
       'carCommentVolum_lag_2', 'carCommentVolum_lag_3',
       'newsReplyVolum_lag_4', 'newsReplyVolum_lag_5', 'newsReplyVolum_lag_6',
       'newsReplyVolum_lag_1', 'newsReplyVolum_lag_2', 'newsReplyVolum_lag_3'],axis=1,inplace=True)

## 滑窗
学习鱼佬：https://nbviewer.jupyter.org/github/bettenW/Automobile-sale-predict/blob/master/sale_quantity_feature.ipynb

In [23]:
# 给所有不同车型、不同省份、不同时间的数据编上class_id
ts = time.time()
data['class_id']=0
class_dict={}
adcode_list=data.adcode.unique()
model_list=data.model.unique()
cnt=0
for adcode in adcode_list:
    for model in model_list:
        cnt+=1
        class_dict[str(adcode)+'_'+str(model)]=cnt
for i in data.index:
    data.loc[i,'class_id']=class_dict.get(str(data.loc[i,'adcode'])+'_'+str(data.loc[i,'model'])) 
class_id=data.class_id.unique()
time.time() - ts

50.80874037742615

In [24]:
def rolling(df,func,window):
    #last=roll(df,window=window).tolist()
    if func=='sum':
        last=df.rolling(window).sum().tolist()
    elif func=='mean':
        last=df.rolling(window).mean().tolist()
    elif func=='std':
        last=df.rolling(window).std().tolist()
    else:
        print('WRONG')
    last=[np.NaN]+last
    last.pop()
    return last

In [25]:
ts = time.time()
new = pd.DataFrame()
for a in class_id:
    df = data[data['class_id'] == a]
    df['last_1_sum'] = rolling(df['label'], 'sum', window=1)
    df['last_2_sum'] = rolling(df['label'], 'sum', window=2)
    df['last_3_sum'] = rolling(df['label'], 'sum', window=3)
    df['last_4_sum'] = rolling(df['label'], 'sum', window=4)
    df['last_5_sum'] = rolling(df['label'], 'sum', window=5)
    df['last_6_sum'] = rolling(df['label'], 'sum', window=6)
    df['last_7_sum'] = rolling(df['label'], 'sum', window=7)
    df['last_8_sum'] = rolling(df['label'], 'sum', window=8)
    df['last_9_sum'] = rolling(df['label'], 'sum', window=9)
    df['last_10_sum'] = rolling(df['label'], 'sum', window=10)
    df['last_11_sum'] = rolling(df['label'], 'sum', window=11)
    df['last_12_sum'] = rolling(df['label'], 'sum', window=12)
    df['last_13_sum'] = rolling(df['label'], 'sum', window=13)
    
    df['last_5_mean'] = rolling(df['label'], 'mean', window=5)
    
    df['last_5_std'] = rolling(df['label'], 'std', window=5)
    
    df['move_1'] = df['last_1_sum']
    df['move_2'] = df['last_2_sum'] - df['last_1_sum']
    df['move_3'] = df['last_3_sum'] - df['last_2_sum']
    df['move_4'] = df['last_4_sum'] - df['last_3_sum']
    df['move_5'] = df['last_5_sum'] - df['last_4_sum']
    
    df['move_9'] = df['last_9_sum'] - df['last_8_sum']
    
    df['move_11'] = df['last_11_sum'] - df['last_10_sum']
    df['move_12'] = df['last_12_sum'] - df['last_11_sum']
    df['move_13'] = df['last_13_sum'] - df['last_12_sum']
    
    
    
    #df['diff_12'] = df['move_11'] - df['move_12']
    #df['diff_13'] = df['move_11'] - df['move_13']
    
    #对于12月份的滑动值特殊处理,因为不存才201711的特征
    #diff = df['move_11'][df['sale_month']==12] - df['move_13'][df['sale_month']==12]
    #df['diff_12'][df['sale_month']==12] = diff
    #move = df['last_11_sum'][df['sale_month']==12] - df['last_10_sum'][df['sale_month']==12]
    #df['move_12'][df['sale_month']==12] = move
    
    new = pd.concat([new, df])
time.time() - ts

KeyboardInterrupt: 

In [None]:
data=new.copy()
del new

In [None]:
data.columns

In [95]:
data.drop(['label_lag_4', 'label_lag_5',
       'label_lag_6', 'label_lag_1', 'label_lag_2', 'label_lag_3',
       'popularity_lag_4', 'popularity_lag_5', 'popularity_lag_6',
       'popularity_lag_1', 'popularity_lag_2', 'popularity_lag_3',
       'carCommentVolum_lag_4', 'carCommentVolum_lag_5',
       'carCommentVolum_lag_6', 'carCommentVolum_lag_1',
       'carCommentVolum_lag_2', 'carCommentVolum_lag_3',
       'newsReplyVolum_lag_4', 'newsReplyVolum_lag_5', 'newsReplyVolum_lag_6',
       'newsReplyVolum_lag_1', 'newsReplyVolum_lag_2', 'newsReplyVolum_lag_3',
       'last_1_sum', 'last_2_sum', 'last_3_sum', 'last_4_sum', 'last_5_sum',
       'last_6_sum', 'last_7_sum', 'last_8_sum', 'last_9_sum', 'last_10_sum',
       'last_11_sum', 'last_12_sum', 'last_13_sum'],axis=1,inplace=True)

KeyError: "['label_lag_4' 'label_lag_5' 'label_lag_6' 'label_lag_1' 'label_lag_2'\n 'label_lag_3' 'popularity_lag_4' 'popularity_lag_5' 'popularity_lag_6'\n 'popularity_lag_1' 'popularity_lag_2' 'popularity_lag_3'\n 'carCommentVolum_lag_4' 'carCommentVolum_lag_5' 'carCommentVolum_lag_6'\n 'carCommentVolum_lag_1' 'carCommentVolum_lag_2' 'carCommentVolum_lag_3'\n 'newsReplyVolum_lag_4' 'newsReplyVolum_lag_5' 'newsReplyVolum_lag_6'\n 'newsReplyVolum_lag_1' 'newsReplyVolum_lag_2' 'newsReplyVolum_lag_3'\n 'last_1_sum' 'last_2_sum' 'last_3_sum' 'last_4_sum' 'last_5_sum'\n 'last_6_sum' 'last_7_sum' 'last_8_sum' 'last_9_sum' 'last_10_sum'\n 'last_11_sum' 'last_12_sum' 'last_13_sum'] not found in axis"

In [97]:
data.columns

Index(['adcode', 'bodyType', 'id', 'model', 'regMonth', 'regYear',
       'popularity', 'carCommentVolum', 'newsReplyVolum', 'label',
       'province_rank', 'date_block_num', 'class_id', 'last_5_mean',
       'last_5_std', 'move_1', 'move_2', 'move_3', 'move_4', 'move_5'],
      dtype='object')

In [99]:
data.sample(30)

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,province_rank,date_block_num,class_id,last_5_mean,last_5_std,move_1,move_2,move_3,move_4,move_5
19490,330000,MPV,0,79de4e4b24c35b04,3,2016,7844.0,120.0,3704.0,867.0,1,3,872,,,626.0,1860.0,,,
15355,610000,SUV,0,5d7fb682edd0f937,12,2016,737.0,135.0,124.0,145.0,4,12,1225,173.0,27.267196,135.0,172.0,192.0,205.0,161.0
8639,310000,Sedan,0,28e29f2c03dcd84c,4,2017,487.0,231.0,478.0,667.0,3,16,15,829.6,307.125056,764.0,433.0,851.0,1293.0,807.0
14485,360000,Sedan,0,02aab221aabc03b9,10,2016,247.0,649.0,4699.0,164.0,4,10,684,199.2,28.960318,232.0,224.0,198.0,165.0,177.0
8977,130000,Sedan,0,28e29f2c03dcd84c,6,2017,869.0,176.0,994.0,1539.0,2,18,735,1403.8,306.927842,1631.0,1523.0,1377.0,884.0,1604.0
31641,440000,SUV,0,54fc07138d70374c,2,2016,6838.0,10.0,70.0,437.0,1,2,532,,,1965.0,,,,
28660,360000,Sedan,0,06880909932890ca,5,2017,585.0,178.0,368.0,326.0,4,17,707,389.4,245.235193,274.0,211.0,157.0,614.0,691.0
27202,110000,SUV,0,5b1c11c3efed5312,3,2017,620.0,19.0,32.0,51.0,2,15,225,161.2,129.329811,41.0,71.0,370.0,181.0,143.0
24540,210000,SUV,0,cc21c7e91a3b5a0c,1,2017,1682.0,114.0,677.0,285.0,4,13,1120,468.6,81.08514,602.0,480.0,390.0,437.0,434.0
23259,430000,Sedan,0,0797526c057dcf5b,8,2017,1667.0,793.0,3667.0,1858.0,3,20,998,1392.6,294.270794,1786.0,1476.0,1503.0,1093.0,1105.0


In [27]:
tmp=pd.read_csv('./rst/data_yulao_rolling.csv')
data['last_5_mean']=tmp['last_5_mean']
data['last_5_std']=tmp['last_5_std']
data['move_1']=tmp['move_1']
data['move_2']=tmp['move_2']
data['move_3']=tmp['move_3']
data['move_4']=tmp['move_4']
data['move_5']=tmp['move_5']

   ## 差分

In [None]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','adcode','model',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','adcode','model', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','adcode','model'], how='left')
    return df

In [31]:
data=lag_feature(data,[1,2,3,12,24],'label')

In [40]:
# 一阶差分，一阶比值
data['cha1_1']=data['label']-data['label_lag_1']
data['bi1_1' ]=data['label']/data['label_lag_1']
data['cha1_2']=data['label_lag_1']-data['label_lag_2']
data['bi1_2' ]=data['label_lag_1']/data['label_lag_2']
# 二阶差分
data['cha2_1']=data['cha1_2']-data['cha1_1']
# 二阶比值
data['bi2_1']=data['bi1_2']-data['bi1_1']
# 比值的差分
data['cha1_bi_1']=data['bi1_2' ]-data['bi1_1' ]
# 差分的比值
data['bi1_cha_1']=data['cha1_2' ]/data['cha1_1' ]
# 相邻年，一阶差分，一阶比值
data['cha1_1_y']=data['label']-data['label_lag_12']
data['bi1_1_y' ]=data['label']/data['label_lag_12']
data['cha1_2_y']=data['label_lag_12']-data['label_lag_24']
data['bi1_2_y' ]=data['label_lag_12']/data['label_lag_24']
# 相邻年，二阶差分
data['cha2_1_y']=data['cha1_2_y']-data['cha1_1_y']
# 相邻年，二阶比值
data['bi2_1_y' ]=data['bi1_2_y' ]/data['bi1_1_y' ]
# 相邻年，比值的差分
data['cha1_bi1_1_y']=data['bi1_2_y' ]-data['bi1_1_y' ]
# 相邻年，差分的比值
data['bi1_cha1_1_y']=data['cha1_2_y']/data['cha1_1_y']

In [42]:
data.to_csv('./rst/data_yulao_rolling_chabi.csv',index=False)

# 训练

### LabelEncoder

In [46]:
data.drop(['label_lag_1','label_lag_2','label_lag_3','label_lag_12','label_lag_24'],axis=1,inplace=True)

In [68]:
ts=time.time()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data_le=data.copy()
for i in cate_feat:
    data_le[i]=le.fit_transform(data_le[i])
time.time()-ts

0.0209503173828125

In [70]:
X_train = data_le[data_le.date_block_num <= 20].drop(['label'], axis=1)
Y_train = data_le[data_le.date_block_num <= 20]['label']
X_valid = data_le[(data_le.date_block_num > 20) & (data_le.date_block_num <25)].drop(['label'], axis=1)
Y_valid = data_le[(data_le.date_block_num > 20) & (data_le.date_block_num <25)]['label']
X_train_set = data_le[data_le.date_block_num < 25].drop(['label'], axis=1)
Y_train_set = data_le[data_le.date_block_num < 25]['label']
X_test = data_le[data_le.date_block_num >= 25].drop(['label'], axis=1)

In [108]:
X_test=X_test.sort_index(by=['id'],ascending=True)

### category

In [51]:
cate_feat=['adcode','bodyType', 'model','province_rank','regMonth','regYear','class_id']#,'date_block_num']
#data['province_rank'].astype('int')
#print(1)
for i in cate_feat:
    data[i] = data[i].astype('category')#都转化为类别类型
features=['popularity', 'carCommentVolum', 'newsReplyVolum', 'workdays_cnt', 
       'holiday_cnt', 'last_5_mean', 'last_5_std', 'move_1', 'move_2', 'move_3',
       'move_4', 'move_5', 'cha1_1', 'bi1_1', 'cha1_2', 'bi1_2', 'cha2_1',
       'bi2_1', 'cha1_bi_1', 'bi1_cha_1', 'cha1_1_y', 'bi1_1_y', 'cha1_2_y',
       'bi1_2_y', 'cha2_1_y', 'bi2_1_y', 'cha1_bi1_1_y', 'bi1_cha1_1_y']

In [52]:
X_train = data[data.date_block_num <= 20][features]
Y_train = data[data.date_block_num <= 20]['label']
X_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)][features]
Y_valid = data[(data.date_block_num > 20) & (data.date_block_num <25)]['label']
X_train_set = data[data.date_block_num < 25][features]
Y_train_set = data[data.date_block_num < 25]['label']
X_test = data[data.date_block_num >= 25][features]

In [53]:
data['date_block_num'].astype('category')
print(1)

1


## 初级

In [170]:
ts = time.time()

model = lgb.LGBMRegressor(objective='regression', metric='rmse',
                    num_leaves=4,learning_rate=0.01, n_estimators=5000,
                    max_bin=400, bagging_fraction=0.75,bagging_freq=5, 
                    bagging_seed=7,feature_fraction=0.2,feature_fraction_seed=7,
                    verbose=2
                    #min_data_in_leaf=2,
                    #min_sum_hessian_in_leaf=11
                    )
model.fit(X_train, Y_train, eval_metric="rmse", 
          eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
          categorical_feature=cate_feat,verbose=True, 
          early_stopping_rounds = 50)
time.time() - ts

[1]	training's rmse: 745.725	valid_1's rmse: 921.781
Training until validation scores don't improve for 50 rounds.
[2]	training's rmse: 744.366	valid_1's rmse: 920.372
[3]	training's rmse: 744.088	valid_1's rmse: 920.083
[4]	training's rmse: 743.767	valid_1's rmse: 919.54
[5]	training's rmse: 738.576	valid_1's rmse: 913.706
[6]	training's rmse: 737.288	valid_1's rmse: 912.002
[7]	training's rmse: 735.617	valid_1's rmse: 910.41
[8]	training's rmse: 734.216	valid_1's rmse: 909.151
[9]	training's rmse: 733.951	valid_1's rmse: 908.959
[10]	training's rmse: 732.795	valid_1's rmse: 907.891
[11]	training's rmse: 730.956	valid_1's rmse: 906.035
[12]	training's rmse: 729.214	valid_1's rmse: 903.45
[13]	training's rmse: 727.536	valid_1's rmse: 901.662
[14]	training's rmse: 726.413	valid_1's rmse: 900.84
[15]	training's rmse: 724.762	valid_1's rmse: 899.079
[16]	training's rmse: 723.143	valid_1's rmse: 897.35
[17]	training's rmse: 722.535	valid_1's rmse: 896.924
[18]	training's rmse: 721.392	vali

[300]	training's rmse: 456.409	valid_1's rmse: 597.014
[301]	training's rmse: 456.284	valid_1's rmse: 597.055
[302]	training's rmse: 456.216	valid_1's rmse: 596.996
[303]	training's rmse: 456.135	valid_1's rmse: 596.899
[304]	training's rmse: 455.914	valid_1's rmse: 596.604
[305]	training's rmse: 455.759	valid_1's rmse: 596.402
[306]	training's rmse: 455.553	valid_1's rmse: 596.117
[307]	training's rmse: 455.13	valid_1's rmse: 595.599
[308]	training's rmse: 454.942	valid_1's rmse: 595.626
[309]	training's rmse: 454.718	valid_1's rmse: 595.373
[310]	training's rmse: 452.875	valid_1's rmse: 592.941
[311]	training's rmse: 452.754	valid_1's rmse: 592.982
[312]	training's rmse: 451.083	valid_1's rmse: 590.727
[313]	training's rmse: 451.016	valid_1's rmse: 590.811
[314]	training's rmse: 450.759	valid_1's rmse: 590.503
[315]	training's rmse: 450.364	valid_1's rmse: 590.023
[316]	training's rmse: 450.247	valid_1's rmse: 590.055
[317]	training's rmse: 449.82	valid_1's rmse: 589.224
[318]	traini

[598]	training's rmse: 375.048	valid_1's rmse: 484.532
[599]	training's rmse: 374.217	valid_1's rmse: 483.902
[600]	training's rmse: 374.193	valid_1's rmse: 483.879
[601]	training's rmse: 374.11	valid_1's rmse: 483.878
[602]	training's rmse: 374.075	valid_1's rmse: 483.958
[603]	training's rmse: 374.052	valid_1's rmse: 483.925
[604]	training's rmse: 374.025	valid_1's rmse: 483.863
[605]	training's rmse: 373.985	valid_1's rmse: 483.795
[606]	training's rmse: 373.945	valid_1's rmse: 483.676
[607]	training's rmse: 373.622	valid_1's rmse: 483.319
[608]	training's rmse: 373.556	valid_1's rmse: 483.297
[609]	training's rmse: 373.535	valid_1's rmse: 483.27
[610]	training's rmse: 373.484	valid_1's rmse: 483.194
[611]	training's rmse: 373.454	valid_1's rmse: 483.137
[612]	training's rmse: 373.299	valid_1's rmse: 482.954
[613]	training's rmse: 373.232	valid_1's rmse: 482.808
[614]	training's rmse: 373.17	valid_1's rmse: 482.79
[615]	training's rmse: 373.136	valid_1's rmse: 482.87
[616]	training'

[896]	training's rmse: 337.465	valid_1's rmse: 431.685
[897]	training's rmse: 337.413	valid_1's rmse: 431.668
[898]	training's rmse: 337.025	valid_1's rmse: 430.906
[899]	training's rmse: 336.978	valid_1's rmse: 430.97
[900]	training's rmse: 336.955	valid_1's rmse: 430.948
[901]	training's rmse: 336.944	valid_1's rmse: 430.9
[902]	training's rmse: 336.925	valid_1's rmse: 430.901
[903]	training's rmse: 336.9	valid_1's rmse: 430.896
[904]	training's rmse: 336.836	valid_1's rmse: 430.821
[905]	training's rmse: 336.817	valid_1's rmse: 430.806
[906]	training's rmse: 336.796	valid_1's rmse: 430.8
[907]	training's rmse: 336.774	valid_1's rmse: 430.719
[908]	training's rmse: 336.744	valid_1's rmse: 430.623
[909]	training's rmse: 336.714	valid_1's rmse: 430.493
[910]	training's rmse: 336.192	valid_1's rmse: 429.381
[911]	training's rmse: 336.178	valid_1's rmse: 429.374
[912]	training's rmse: 335.781	valid_1's rmse: 428.585
[913]	training's rmse: 335.766	valid_1's rmse: 428.561
[914]	training's 

[1190]	training's rmse: 316.657	valid_1's rmse: 401.338
[1191]	training's rmse: 316.646	valid_1's rmse: 401.343
[1192]	training's rmse: 316.619	valid_1's rmse: 401.23
[1193]	training's rmse: 316.607	valid_1's rmse: 401.23
[1194]	training's rmse: 316.361	valid_1's rmse: 400.757
[1195]	training's rmse: 316.347	valid_1's rmse: 400.763
[1196]	training's rmse: 316.111	valid_1's rmse: 400.288
[1197]	training's rmse: 315.889	valid_1's rmse: 399.798
[1198]	training's rmse: 315.852	valid_1's rmse: 399.775
[1199]	training's rmse: 315.822	valid_1's rmse: 399.743
[1200]	training's rmse: 315.797	valid_1's rmse: 399.775
[1201]	training's rmse: 315.787	valid_1's rmse: 399.777
[1202]	training's rmse: 315.764	valid_1's rmse: 399.75
[1203]	training's rmse: 315.703	valid_1's rmse: 399.711
[1204]	training's rmse: 315.686	valid_1's rmse: 399.718
[1205]	training's rmse: 315.674	valid_1's rmse: 399.713
[1206]	training's rmse: 315.623	valid_1's rmse: 399.699
[1207]	training's rmse: 315.616	valid_1's rmse: 399

[1484]	training's rmse: 299.807	valid_1's rmse: 376.15
[1485]	training's rmse: 299.786	valid_1's rmse: 376.159
[1486]	training's rmse: 299.774	valid_1's rmse: 376.154
[1487]	training's rmse: 299.619	valid_1's rmse: 375.781
[1488]	training's rmse: 299.588	valid_1's rmse: 375.796
[1489]	training's rmse: 299.576	valid_1's rmse: 375.787
[1490]	training's rmse: 299.423	valid_1's rmse: 375.423
[1491]	training's rmse: 299.405	valid_1's rmse: 375.437
[1492]	training's rmse: 299.384	valid_1's rmse: 375.46
[1493]	training's rmse: 299.24	valid_1's rmse: 375.12
[1494]	training's rmse: 299.219	valid_1's rmse: 375.08
[1495]	training's rmse: 299.207	valid_1's rmse: 375.083
[1496]	training's rmse: 299.194	valid_1's rmse: 375.08
[1497]	training's rmse: 299.18	valid_1's rmse: 375.045
[1498]	training's rmse: 299.16	valid_1's rmse: 375.015
[1499]	training's rmse: 299.152	valid_1's rmse: 375.013
[1500]	training's rmse: 299.133	valid_1's rmse: 374.983
[1501]	training's rmse: 299.112	valid_1's rmse: 374.974


[1777]	training's rmse: 288.519	valid_1's rmse: 360.594
[1778]	training's rmse: 288.511	valid_1's rmse: 360.595
[1779]	training's rmse: 288.502	valid_1's rmse: 360.589
[1780]	training's rmse: 288.474	valid_1's rmse: 360.542
[1781]	training's rmse: 288.459	valid_1's rmse: 360.542
[1782]	training's rmse: 288.453	valid_1's rmse: 360.533
[1783]	training's rmse: 288.443	valid_1's rmse: 360.527
[1784]	training's rmse: 288.432	valid_1's rmse: 360.524
[1785]	training's rmse: 288.408	valid_1's rmse: 360.53
[1786]	training's rmse: 288.399	valid_1's rmse: 360.532
[1787]	training's rmse: 288.389	valid_1's rmse: 360.534
[1788]	training's rmse: 288.364	valid_1's rmse: 360.543
[1789]	training's rmse: 288.253	valid_1's rmse: 360.287
[1790]	training's rmse: 288.252	valid_1's rmse: 360.292
[1791]	training's rmse: 288.216	valid_1's rmse: 360.286
[1792]	training's rmse: 288.191	valid_1's rmse: 360.294
[1793]	training's rmse: 288.18	valid_1's rmse: 360.306
[1794]	training's rmse: 288.169	valid_1's rmse: 36

[2069]	training's rmse: 279.623	valid_1's rmse: 348.755
[2070]	training's rmse: 279.609	valid_1's rmse: 348.754
[2071]	training's rmse: 279.566	valid_1's rmse: 348.757
[2072]	training's rmse: 279.525	valid_1's rmse: 348.759
[2073]	training's rmse: 279.515	valid_1's rmse: 348.768
[2074]	training's rmse: 279.505	valid_1's rmse: 348.767
[2075]	training's rmse: 279.414	valid_1's rmse: 348.523
[2076]	training's rmse: 279.207	valid_1's rmse: 348.22
[2077]	training's rmse: 279.185	valid_1's rmse: 348.232
[2078]	training's rmse: 279.175	valid_1's rmse: 348.232
[2079]	training's rmse: 279.169	valid_1's rmse: 348.239
[2080]	training's rmse: 279.145	valid_1's rmse: 348.202
[2081]	training's rmse: 279.133	valid_1's rmse: 348.183
[2082]	training's rmse: 279.111	valid_1's rmse: 348.194
[2083]	training's rmse: 279.098	valid_1's rmse: 348.196
[2084]	training's rmse: 279.089	valid_1's rmse: 348.188
[2085]	training's rmse: 278.894	valid_1's rmse: 347.771
[2086]	training's rmse: 278.888	valid_1's rmse: 3

[2361]	training's rmse: 271.783	valid_1's rmse: 338.394
[2362]	training's rmse: 271.741	valid_1's rmse: 338.411
[2363]	training's rmse: 271.729	valid_1's rmse: 338.411
[2364]	training's rmse: 271.716	valid_1's rmse: 338.422
[2365]	training's rmse: 271.661	valid_1's rmse: 338.279
[2366]	training's rmse: 271.646	valid_1's rmse: 338.287
[2367]	training's rmse: 271.626	valid_1's rmse: 338.279
[2368]	training's rmse: 271.557	valid_1's rmse: 338.104
[2369]	training's rmse: 271.544	valid_1's rmse: 338.126
[2370]	training's rmse: 271.526	valid_1's rmse: 338.128
[2371]	training's rmse: 271.516	valid_1's rmse: 338.134
[2372]	training's rmse: 271.49	valid_1's rmse: 338.188
[2373]	training's rmse: 271.466	valid_1's rmse: 338.186
[2374]	training's rmse: 271.46	valid_1's rmse: 338.189
[2375]	training's rmse: 271.452	valid_1's rmse: 338.195
[2376]	training's rmse: 271.451	valid_1's rmse: 338.195
[2377]	training's rmse: 271.444	valid_1's rmse: 338.205
[2378]	training's rmse: 271.434	valid_1's rmse: 33

[2653]	training's rmse: 265.939	valid_1's rmse: 331.224
[2654]	training's rmse: 265.926	valid_1's rmse: 331.214
[2655]	training's rmse: 265.921	valid_1's rmse: 331.213
[2656]	training's rmse: 265.905	valid_1's rmse: 331.221
[2657]	training's rmse: 265.899	valid_1's rmse: 331.225
[2658]	training's rmse: 265.889	valid_1's rmse: 331.209
[2659]	training's rmse: 265.881	valid_1's rmse: 331.213
[2660]	training's rmse: 265.871	valid_1's rmse: 331.23
[2661]	training's rmse: 265.861	valid_1's rmse: 331.231
[2662]	training's rmse: 265.86	valid_1's rmse: 331.231
[2663]	training's rmse: 265.842	valid_1's rmse: 331.24
[2664]	training's rmse: 265.829	valid_1's rmse: 331.247
[2665]	training's rmse: 265.822	valid_1's rmse: 331.248
[2666]	training's rmse: 265.809	valid_1's rmse: 331.251
[2667]	training's rmse: 265.804	valid_1's rmse: 331.246
[2668]	training's rmse: 265.797	valid_1's rmse: 331.249
[2669]	training's rmse: 265.74	valid_1's rmse: 331.096
[2670]	training's rmse: 265.735	valid_1's rmse: 331.

[2945]	training's rmse: 260.332	valid_1's rmse: 324.931
[2946]	training's rmse: 260.326	valid_1's rmse: 324.926
[2947]	training's rmse: 260.319	valid_1's rmse: 324.935
[2948]	training's rmse: 260.315	valid_1's rmse: 324.937
[2949]	training's rmse: 260.308	valid_1's rmse: 324.913
[2950]	training's rmse: 260.295	valid_1's rmse: 324.927
[2951]	training's rmse: 260.286	valid_1's rmse: 324.926
[2952]	training's rmse: 260.245	valid_1's rmse: 324.802
[2953]	training's rmse: 260.236	valid_1's rmse: 324.8
[2954]	training's rmse: 260.226	valid_1's rmse: 324.803
[2955]	training's rmse: 260.22	valid_1's rmse: 324.802
[2956]	training's rmse: 260.215	valid_1's rmse: 324.813
[2957]	training's rmse: 260.117	valid_1's rmse: 324.646
[2958]	training's rmse: 260.109	valid_1's rmse: 324.648
[2959]	training's rmse: 260.102	valid_1's rmse: 324.645
[2960]	training's rmse: 260.097	valid_1's rmse: 324.656
[2961]	training's rmse: 260.09	valid_1's rmse: 324.659
[2962]	training's rmse: 259.945	valid_1's rmse: 324.

[3238]	training's rmse: 255.577	valid_1's rmse: 319.805
[3239]	training's rmse: 255.573	valid_1's rmse: 319.803
[3240]	training's rmse: 255.567	valid_1's rmse: 319.8
[3241]	training's rmse: 255.563	valid_1's rmse: 319.8
[3242]	training's rmse: 255.559	valid_1's rmse: 319.802
[3243]	training's rmse: 255.554	valid_1's rmse: 319.807
[3244]	training's rmse: 255.548	valid_1's rmse: 319.815
[3245]	training's rmse: 255.545	valid_1's rmse: 319.824
[3246]	training's rmse: 255.54	valid_1's rmse: 319.823
[3247]	training's rmse: 255.526	valid_1's rmse: 319.845
[3248]	training's rmse: 255.525	valid_1's rmse: 319.847
[3249]	training's rmse: 255.51	valid_1's rmse: 319.851
[3250]	training's rmse: 255.507	valid_1's rmse: 319.853
[3251]	training's rmse: 255.501	valid_1's rmse: 319.849
[3252]	training's rmse: 255.471	valid_1's rmse: 319.871
[3253]	training's rmse: 255.459	valid_1's rmse: 319.869
[3254]	training's rmse: 255.448	valid_1's rmse: 319.87
[3255]	training's rmse: 255.442	valid_1's rmse: 319.866

[3532]	training's rmse: 251.053	valid_1's rmse: 314.171
[3533]	training's rmse: 251.048	valid_1's rmse: 314.182
[3534]	training's rmse: 251.039	valid_1's rmse: 314.154
[3535]	training's rmse: 251.002	valid_1's rmse: 314.051
[3536]	training's rmse: 250.997	valid_1's rmse: 314.044
[3537]	training's rmse: 250.989	valid_1's rmse: 314.07
[3538]	training's rmse: 250.983	valid_1's rmse: 314.06
[3539]	training's rmse: 250.982	valid_1's rmse: 314.06
[3540]	training's rmse: 250.976	valid_1's rmse: 314.065
[3541]	training's rmse: 250.972	valid_1's rmse: 314.058
[3542]	training's rmse: 250.965	valid_1's rmse: 314.048
[3543]	training's rmse: 250.958	valid_1's rmse: 314.049
[3544]	training's rmse: 250.926	valid_1's rmse: 314.072
[3545]	training's rmse: 250.924	valid_1's rmse: 314.073
[3546]	training's rmse: 250.919	valid_1's rmse: 314.078
[3547]	training's rmse: 250.89	valid_1's rmse: 313.985
[3548]	training's rmse: 250.863	valid_1's rmse: 313.951
[3549]	training's rmse: 250.753	valid_1's rmse: 313.

[3825]	training's rmse: 246.889	valid_1's rmse: 309.155
[3826]	training's rmse: 246.855	valid_1's rmse: 309.074
[3827]	training's rmse: 246.85	valid_1's rmse: 309.073
[3828]	training's rmse: 246.843	valid_1's rmse: 309.075
[3829]	training's rmse: 246.812	valid_1's rmse: 308.988
[3830]	training's rmse: 246.809	valid_1's rmse: 308.993
[3831]	training's rmse: 246.802	valid_1's rmse: 308.993
[3832]	training's rmse: 246.796	valid_1's rmse: 308.993
[3833]	training's rmse: 246.794	valid_1's rmse: 308.99
[3834]	training's rmse: 246.788	valid_1's rmse: 308.992
[3835]	training's rmse: 246.778	valid_1's rmse: 308.991
[3836]	training's rmse: 246.751	valid_1's rmse: 308.924
[3837]	training's rmse: 246.672	valid_1's rmse: 308.737
[3838]	training's rmse: 246.645	valid_1's rmse: 308.756
[3839]	training's rmse: 246.641	valid_1's rmse: 308.762
[3840]	training's rmse: 246.636	valid_1's rmse: 308.761
[3841]	training's rmse: 246.629	valid_1's rmse: 308.753
[3842]	training's rmse: 246.618	valid_1's rmse: 30

9.740958452224731

In [46]:
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='rmse',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)#定义LightGBM模型
model.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)], early_stopping_rounds=100,
        eval_metric='rmse',
        # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
        categorical_feature=cate_feat,
        #sample_weight=data.loc[train_idx]['sample_weight'],
        verbose=100)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 855.934
[200]	valid_0's rmse: 808.956
[300]	valid_0's rmse: 775.974
[400]	valid_0's rmse: 747.833
[500]	valid_0's rmse: 723.967
[600]	valid_0's rmse: 708.145
[700]	valid_0's rmse: 697.435
[800]	valid_0's rmse: 687.287
[900]	valid_0's rmse: 677.249
[1000]	valid_0's rmse: 670.726
[1100]	valid_0's rmse: 664.16
[1200]	valid_0's rmse: 659.023
[1300]	valid_0's rmse: 654.743
[1400]	valid_0's rmse: 650.377
[1500]	valid_0's rmse: 647.241
[1600]	valid_0's rmse: 643.659
[1700]	valid_0's rmse: 641.159
[1800]	valid_0's rmse: 639.319
[1900]	valid_0's rmse: 637.346
[2000]	valid_0's rmse: 635.804
[2100]	valid_0's rmse: 633.931
[2200]	valid_0's rmse: 632.324
[2300]	valid_0's rmse: 630.803
[2400]	valid_0's rmse: 629.636
[2500]	valid_0's rmse: 628.666
[2600]	valid_0's rmse: 627.659
[2700]	valid_0's rmse: 627.142
[2800]	valid_0's rmse: 626.554
[2900]	valid_0's rmse: 625.456
[3000]	valid_0's rmse: 624.923
[3100]	valid_0's 

LGBMRegressor(bagging_fraction=0.75, bagging_freq=5, bagging_seed=7,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.2, feature_fraction_seed=7,
              importance_type='split', learning_rate=0.01, max_bin=400,
              max_depth=-1, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
              n_jobs=-1, num_leaves=4, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
              verbose=2)

In [47]:
Y_test = model.predict(X_test)
submission = pd.DataFrame({
    "id": submit_example['id'], 
    "forecastVolum": Y_test.round().astype(int)
})
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: 0 if x < 0 else x)#<0的结果变为0
submission.to_csv('rough_feature_engineer_popularity.csv', index=False)

## K折

In [54]:
from sklearn.metrics import mean_squared_error as mse
def get_predict_w(model, data, label='label', feature=[], cate_feature=[], random_state=2018, n_splits=5,
                  model_type='lgb'):
    if 'sample_weight' not in data.keys():
        data['sample_weight'] = 1
    model.random_state = random_state
    predict_label = 'predict_' + label#设定需要预测量的标签
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)#K折验证
    data[predict_label] = 0#设定预测量的初始值
    test_index = data[label].isnull()#(data[label].isnull()) | (data[label] == -1)#获得测试集索引，测试集的label都为NaN，所以利用data[label].isnull()实现索引
    train_data = data[~test_index].reset_index(drop=True)#获得训练集数据
    test_data = data[test_index]#获得测试集

    for train_idx, val_idx in kfold.split(train_data):#对训练集的K折分别训练
        model.random_state = model.random_state + 1
        #划分训练集
        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label]
        #划分测试集
        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label]
        if model_type == 'lgb':#LightGBM
            try:
                model.fit(train_x, train_y, eval_set=[(train_x, train_y),(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
            except:
                model.fit(train_x, train_y, eval_set=[(train_x, train_y),(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          # categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
        elif model_type == 'ctb':#CatBoost
            model.fit(train_x, train_y, eval_set=[(train_x, train_y),(test_x, test_y)], early_stopping_rounds=100,
                      # eval_metric='mae',
                      # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                      cat_features=cate_feature,
                      sample_weight=train_data.loc[train_idx]['sample_weight'],
                      verbose=100)
        train_data.loc[val_idx, predict_label] = model.predict(test_x)#预测交叉验证中的验证集，获得最终结果，用于后续loss结果评估
        if len(test_data) != 0:
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])#叠加
    test_data[predict_label] = test_data[predict_label] / n_splits#平均获得最终结果
    print(mse(train_data[label], train_data[predict_label]) * 5, train_data[predict_label].mean(),
          test_data[predict_label].mean())

    return pd.concat([train_data, test_data], sort=True, ignore_index=True), predict_label

In [55]:
'''
features=['adcode', 'bodyType', 'carCommentVolum', 'carCommentVolum_last_3',
       'date_block_num', 'label', 'label_last_3','model', 'newsReplyVolum',
        'newsReplyVolum_last_3', 'popularity','popularity_last_3', 
        'predict_label', 'province_rank', 'regMonth','regYear']
'''
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)#定义LightGBM模型
data, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)#训练获得结果data和label

data['lgb'] = data[predict_label]#重命名结果标签

data['forecastVolum'] = data['lgb'].apply(lambda x: 0 if x < 0 else x)#<0的结果变为0
data[data.id!=0][['id', 'forecastVolum']].round().astype(int).to_csv('holiday_chafen_bifen.csv', index=False)#输出

Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 88.0262	valid_1's l1: 95.4989
[200]	training's l1: 63.5028	valid_1's l1: 74.7696
[300]	training's l1: 54.332	valid_1's l1: 68.8368
[400]	training's l1: 48.4052	valid_1's l1: 65.4118
[500]	training's l1: 44.5596	valid_1's l1: 63.7582
[600]	training's l1: 41.1671	valid_1's l1: 62.3876
[700]	training's l1: 38.3436	valid_1's l1: 61.2033
[800]	training's l1: 35.9465	valid_1's l1: 60.1217
[900]	training's l1: 34.0431	valid_1's l1: 59.5385
[1000]	training's l1: 32.26	valid_1's l1: 58.9341
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 32.26	valid_1's l1: 58.9341
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 91.0708	valid_1's l1: 101.678
[200]	training's l1: 64.2045	valid_1's l1: 78.3854
[300]	training's l1: 54.4149	valid_1's l1: 71.3777
[400]	training's l1: 48.7652	valid_1's l1: 68.499
[500]	training's l1: 44.7417	valid_1's l1: 66.5639
[600]	training

In [37]:
yulao1=pd.read_csv('ccf_car_sales.csv')
yulao2=pd.read_csv('ccf_car_sales_lgb.csv')
submission=pd.read_csv('sales_feature_engineer_mean.csv')
#submission['forecastVolum']=((submission['forecastVolum']+yulao1['forecastVolum']+yulao2['forecastVolum'])/3.0).round().astype(int)
submission['forecastVolum']=((submission['forecastVolum']+yulao1['forecastVolum']+yulao2['forecastVolum'])/3.0)
####################融合结果的极端值剔除-【开始】###########################  
#处理融合后结果中的极端值。把太大的数值（降序排列时，位于顶部往下0.005的数值，就是只有0.005的数比它大）缩小一点（乘以0.77），把太小的数值（降序排列时，位于顶部往下0.99的数值）放大一点（乘以1.1）
q1 = submission['forecastVolum'].quantile(0.005)
q2 = submission['forecastVolum'].quantile(0.995)
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: x if x > q1 else x*0.77)
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: x if x < q2 else x*1.1)
####################融合结果的极端值剔除-【结束】###########################
submission['forecastVolum']=submission['forecastVolum'].round().astype(int)
submission.to_csv('stacking_three.csv', index=False)

## stacking

In [56]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet,Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from mlxtend.regressor import StackingRegressor
import xgboost as xgb
import lightgbm as lgb

In [57]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def xgb_Regressor(train_x, train_y, val_x, val_y, train_X, y):
    xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=1200,
                             reg_alpha=0.1640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
    xgb_model.fit(train_x, train_y)
    pred_val = xgb_model.predict(val_x)
    score = rmsle(val_y, pred_val)
    xgb_model.fit(train_X, y)
    
    return xgb_model, score, pred_val

def lgb_Regressor(train_x, train_y, val_x, val_y, train_X, y):
    lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=360,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 10)
    lgb_model.fit(train_x, train_y)
    pred_val = lgb_model.predict(val_x)
    score = rmsle(val_y, pred_val)
    lgb_model.fit(train_X, y)
    
    return lgb_model, score, pred_val
def base_model():
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

    return ENet,lasso

def gboost_Regressor(train_x, train_y, val_x, val_y, train_X, y):
    gb_model = GradientBoostingRegressor(n_estimators=3600, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=20, min_samples_split=20, 
                    loss='huber', random_state =5)
    gb_model.fit(train_x, train_y)
    pred_val = gb_model.predict(val_x)
    score = rmsle(val_y, pred_val)
    gb_model.fit(train_X, y)
    
    return gb_model, score, pred_val

In [58]:
print("XGBOOSTRegressor开始训练...")
xgb_model, score, xgb_train_pred = xgb_Regressor(X_train, Y_train, X_valid, Y_valid, X_train_set, Y_train_set)
print(score)
xgb_pred = xgb_model.predict(X_test)

XGBOOSTRegressor开始训练...
234.13587165511532


In [76]:
print("LGBMRegressor开始训练...")
lgb_model,score, lgb_train_pred= lgb_Regressor(X_train, Y_train, X_valid, Y_valid, X_train_set, Y_train_set)
print(score)
lgb_pred = lgb_model.predict(X_test)

LGBMRegressor开始训练...
366.51211586506867


In [63]:
X_train_gbdt=X_train.fillna(0).astype('float32')
Y_train_gbdt=Y_train.fillna(0).astype('float32')
X_valid_gbdt=X_valid.fillna(0).astype('float32')
Y_valid_gbdt=Y_valid.fillna(0).astype('float32')
X_train_set_gbdt=X_train_set.fillna(0).astype('float32')
Y_train_set_gbdt=Y_train_set.fillna(0).astype('float32')
X_test_gbdt=X_test.fillna(0).astype('float32')

In [75]:
print("GDBTRegressor开始训练...")
gb_model,score, gb_train_pred = gboost_Regressor(X_train, Y_train, X_valid, Y_valid, X_train_set, Y_train_set)
print( score)
gb_pred = gb_model.predict(X_test)

GDBTRegressor开始训练...


AttributeError: 'GradientBoostingRegressor' object has no attribute 'estimators_'

In [116]:
ENet,lasso= base_model()
    
ENet.fit(X_train_gbdt, Y_train_gbdt)
enet_pred = ENet.predict(X_valid_gbdt)
print("ENet:", rmsle(Y_valid_gbdt, enet_pred))
    
lasso.fit(X_train_gbdt, Y_train_gbdt)
lasso_pred = lasso.predict(X_valid_gbdt)
print("lasso:", rmsle(Y_valid_gbdt, lasso_pred))

ENet: 469.8993032498972
lasso: 469.9097423047175


In [117]:
# ####Stacking####
print('Stacking...')
stacked_averaged_models = StackingRegressor(
regressors=[lgb_model, gb_model],
meta_regressor= xgb_model
)
stacked_averaged_models.fit(X_train_gbdt, Y_train_gbdt)
stacked_train_pred = stacked_averaged_models.predict(X_valid_gbdt)
stacked_averaged_models.fit(X_train_set_gbdt, Y_train_set_gbdt)
stacked_pred = stacked_averaged_models.predict(X_test.values)
print(rmsle(Y_valid, stacked_train_pred))
print(rmsle(Y_valid, stacked_train_pred*0.30 + gb_train_pred*0.30 + 
       lgb_train_pred*0.20 + xgb_train_pred*0.20))
ensemble = stacked_pred*0.30 + gb_pred*0.30  + lgb_pred*0.20 + xgb_pred*0.20

Stacking...
273.50445093566856
257.8798806512508


In [119]:
submission = pd.DataFrame({
    "id": submit_example['id'], 
    "forecastVolum": ensemble.round().astype(int)
})
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: 0 if x < 0 else x)#<0的结果变为0
submission.to_csv('xgb_lgb_gbdt_enet_lasso_stacking.csv', index=False)

In [120]:
stacked_three=pd.read_csv('stacking_three_0_49623.csv')
submission=pd.read_csv('xgb_lgb_gbdt_enet_lasso_stacking.csv')
#submission['forecastVolum']=((submission['forecastVolum']+yulao1['forecastVolum']+yulao2['forecastVolum'])/3.0).round().astype(int)
submission['forecastVolum']=submission['forecastVolum']*0.4+stacked_three['forecastVolum']*0.6
####################融合结果的极端值剔除-【开始】###########################  
#处理融合后结果中的极端值。把太大的数值（降序排列时，位于顶部往下0.005的数值，就是只有0.005的数比它大）缩小一点（乘以0.77），把太小的数值（降序排列时，位于顶部往下0.99的数值）放大一点（乘以1.1）
q1 = submission['forecastVolum'].quantile(0.005)
q2 = submission['forecastVolum'].quantile(0.995)
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: x if x > q1 else x*0.77)
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: x if x < q2 else x*1.1)
####################融合结果的极端值剔除-【结束】###########################
submission['forecastVolum']=submission['forecastVolum'].round().astype(int)
submission.to_csv('stacking_stackingAndThree.csv', index=False)

In [80]:
X_test = data[data.date_block_num >= 25].sort_values(by='id')[features]
xgb_pred = xgb_model.predict(X_test)
submission = pd.DataFrame({
    "id": submit_example['id'], 
    "forecastVolum": xgb_pred.round().astype(int)
})
submission['forecastVolum'] = submission['forecastVolum'].apply(lambda x: 0 if x < 0 else x)#<0的结果变为0
submission.to_csv('lgb_holiday_chabi.csv', index=False)