In [79]:
import numpy as np
import xgboost as xgb
import pandas as pd
from IPython.display import display #方便对数据进行可视化
import time
from datetime import datetime

In [80]:
def rmspe(y, y_hat):
    #由于sales取了对数，这里要还原
    y = np.exp(y)
    y_hat = np.exp(y_hat)
    
    #由于有yi=0的情况，直接计算会得出无穷大的结果，所以需要处理一下
    #定义一个y_rev来表示y的倒数，y为0时y_rev也赋值为0（因为在项目中提到“ Any day and store with 0 sales is ignored in scoring.“）
    y_rev = np.zeros(y.shape, dtype = float)
    n_zero = y != 0
    y_rev[n_zero] = 1./y[n_zero]

    rmspe = np.sqrt(np.mean(((y - y_hat)*y_rev)**2))
    return rmspe


#自定义一个评价函数

def rmspe_feval(y_hat, dy):
    y = dy.get_label()
    rmspe_score = rmspe(y, y_hat)
    return 'rmspe', rmspe_score


#定义训练函数
def train(param, dtrain, dval, num_round, feval_c, stopping):
    train_time = time.clock()

    model = xgb.train(
        param, dtrain, num_round, feval = feval_c, 
        evals = [(dtrain, 'train'), (dval, 'val')], early_stopping_rounds = stopping, 
        verbose_eval = True
    )

    train_time = time.clock() - train_time

    print('The training time of the model is: {:.0f}s'.format(train_time))
    
    return model

#定义一个打分函数
def score(bst, dval):
    y_pred = bst.predict(dval)
#print(y_pred)
    y_val = dval.get_label()
    score = rmspe(y_val, y_pred)
#print(score)
    print('The rmspe of the model on validation data set is {:.6f}'.format(score))
    pass

从论坛上看到一些方法，可以改进特征工程的处理。

In [81]:
# 导入训练数据
train_data = pd.read_csv(
    "train.csv", 
    parse_dates = True,
    low_memory=False, index_col = 'Date')

store_features = pd.read_csv("store.csv")
# 导入测试数据
test_data = pd.read_csv("test.csv",
                       parse_dates = True,
    low_memory=False, index_col = 'Date')

#只使用开门的数据
train_data = train_data[train_data['Open'] == 1]
test_data = test_data[test_data['Open'] == 1]



In [82]:
train_data['year'] = train_data.index.year
train_data['month'] = train_data.index.month
train_data['day'] = train_data.index.day

test_data['year'] = test_data.index.year
test_data['month'] = test_data.index.month
test_data['day'] = test_data.index.day

In [83]:
train_data['dayofyear'] = train_data.index.dayofyear
test_data['dayofyear'] = test_data.index.dayofyear

train_data['weekofyear'] = train_data.index.weekofyear
test_data['weekofyear'] = test_data.index.weekofyear

In [84]:
display(train_data.head(n=5))
display(test_data.head(n=1))
display(store_features.head(n=1))

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,year,month,day,dayofyear,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-07-31,1,5,5263,555,1,1,0,1,2015,7,31,212,31
2015-07-31,2,5,6064,625,1,1,0,1,2015,7,31,212,31
2015-07-31,3,5,8314,821,1,1,0,1,2015,7,31,212,31
2015-07-31,4,5,13995,1498,1,1,0,1,2015,7,31,212,31
2015-07-31,5,5,4822,559,1,1,0,1,2015,7,31,212,31


Unnamed: 0_level_0,Id,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,year,month,day,dayofyear,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-09-17,1,1,4,1.0,1,0,0,2015,9,17,260,38


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,


In [85]:
train_data.loc['2015-07-31']

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,year,month,day,dayofyear,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-07-31,1,5,5263,555,1,1,0,1,2015,7,31,212,31
2015-07-31,2,5,6064,625,1,1,0,1,2015,7,31,212,31
2015-07-31,3,5,8314,821,1,1,0,1,2015,7,31,212,31
2015-07-31,4,5,13995,1498,1,1,0,1,2015,7,31,212,31
2015-07-31,5,5,4822,559,1,1,0,1,2015,7,31,212,31
2015-07-31,6,5,5651,589,1,1,0,1,2015,7,31,212,31
2015-07-31,7,5,15344,1414,1,1,0,1,2015,7,31,212,31
2015-07-31,8,5,8492,833,1,1,0,1,2015,7,31,212,31
2015-07-31,9,5,8565,687,1,1,0,1,2015,7,31,212,31
2015-07-31,10,5,7185,681,1,1,0,1,2015,7,31,212,31


In [71]:
#此处增加一步，先将验证集划分出来
#打开数据文件，很容易找到，2015/1/24-2015/7/31时间范围内的数据编号为1 - 210736
val_data = train_data[datetime(2015, 1, 24):datetime(2015, 7, 31)]
train_data = train_data[datetime(2013, 1, 1):datetime(2015, 1, 23)]

display(val_data.head(n=1))

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,year,month,day,dayofyear,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [72]:
#融合
train_data = pd.merge(train_data, store_features, on = 'Store')
val_data = pd.merge(val_data, store_features, on = 'Store')
test_data = pd.merge(test_data, store_features, on = 'Store')

train_data = train_data.drop('Customers', axis = 1)
sales_train = train_data['Sales']
features_train = train_data.drop('Sales', axis = 1)

val_data = val_data.drop('Customers', axis = 1)
sales_val = val_data['Sales']
features_val = val_data.drop('Sales', axis = 1)

features_test = test_data.drop('Id', axis = 1)

# 对字符串特征进行独热编码
category = ['StoreType', 'Assortment', 'StateHoliday', 'PromoInterval']


#首先将训练集、验证集、测试集合并，然后统一独热编码，然后再拆分为训练集、验证集、测试集

print('合并前：')
print(features_train.shape[0])
print(features_val.shape[0])
print(features_test.shape[0])

data_merge = pd.concat([features_train, features_val], ignore_index = True)
data_merge = pd.concat([data_merge, features_test], ignore_index = True)
data_merge_pd = pd.get_dummies(data_merge, columns = category)

# 需要把NA/NAN的数据转化成0
data_merge_pd.fillna(0, inplace = True)

#拆分

features_train = data_merge_pd[ : features_train.shape[0]]
features_val = data_merge_pd[features_train.shape[0] : (features_val.shape[0] + features_train.shape[0])]
features_test = data_merge_pd[ (features_val.shape[0] + features_train.shape[0]) : ]

print('合并后：')
print(features_train.shape[0])
print(features_val.shape[0])
print(features_test.shape[0])

display(features_train.head(n=1))
display(features_val.head(n=1))
display(features_test.head(n=1))

#下面将sales做log处理

sales_train_log = np.log(sales_train) #因为去掉了为0的销售数据，所以不用+1了
sales_val_log = np.log(sales_val)

dtrain = xgb.DMatrix(features_train, label = sales_train_log)

dval = xgb.DMatrix(features_val, label = sales_val_log)

合并前：
0
0
35093
合并后：
0
0
35093


Unnamed: 0,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,DayOfWeek,Open,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,SchoolHoliday,...,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"


Unnamed: 0,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,DayOfWeek,Open,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,SchoolHoliday,...,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"


Unnamed: 0,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,DayOfWeek,Open,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,SchoolHoliday,...,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,1270.0,9.0,2008.0,4,1.0,1,0,0.0,0.0,0,...,1,0,1,0,0,1,0,0,0,0


In [34]:
#设定参数
param = {'max_depth': 10, 'eta': 0.3, 'silent': 0, 'objective': 'reg:linear', 'min_child_weight': 6, 'colsample_bytree': 0.3, 'subsample':0.3}

#设定迭代次数

num_round = 100000
stopping = 100

#训练并打分
model = train(param, dtrain, dval, num_round, rmspe_feval, stopping)
score(model, dval)

[0]	train-rmse:5.79	val-rmse:5.83197	train-rmspe:0.996609	val-rmspe:0.996766
Multiple eval metrics have been passed: 'val-rmspe' will be used for early stopping.

Will train until val-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.06176	val-rmse:4.10128	train-rmspe:0.981025	val-rmspe:0.981869
[2]	train-rmse:2.85607	val-rmse:2.89386	train-rmspe:0.936859	val-rmspe:0.939547
[3]	train-rmse:2.01564	val-rmse:2.05564	train-rmspe:0.854528	val-rmspe:0.86077
[4]	train-rmse:1.43131	val-rmse:1.47341	train-rmspe:0.742221	val-rmspe:0.753271
[5]	train-rmse:1.03057	val-rmse:1.06602	train-rmspe:0.619714	val-rmspe:0.631911
[6]	train-rmse:0.756296	val-rmse:0.789472	train-rmspe:0.509557	val-rmspe:0.521228
[7]	train-rmse:0.570189	val-rmse:0.600734	train-rmspe:0.422828	val-rmspe:0.430695
[8]	train-rmse:0.455555	val-rmse:0.481832	train-rmspe:0.371125	val-rmspe:0.370215
[9]	train-rmse:0.385175	val-rmse:0.407745	train-rmspe:0.343107	val-rmspe:0.336967
[10]	train-rmse:0.33158	val-rmse:0.351075	train-rmsp

[98]	train-rmse:0.128775	val-rmse:0.162936	train-rmspe:0.194748	val-rmspe:0.180629
[99]	train-rmse:0.128717	val-rmse:0.162871	train-rmspe:0.195092	val-rmspe:0.180582
[100]	train-rmse:0.128269	val-rmse:0.163149	train-rmspe:0.195105	val-rmspe:0.180823
[101]	train-rmse:0.128181	val-rmse:0.163309	train-rmspe:0.195238	val-rmspe:0.181062
[102]	train-rmse:0.128147	val-rmse:0.163301	train-rmspe:0.195362	val-rmspe:0.181045
[103]	train-rmse:0.127936	val-rmse:0.16336	train-rmspe:0.195214	val-rmspe:0.180492
[104]	train-rmse:0.127055	val-rmse:0.162907	train-rmspe:0.194539	val-rmspe:0.180199
[105]	train-rmse:0.125671	val-rmse:0.161925	train-rmspe:0.19368	val-rmspe:0.179156
[106]	train-rmse:0.125659	val-rmse:0.161923	train-rmspe:0.19374	val-rmspe:0.179132
[107]	train-rmse:0.12551	val-rmse:0.161413	train-rmspe:0.193531	val-rmspe:0.178507
[108]	train-rmse:0.124498	val-rmse:0.160676	train-rmspe:0.192809	val-rmspe:0.1778
[109]	train-rmse:0.124168	val-rmse:0.160398	train-rmspe:0.19142	val-rmspe:0.17759
[1

[197]	train-rmse:0.109818	val-rmse:0.154749	train-rmspe:0.157816	val-rmspe:0.170734
[198]	train-rmse:0.109715	val-rmse:0.154597	train-rmspe:0.157776	val-rmspe:0.170548
[199]	train-rmse:0.109348	val-rmse:0.154395	train-rmspe:0.157533	val-rmspe:0.170587
[200]	train-rmse:0.109321	val-rmse:0.15441	train-rmspe:0.157519	val-rmspe:0.170553
[201]	train-rmse:0.109274	val-rmse:0.154183	train-rmspe:0.157521	val-rmspe:0.170386
[202]	train-rmse:0.109192	val-rmse:0.154203	train-rmspe:0.157416	val-rmspe:0.17041
[203]	train-rmse:0.10917	val-rmse:0.154223	train-rmspe:0.157301	val-rmspe:0.170399
[204]	train-rmse:0.108932	val-rmse:0.154029	train-rmspe:0.15841	val-rmspe:0.170058
[205]	train-rmse:0.10889	val-rmse:0.153997	train-rmspe:0.159701	val-rmspe:0.170038
[206]	train-rmse:0.108311	val-rmse:0.153632	train-rmspe:0.159202	val-rmspe:0.169609
[207]	train-rmse:0.108285	val-rmse:0.153613	train-rmspe:0.159174	val-rmspe:0.169576
[208]	train-rmse:0.107514	val-rmse:0.153104	train-rmspe:0.158758	val-rmspe:0.1688

[296]	train-rmse:0.100557	val-rmse:0.150755	train-rmspe:0.148623	val-rmspe:0.165839
[297]	train-rmse:0.100525	val-rmse:0.150768	train-rmspe:0.14859	val-rmspe:0.165839
[298]	train-rmse:0.100311	val-rmse:0.150596	train-rmspe:0.148405	val-rmspe:0.165643
[299]	train-rmse:0.100104	val-rmse:0.150427	train-rmspe:0.148317	val-rmspe:0.16554
[300]	train-rmse:0.100034	val-rmse:0.150367	train-rmspe:0.148211	val-rmspe:0.165369
[301]	train-rmse:0.09998	val-rmse:0.150351	train-rmspe:0.14718	val-rmspe:0.165316
[302]	train-rmse:0.099895	val-rmse:0.150317	train-rmspe:0.14737	val-rmspe:0.165304
[303]	train-rmse:0.099845	val-rmse:0.150524	train-rmspe:0.147285	val-rmspe:0.16558
[304]	train-rmse:0.099008	val-rmse:0.150046	train-rmspe:0.146538	val-rmspe:0.165014
[305]	train-rmse:0.099004	val-rmse:0.150092	train-rmspe:0.14654	val-rmspe:0.165033
[306]	train-rmse:0.098849	val-rmse:0.149989	train-rmspe:0.14772	val-rmspe:0.164476
[307]	train-rmse:0.098784	val-rmse:0.149981	train-rmspe:0.147655	val-rmspe:0.164428


[395]	train-rmse:0.094375	val-rmse:0.14916	train-rmspe:0.135037	val-rmspe:0.162878
[396]	train-rmse:0.094311	val-rmse:0.149185	train-rmspe:0.135108	val-rmspe:0.162901
[397]	train-rmse:0.094307	val-rmse:0.149174	train-rmspe:0.135111	val-rmspe:0.162893
[398]	train-rmse:0.094214	val-rmse:0.149139	train-rmspe:0.135047	val-rmspe:0.162931
[399]	train-rmse:0.094123	val-rmse:0.149172	train-rmspe:0.135108	val-rmspe:0.162954
[400]	train-rmse:0.093978	val-rmse:0.149068	train-rmspe:0.135039	val-rmspe:0.162946
[401]	train-rmse:0.093952	val-rmse:0.149053	train-rmspe:0.135029	val-rmspe:0.162934
[402]	train-rmse:0.09392	val-rmse:0.149042	train-rmspe:0.135053	val-rmspe:0.162931
[403]	train-rmse:0.0939	val-rmse:0.149058	train-rmspe:0.135023	val-rmspe:0.162931
[404]	train-rmse:0.093856	val-rmse:0.149034	train-rmspe:0.135021	val-rmspe:0.162896
[405]	train-rmse:0.093851	val-rmse:0.149069	train-rmspe:0.135042	val-rmspe:0.162893
[406]	train-rmse:0.093828	val-rmse:0.148894	train-rmspe:0.13502	val-rmspe:0.1628

In [36]:
score(model, dval)

The rmspe of the model on validation data set is 0.163069
