In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
from IPython.display import display #方便对数据进行可视化
import time
import datetime

In [2]:
def rmspe(y, y_hat):
    """Root mean squared percentage error for log-scale targets and predictions.

    Both inputs are passed through ``np.exp`` first, because the sales labels
    used in this notebook are log-transformed.

    Args:
        y: Array of true values on the log scale.
        y_hat: Array of predicted values on the log scale.

    Returns:
        ``sqrt(mean(((actual - predicted) / actual) ** 2))``. Entries whose
        actual value is 0 contribute an error of 0 instead of infinity; they
        are still counted in the mean.
    """
    actual = np.exp(y)
    predicted = np.exp(y_hat)

    # Reciprocal of the actual values, left at 0 wherever the actual value is 0.
    # The original author cites the competition rule "Any day and store with
    # 0 sales is ignored in scoring." as the reason for zeroing those rows.
    inverse_actual = np.zeros(actual.shape, dtype=float)
    is_nonzero = actual != 0
    inverse_actual[is_nonzero] = 1. / actual[is_nonzero]

    pct_error = (actual - predicted) * inverse_actual
    return np.sqrt(np.mean(pct_error ** 2))


# Custom evaluation metric handed to xgboost.

def rmspe_feval(y_hat, dy):
    """Score predictions against the labels stored in ``dy`` using ``rmspe``.

    Args:
        y_hat: Predictions, forwarded to ``rmspe`` as its second argument.
        dy: Object exposing ``get_label()`` (the data set being evaluated).

    Returns:
        The tuple ``('rmspe', value)``.
    """
    return 'rmspe', rmspe(dy.get_label(), y_hat)


# Training helper.
def train(param, dtrain, num_round, feval_c, stopping, evals=None):
    """Train an xgboost model and print how long training took.

    Args:
        param: Booster parameter dict forwarded to ``xgb.train``.
        dtrain: Training data.
        num_round: Maximum number of boosting rounds.
        feval_c: Custom evaluation function (e.g. ``rmspe_feval``).
        stopping: Value passed as ``early_stopping_rounds``.
        evals: Optional watch list of ``(data, name)`` pairs. Defaults to
            ``[(dtrain, 'train')]``, which is what this function always used.
            Monitoring only the training data means the metric keeps improving
            and early stopping rarely triggers; pass a list ending with a
            held-out set (xgboost uses the last entry for early stopping) to
            stop on validation performance instead.

    Returns:
        The model returned by ``xgb.train``.
    """
    if evals is None:
        evals = [(dtrain, 'train')]

    # time.clock() was removed in Python 3.8; perf_counter() is the supported
    # high-resolution timer for measuring elapsed time.
    started = time.perf_counter()

    model = xgb.train(
        param, dtrain, num_round,
        feval=feval_c,
        evals=evals,
        early_stopping_rounds=stopping,
        verbose_eval=True,
    )

    elapsed = time.perf_counter() - started

    print('The training time of the model is: {:.0f}s'.format(elapsed))

    return model

# Scoring helper.
def score(bst, dval):
    """Print the RMSPE of ``bst`` on the data set ``dval``.

    Args:
        bst: Trained model exposing ``predict``.
        dval: Data set passed to ``bst.predict``; must also expose
            ``get_label()`` for the true (log-scale) targets.

    Returns:
        None. The score is only printed.
    """
    predictions = bst.predict(dval)
    labels = dval.get_label()
    validation_rmspe = rmspe(labels, predictions)
    print('The rmspe of the model on validation data set is {:.6f}'.format(validation_rmspe))

In [3]:
# Process the data from scratch.

# Training records, per-store features, and the test records.
train_data = pd.read_csv("train.csv", low_memory=False)
store_features = pd.read_csv("store.csv")
test_data = pd.read_csv("test.csv")

# Split each 'Date' string on '-' and store the three parts as float
# 'year' / 'month' / 'day' columns on both the training and test frames.
for frame in (train_data, test_data):
    for position, column in enumerate(('year', 'month', 'day')):
        frame[column] = frame['Date'].map(
            lambda text, i=position: float(text.split('-')[i])
        )




In [4]:
# Extra step: carve out the validation set before anything else.
# The original author's inspection of the data file places the records for
# 2015/1/24 - 2015/7/31 at the top of train.csv (noted as rows 1 - 210736);
# the split below takes the first VAL_ROWS rows of the frame as validation data.
# NOTE(review): this relies on train.csv keeping that row order — confirm if the
# file is ever re-exported or re-sorted.
VAL_ROWS = 210735

val_data = train_data.iloc[:VAL_ROWS]
train_data = train_data.iloc[VAL_ROWS:]

# Use only the rows whose sales are not 0.
val_data = val_data[val_data['Sales'] != 0]
train_data = train_data[train_data['Sales'] != 0]

# Join the per-store features onto every data set.
train_data = pd.merge(train_data, store_features, on='Store')
val_data = pd.merge(val_data, store_features, on='Store')
test_data = pd.merge(test_data, store_features, on='Store')

train_data = train_data.drop('Customers', axis=1)
sales_train = train_data['Sales']
features_train = train_data.drop('Sales', axis=1)

val_data = val_data.drop('Customers', axis=1)
sales_val = val_data['Sales']
features_val = val_data.drop('Sales', axis=1)

features_test = test_data.drop('Id', axis=1)

# String-valued features to one-hot encode.
category = ['StoreType', 'Assortment', 'PromoInterval', 'StateHoliday']

# Concatenate train / validation / test, one-hot encode them together so all
# three end up with the same dummy columns, then split them apart again.
# Row counts are captured once, before the feature frames are overwritten, so
# the re-split below does not depend on reading shapes from reassigned names.
n_train = features_train.shape[0]
n_val = features_val.shape[0]
n_test = features_test.shape[0]

print('合并前：')
print(n_train)
print(n_val)
print(n_test)

data_merge = pd.concat([features_train, features_val, features_test], ignore_index=True)
data_merge_pd = pd.get_dummies(data_merge, columns=category)

# Convert the date into the day of the year (1-366).
data_merge_pd['Date'] = pd.to_datetime(data_merge_pd['Date'])
data_merge_pd['Dayofyear'] = data_merge_pd['Date'].dt.dayofyear

# Replace NA/NaN values with 0.
data_merge_pd = data_merge_pd.fillna(0)

data_merge_pd = data_merge_pd.drop('Date', axis=1)

# Split back into the three sets by position.
features_train = data_merge_pd.iloc[:n_train]
features_val = data_merge_pd.iloc[n_train:n_train + n_val]
features_test = data_merge_pd.iloc[n_train + n_val:]

# The round trip through the merged frame must not gain or lose rows, and the
# label vectors must still line up with their feature frames.
assert (features_train.shape[0], features_val.shape[0], features_test.shape[0]) == (n_train, n_val, n_test)
assert len(sales_train) == n_train and len(sales_val) == n_val

print('合并后：')
print(features_train.shape[0])
print(features_val.shape[0])
print(features_test.shape[0])

# Log-transform the sales. Rows with 0 sales were removed above, so no +1
# offset is needed.
sales_train_log = np.log(sales_train)
sales_val_log = np.log(sales_val)

dtrain = xgb.DMatrix(features_train, label=sales_train_log)

dval = xgb.DMatrix(features_val, label=sales_val_log)





合并前：
669257
175081
41088
合并后：
669257
175081
41088


In [None]:
# Booster parameters.
param = dict(
    max_depth=10,
    eta=0.3,
    silent=0,
    objective='reg:linear',
    min_child_weight=6,
    colsample_bytree=0.8,
    subsample=0.9,
)

# Upper bound on boosting rounds and the early-stopping patience.
num_round = 100000
stopping = 100

# Train, then print the validation score.
model = train(param, dtrain, num_round, rmspe_feval, stopping)
score(model, dval)

[0]	train-rmse:5.7891	train-rmspe:0.996635
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.05931	train-rmspe:0.981283
[2]	train-rmse:2.85058	train-rmspe:0.937849
[3]	train-rmse:2.00757	train-rmspe:0.856442
[4]	train-rmse:1.42151	train-rmspe:0.743922
[5]	train-rmse:1.01565	train-rmspe:0.619834
[6]	train-rmse:0.739707	train-rmspe:0.505168
[7]	train-rmse:0.552377	train-rmspe:0.414403
[8]	train-rmse:0.429867	train-rmspe:0.353426
[9]	train-rmse:0.349745	train-rmspe:0.316834
[10]	train-rmse:0.302827	train-rmspe:0.301291
[11]	train-rmse:0.275441	train-rmspe:0.295349
[12]	train-rmse:0.253388	train-rmspe:0.289828
[13]	train-rmse:0.230576	train-rmspe:0.28131
[14]	train-rmse:0.224466	train-rmspe:0.279845
[15]	train-rmse:0.214092	train-rmspe:0.27502
[16]	train-rmse:0.208718	train-rmspe:0.27178
[17]	train-rmse:0.204188	train-rmspe:0.270336
[18]	train-rmse:0.201141	train-rmspe:0.26690

[175]	train-rmse:0.083568	train-rmspe:0.113543
[176]	train-rmse:0.083526	train-rmspe:0.113499
[177]	train-rmse:0.083438	train-rmspe:0.11333
[178]	train-rmse:0.083375	train-rmspe:0.113253
[179]	train-rmse:0.083315	train-rmspe:0.113205
[180]	train-rmse:0.083228	train-rmspe:0.112644
[181]	train-rmse:0.08302	train-rmspe:0.112347
[182]	train-rmse:0.082928	train-rmspe:0.112125
[183]	train-rmse:0.08277	train-rmspe:0.11202
[184]	train-rmse:0.082648	train-rmspe:0.111786
[185]	train-rmse:0.082532	train-rmspe:0.111698
[186]	train-rmse:0.082446	train-rmspe:0.111208
[187]	train-rmse:0.082371	train-rmspe:0.111102
[188]	train-rmse:0.082287	train-rmspe:0.111011
[189]	train-rmse:0.082201	train-rmspe:0.11094
[190]	train-rmse:0.082096	train-rmspe:0.110541
[191]	train-rmse:0.081991	train-rmspe:0.110334
[192]	train-rmse:0.081935	train-rmspe:0.105182
[193]	train-rmse:0.081806	train-rmspe:0.104991
[194]	train-rmse:0.081681	train-rmspe:0.104827
[195]	train-rmse:0.081553	train-rmspe:0.104704
[196]	train-rmse:0

[351]	train-rmse:0.069233	train-rmspe:0.074947
[352]	train-rmse:0.069171	train-rmspe:0.074951
[353]	train-rmse:0.069081	train-rmspe:0.07483
[354]	train-rmse:0.069037	train-rmspe:0.074755
[355]	train-rmse:0.06897	train-rmspe:0.074663
[356]	train-rmse:0.068924	train-rmspe:0.074609
[357]	train-rmse:0.068855	train-rmspe:0.074516
[358]	train-rmse:0.068802	train-rmspe:0.074456
[359]	train-rmse:0.068762	train-rmspe:0.074398
[360]	train-rmse:0.068714	train-rmspe:0.074341
[361]	train-rmse:0.068664	train-rmspe:0.07425
[362]	train-rmse:0.068607	train-rmspe:0.073911
[363]	train-rmse:0.068561	train-rmspe:0.073866
[364]	train-rmse:0.068496	train-rmspe:0.073777
[365]	train-rmse:0.068437	train-rmspe:0.07369
[366]	train-rmse:0.068369	train-rmspe:0.073628
[367]	train-rmse:0.068322	train-rmspe:0.07358
[368]	train-rmse:0.068266	train-rmspe:0.073467
[369]	train-rmse:0.06825	train-rmspe:0.073452
[370]	train-rmse:0.068181	train-rmspe:0.073343
[371]	train-rmse:0.06814	train-rmspe:0.073286
[372]	train-rmse:0.0

[527]	train-rmse:0.061278	train-rmspe:0.064052
[528]	train-rmse:0.061239	train-rmspe:0.064001
[529]	train-rmse:0.061222	train-rmspe:0.063984
[530]	train-rmse:0.061199	train-rmspe:0.063961
[531]	train-rmse:0.061161	train-rmspe:0.063923
[532]	train-rmse:0.061121	train-rmspe:0.063879
[533]	train-rmse:0.061097	train-rmspe:0.063856
[534]	train-rmse:0.061054	train-rmspe:0.063807
[535]	train-rmse:0.061012	train-rmspe:0.063757
[536]	train-rmse:0.060972	train-rmspe:0.063654
[537]	train-rmse:0.060931	train-rmspe:0.063607
[538]	train-rmse:0.060898	train-rmspe:0.063559
[539]	train-rmse:0.060858	train-rmspe:0.063514
[540]	train-rmse:0.060829	train-rmspe:0.063483
[541]	train-rmse:0.060795	train-rmspe:0.06345
[542]	train-rmse:0.060753	train-rmspe:0.063392
[543]	train-rmse:0.060717	train-rmspe:0.063355
[544]	train-rmse:0.060694	train-rmspe:0.063332
[545]	train-rmse:0.060645	train-rmspe:0.063282
[546]	train-rmse:0.06062	train-rmspe:0.063256
[547]	train-rmse:0.060577	train-rmspe:0.063214
[548]	train-rms

[703]	train-rmse:0.055602	train-rmspe:0.05733
[704]	train-rmse:0.055581	train-rmspe:0.057312
[705]	train-rmse:0.05556	train-rmspe:0.05729
[706]	train-rmse:0.055535	train-rmspe:0.057262
[707]	train-rmse:0.055512	train-rmspe:0.057234
[708]	train-rmse:0.055485	train-rmspe:0.057206
[709]	train-rmse:0.055465	train-rmspe:0.057178
[710]	train-rmse:0.055428	train-rmspe:0.057136
[711]	train-rmse:0.055397	train-rmspe:0.057104
[712]	train-rmse:0.055372	train-rmspe:0.057079
[713]	train-rmse:0.055354	train-rmspe:0.05706
[714]	train-rmse:0.055319	train-rmspe:0.057005
[715]	train-rmse:0.05529	train-rmspe:0.056976
[716]	train-rmse:0.055263	train-rmspe:0.056948
[717]	train-rmse:0.05523	train-rmspe:0.056915
[718]	train-rmse:0.055211	train-rmspe:0.056896
[719]	train-rmse:0.055194	train-rmspe:0.056875
[720]	train-rmse:0.055156	train-rmspe:0.056815
[721]	train-rmse:0.055124	train-rmspe:0.056783
[722]	train-rmse:0.055086	train-rmspe:0.056737
[723]	train-rmse:0.055063	train-rmspe:0.056709
[724]	train-rmse:0.

[879]	train-rmse:0.050999	train-rmspe:0.052205
[880]	train-rmse:0.050978	train-rmspe:0.052183
[881]	train-rmse:0.050954	train-rmspe:0.05216
[882]	train-rmse:0.050934	train-rmspe:0.052139
[883]	train-rmse:0.05091	train-rmspe:0.052114
[884]	train-rmse:0.050875	train-rmspe:0.052077
[885]	train-rmse:0.050852	train-rmspe:0.052045
[886]	train-rmse:0.05082	train-rmspe:0.052001
[887]	train-rmse:0.050794	train-rmspe:0.051966
[888]	train-rmse:0.05077	train-rmspe:0.051931
[889]	train-rmse:0.050753	train-rmspe:0.051912
[890]	train-rmse:0.050731	train-rmspe:0.051889
[891]	train-rmse:0.050706	train-rmspe:0.051861
[892]	train-rmse:0.050689	train-rmspe:0.051843
[893]	train-rmse:0.050669	train-rmspe:0.051822
[894]	train-rmse:0.050644	train-rmspe:0.051792
[895]	train-rmse:0.050625	train-rmspe:0.051773
[896]	train-rmse:0.050604	train-rmspe:0.05175
[897]	train-rmse:0.050577	train-rmspe:0.051722
[898]	train-rmse:0.050553	train-rmspe:0.0517
[899]	train-rmse:0.050527	train-rmspe:0.051676
[900]	train-rmse:0.0

[1053]	train-rmse:0.047177	train-rmspe:0.048012
[1054]	train-rmse:0.047161	train-rmspe:0.047996
[1055]	train-rmse:0.04715	train-rmspe:0.047985
[1056]	train-rmse:0.047127	train-rmspe:0.047961
[1057]	train-rmse:0.047091	train-rmspe:0.047925
[1058]	train-rmse:0.047067	train-rmspe:0.047889
[1059]	train-rmse:0.047053	train-rmspe:0.047878
[1060]	train-rmse:0.047041	train-rmspe:0.047866
[1061]	train-rmse:0.047021	train-rmspe:0.047846
[1062]	train-rmse:0.047012	train-rmspe:0.047836
[1063]	train-rmse:0.04699	train-rmspe:0.047814
[1064]	train-rmse:0.046972	train-rmspe:0.047797
[1065]	train-rmse:0.046955	train-rmspe:0.047779
[1066]	train-rmse:0.046936	train-rmspe:0.04776
[1067]	train-rmse:0.046911	train-rmspe:0.047733
[1068]	train-rmse:0.046886	train-rmspe:0.047709
[1069]	train-rmse:0.046877	train-rmspe:0.047697
[1070]	train-rmse:0.046861	train-rmspe:0.047679
[1071]	train-rmse:0.046839	train-rmspe:0.047655
[1072]	train-rmse:0.046828	train-rmspe:0.047644
[1073]	train-rmse:0.046805	train-rmspe:0.04

[1225]	train-rmse:0.044027	train-rmspe:0.044666
[1226]	train-rmse:0.044012	train-rmspe:0.044651
[1227]	train-rmse:0.043995	train-rmspe:0.044634
[1228]	train-rmse:0.043976	train-rmspe:0.044614
[1229]	train-rmse:0.043964	train-rmspe:0.044603
[1230]	train-rmse:0.043949	train-rmspe:0.044594
[1231]	train-rmse:0.043931	train-rmspe:0.044575
[1232]	train-rmse:0.043924	train-rmspe:0.044567
[1233]	train-rmse:0.043904	train-rmspe:0.044548
[1234]	train-rmse:0.043896	train-rmspe:0.044539
[1235]	train-rmse:0.043883	train-rmspe:0.044525
[1236]	train-rmse:0.043863	train-rmspe:0.044504
[1237]	train-rmse:0.043842	train-rmspe:0.044477
[1238]	train-rmse:0.043828	train-rmspe:0.044463
[1239]	train-rmse:0.043811	train-rmspe:0.044445
[1240]	train-rmse:0.043794	train-rmspe:0.044427
[1241]	train-rmse:0.043771	train-rmspe:0.0444
[1242]	train-rmse:0.043747	train-rmspe:0.044375
[1243]	train-rmse:0.04373	train-rmspe:0.044357
[1244]	train-rmse:0.043715	train-rmspe:0.044342
[1245]	train-rmse:0.043693	train-rmspe:0.04

[1397]	train-rmse:0.041254	train-rmspe:0.041726
[1398]	train-rmse:0.041238	train-rmspe:0.041708
[1399]	train-rmse:0.04122	train-rmspe:0.041689
[1400]	train-rmse:0.041205	train-rmspe:0.041674
[1401]	train-rmse:0.041189	train-rmspe:0.041657
[1402]	train-rmse:0.041168	train-rmspe:0.041636
[1403]	train-rmse:0.041153	train-rmspe:0.041621
[1404]	train-rmse:0.041139	train-rmspe:0.041607
[1405]	train-rmse:0.041123	train-rmspe:0.041591
[1406]	train-rmse:0.041104	train-rmspe:0.041569
[1407]	train-rmse:0.041083	train-rmspe:0.041547
[1408]	train-rmse:0.041066	train-rmspe:0.04153
[1409]	train-rmse:0.041049	train-rmspe:0.041512
[1410]	train-rmse:0.041038	train-rmspe:0.041501
[1411]	train-rmse:0.041033	train-rmspe:0.041495
[1412]	train-rmse:0.041013	train-rmspe:0.041475
[1413]	train-rmse:0.041	train-rmspe:0.041461
[1414]	train-rmse:0.040984	train-rmspe:0.041445
[1415]	train-rmse:0.04096	train-rmspe:0.041418
[1416]	train-rmse:0.040943	train-rmspe:0.0414
[1417]	train-rmse:0.040934	train-rmspe:0.041391


[1569]	train-rmse:0.038813	train-rmspe:0.039173
[1570]	train-rmse:0.038797	train-rmspe:0.039156
[1571]	train-rmse:0.038785	train-rmspe:0.039144
[1572]	train-rmse:0.038781	train-rmspe:0.039141
[1573]	train-rmse:0.03877	train-rmspe:0.039129
[1574]	train-rmse:0.038754	train-rmspe:0.039113
[1575]	train-rmse:0.038737	train-rmspe:0.039095
[1576]	train-rmse:0.038724	train-rmspe:0.039083
[1577]	train-rmse:0.038707	train-rmspe:0.039066
[1578]	train-rmse:0.038689	train-rmspe:0.039047
[1579]	train-rmse:0.038672	train-rmspe:0.039028
[1580]	train-rmse:0.038667	train-rmspe:0.039023
[1581]	train-rmse:0.038648	train-rmspe:0.039003
[1582]	train-rmse:0.038633	train-rmspe:0.038989
[1583]	train-rmse:0.038623	train-rmspe:0.038978
[1584]	train-rmse:0.038608	train-rmspe:0.038964
[1585]	train-rmse:0.038596	train-rmspe:0.038951
[1586]	train-rmse:0.038587	train-rmspe:0.038942
[1587]	train-rmse:0.038571	train-rmspe:0.038926
[1588]	train-rmse:0.038557	train-rmspe:0.038913
[1589]	train-rmse:0.038543	train-rmspe:0.

[1741]	train-rmse:0.036583	train-rmspe:0.036859
[1742]	train-rmse:0.036571	train-rmspe:0.036848
[1743]	train-rmse:0.036559	train-rmspe:0.036835
[1744]	train-rmse:0.036546	train-rmspe:0.036822
[1745]	train-rmse:0.036533	train-rmspe:0.036809
[1746]	train-rmse:0.036515	train-rmspe:0.036791
[1747]	train-rmse:0.036504	train-rmspe:0.03678
[1748]	train-rmse:0.036493	train-rmspe:0.036769
[1749]	train-rmse:0.036481	train-rmspe:0.036757
[1750]	train-rmse:0.03647	train-rmspe:0.036745
[1751]	train-rmse:0.036455	train-rmspe:0.036729
[1752]	train-rmse:0.036444	train-rmspe:0.036718
[1753]	train-rmse:0.036433	train-rmspe:0.036707
[1754]	train-rmse:0.036417	train-rmspe:0.036691
[1755]	train-rmse:0.036407	train-rmspe:0.03668
[1756]	train-rmse:0.036399	train-rmspe:0.036672
[1757]	train-rmse:0.036386	train-rmspe:0.03666
[1758]	train-rmse:0.036376	train-rmspe:0.036649
[1759]	train-rmse:0.036363	train-rmspe:0.036637
[1760]	train-rmse:0.03635	train-rmspe:0.036623
[1761]	train-rmse:0.036339	train-rmspe:0.0366

[1913]	train-rmse:0.034578	train-rmspe:0.034811
[1914]	train-rmse:0.034569	train-rmspe:0.034802
[1915]	train-rmse:0.034557	train-rmspe:0.034789
[1916]	train-rmse:0.034549	train-rmspe:0.034781
[1917]	train-rmse:0.034535	train-rmspe:0.034767
[1918]	train-rmse:0.034527	train-rmspe:0.034759
[1919]	train-rmse:0.034518	train-rmspe:0.03475
[1920]	train-rmse:0.034504	train-rmspe:0.034735
[1921]	train-rmse:0.034489	train-rmspe:0.034722
[1922]	train-rmse:0.034482	train-rmspe:0.034715
[1923]	train-rmse:0.034474	train-rmspe:0.034706
[1924]	train-rmse:0.034462	train-rmspe:0.034695


In [None]:
# Print the RMSPE of the current `model` on the validation DMatrix again.
score(model, dval)

In [None]:
# Experimental variant of the training helper, used for trying things out.
def train_t(param, dtrain, num_round, feval_c, stopping, evals=None):
    """Train an xgboost model with ``maximize=True`` and print the elapsed time.

    Args:
        param: Booster parameter dict forwarded to ``xgb.train``.
        dtrain: Training data.
        num_round: Maximum number of boosting rounds.
        feval_c: Custom evaluation function (e.g. ``rmspe_feval``).
        stopping: Value passed as ``early_stopping_rounds``.
        evals: Optional watch list of ``(data, name)`` pairs. Defaults to
            ``[(dtrain, 'train')]``, which is what this function always used.

    Returns:
        The model returned by ``xgb.train``.
    """
    if evals is None:
        evals = [(dtrain, 'train')]

    # time.clock() was removed in Python 3.8; perf_counter() is the supported
    # high-resolution timer for measuring elapsed time.
    started = time.perf_counter()

    # NOTE(review): maximize=True asks xgboost to treat a larger metric as
    # better, while rmspe is an error measure. This looks like a deliberate
    # experiment with the flag — confirm before reusing this helper.
    model = xgb.train(
        param, dtrain, num_round,
        feval=feval_c,
        maximize=True,
        evals=evals,
        early_stopping_rounds=stopping,
        verbose_eval=True,
    )

    elapsed = time.perf_counter() - started

    print('The training time of the model is: {:.0f}s'.format(elapsed))

    return model

# Booster parameters.
param = dict(
    max_depth=10,
    eta=0.3,
    silent=0,
    objective='reg:linear',
    min_child_weight=6,
    colsample_bytree=0.8,
    subsample=0.9,
)

# Short run: at most 100 rounds, early-stopping patience of 10.
num_round = 100
stopping = 10

# Train with the experimental helper.
model = train_t(param, dtrain, num_round, rmspe_feval, stopping)
