# Rossmann Store Sales项目的项目文件


## 载入必要的库
首先，需要载入必要的库。

In [83]:
import numpy as np
import xgboost as xgb
import pandas as pd
from IPython.display import display #方便对数据进行可视化

## 数据预处理
### 数据读入
然后，我们需要读入数据。

In [84]:
# 导入训练数据
train_data = pd.read_csv(
    "train.csv", 
    low_memory=False)

store_features = pd.read_csv("store.csv")
# 导入测试数据
test_data = pd.read_csv("test.csv")

# 显示头5条记录

display(train_data.head(n=5))
display(store_features.head(n=5))
display(test_data.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


然后我们需要将train_data与store_feature按照商店编号（Store这个特征域）融合到一个数据集中。对于test数据也做同样处理。



In [85]:
#融合
train_data = pd.merge(train_data, store_features, on = 'Store')
test_data = pd.merge(test_data, store_features, on = 'Store')
#显示头5条记录
display(train_data.head(n=5))
display(test_data.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,857,1,3,2015-09-16,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
2,1713,1,2,2015-09-15,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
3,2569,1,1,2015-09-14,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
4,3425,1,7,2015-09-13,0.0,0,0,0,c,a,1270.0,9.0,2008.0,0,,,


观察test.csv的数据可以发现，Date这个数据都是2015年8月1日之后的，而train.csv的日期最晚到2015年7月31日，而Date数据的类型是字符串，相当于分类数据，直接使用数据将导致test中的Date数据是train中没有出现过的分类，因此要么转换成数值数据，要么把日期拆分成年，月，日3个分类数据。这里将字符串的时间转化为时间戳数值数据。


In [86]:
#将字符串转化为时间戳数值
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

#然后再把时间戳数值转化为浮点值

train_data['Date'] = pd.to_numeric(train_data['Date'], downcast = 'float')
test_data['Date'] = pd.to_numeric(test_data['Date'], downcast = 'float')

#继续通过显示头5条数据来验证转化情况

display(train_data.head(n=5))
print(type(train_data['Date'][1])) #检验一下转化之后的数据类型
print(train_data['Date'][1]-train_data['Date'][2])
display(test_data.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,1.438301e+18,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,1.438214e+18,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,1.438128e+18,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,1.438042e+18,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,1.437955e+18,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


<class 'numpy.float32'>
86449100000000.0


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,1.442448e+18,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,857,1,3,1.442362e+18,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
2,1713,1,2,1.442275e+18,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
3,2569,1,1,1.442189e+18,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
4,3425,1,7,1.442102e+18,0.0,0,0,0,c,a,1270.0,9.0,2008.0,0,,,


可以看到转换后的数值极其的大，我们需要将其归一化

In [87]:
start_date = min(train_data['Date'])
end_date = max(test_data['Date'])
train_data['Date'] = (train_data['Date'] - start_date)/(end_date - start_date)
test_data['Date'] = (test_data['Date'] - start_date)/(end_date - start_date)
#看一下转化后的结果
display(train_data.head(n=5))
display(test_data.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,0.951466,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,0.950454,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,0.949443,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,0.948432,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,0.947421,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,1.0,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,857,1,3,0.998988,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
2,1713,1,2,0.997977,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
3,2569,1,1,0.996967,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
4,3425,1,7,0.995955,0.0,0,0,0,c,a,1270.0,9.0,2008.0,0,,,


观察可以看到，测试集中是没有customer这个特征的，所以在训练集中需要去掉这个特征

In [88]:
train_data = train_data.drop('Customers', axis = 1)

下面把数据切分成特征和标签

In [89]:
sales = train_data['Sales']
features = train_data.drop('Sales', axis = 1)

display(sales.head(n=1))
display(features.head(n=1))

0    5263
Name: Sales, dtype: int64

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,0.951466,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


需要把一些不规范的数据处理一下

In [90]:
# 需要把NA/NAN的数据转化成0
features.fillna(0, inplace = True)
test_data.fillna(0, inplace =True)

display(features.head(n=5))
display(test_data.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,0.951466,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,4,0.950454,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1,3,0.949443,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,1,2,0.948432,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,1,0.947421,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,1.0,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,857,1,3,0.998988,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1713,1,2,0.997977,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,2569,1,1,0.996967,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,3425,1,7,0.995955,0.0,0,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [91]:
# 对字符串特征进行独热编码
category = ['StoreType', 'Assortment', 'PromoInterval', 'StateHoliday']

features_pd = pd.get_dummies(features, columns = category)
test_data_pd = pd.get_dummies(test_data, columns = category)

display(features_pd.head(n=5))
display(test_data_pd.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,5,0.951466,1,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,4,0.950454,1,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,3,0.949443,1,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,2,0.948432,1,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
4,1,1,0.947421,1,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StateHoliday_0,StateHoliday_a
0,1,1,4,1.0,1.0,1,0,1270.0,9.0,2008.0,...,0,1,0,0,1,0,0,0,1,0
1,857,1,3,0.998988,1.0,1,0,1270.0,9.0,2008.0,...,0,1,0,0,1,0,0,0,1,0
2,1713,1,2,0.997977,1.0,1,0,1270.0,9.0,2008.0,...,0,1,0,0,1,0,0,0,1,0
3,2569,1,1,0.996967,1.0,1,0,1270.0,9.0,2008.0,...,0,1,0,0,1,0,0,0,1,0
4,3425,1,7,0.995955,0.0,0,0,1270.0,9.0,2008.0,...,0,1,0,0,1,0,0,0,1,0


可视化可以看到在训练集独热编码后为27列，而测试集只有26列，去掉训练集没有的Id，只有25列，可见测试集中比训练集少了一些类别，需要改变一下编码方式

In [92]:
#首先将训练集和测试集合并，然后统一独热编码，然后再拆分为训练集和测试集
features_test = test_data.drop('Id', axis = 1)
data_merge = pd.concat([features, features_test], ignore_index = True)
data_merge_pd = pd.get_dummies(data_merge, columns = category)
display(data_merge_pd.head(n=5))

#拆分
print(features.shape)
print(features_test.shape)
features_pd = data_merge_pd[0:features.shape[0]]
features_test_pd = data_merge_pd[features.shape[0]:data_merge_pd.shape[0]]
print(features_pd.shape)
print(features_test_pd.shape)

display(features_pd.head(n=5))
display(features_test_pd.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,5,0.951466,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,4,0.950454,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,3,0.949443,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,2,0.948432,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
4,1,1,0.947421,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0


(1017209, 16)
(41088, 16)
(1017209, 27)
(41088, 27)


Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,5,0.951466,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,4,0.950454,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,3,0.949443,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,2,0.948432,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
4,1,1,0.947421,1.0,1,1,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0


Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
1017209,1,4,1.0,1.0,1,0,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1017210,1,3,0.998988,1.0,1,0,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1017211,1,2,0.997977,1.0,1,0,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1017212,1,1,0.996967,1.0,1,0,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0
1017213,1,7,0.995955,0.0,0,0,1270.0,9.0,2008.0,0,...,0,0,1,0,0,0,1,0,0,0


下面切分数据，把数据切分成训练集和验证集

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    features_pd, sales, test_size = 0.2, random_state = 1)

display(X_train.head(n=5))
display(y_train.head(n=5))

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
903560,991,2,0.339737,1.0,1,0,1010.0,0.0,0.0,0,...,0,0,1,0,0,0,1,0,0,0
509452,559,5,0.385238,1.0,0,0,3910.0,11.0,2006.0,1,...,0,0,0,1,0,0,1,0,0,0
570376,626,5,0.73913,1.0,0,1,10740.0,11.0,2013.0,0,...,0,1,1,0,0,0,1,0,0,0
264963,292,5,0.165823,1.0,0,0,1100.0,6.0,2009.0,0,...,0,0,1,0,0,0,1,0,0,0
961639,1055,2,0.898887,1.0,0,0,1980.0,4.0,2009.0,0,...,0,0,1,0,0,0,1,0,0,0


903560    7049
509452    4990
570376    9533
264963    5274
961639    6228
Name: Sales, dtype: int64

下面引入模型

对xgboost的使用主要参考了以下资源：
- https://blog.csdn.net/zc02051126/article/details/46771793
- https://blog.csdn.net/u010657489/article/details/51952785
- https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
- https://blog.csdn.net/u011630575/article/details/79418138
- https://datascience.stackexchange.com/questions/12799/pandas-dataframe-to-dmatrix

In [94]:
dtrain = xgb.DMatrix(X_train, label = y_train)
dval = xgb.DMatrix(X_val, label = y_val)

下面要对模型进行训练

In [98]:
#首先设定参数

param = {'max_depth': 5, 'eta': 0.3, 'silent': 0, 'objective': 'reg:linear'}

#设定迭代次数

num_round = 100

#训练模型，记录训练时间
#定义一个函数，便于后期进行参数调优反复调用

import time
def train(param, dtrain, num_round):
    train_time = time.clock()

    bst = xgb.train(param, dtrain, num_round)

    train_time = time.clock() - train_time

    print('The training time of the model is: {:.0f}s'.format(train_time))
    
    return bst

model = train(param, dtrain, num_round)

The training time of the model is: 120s


In [99]:
#需要定义一个函数来计算均⽅方根百分⽐比误差RMSPE

def rmspe(y, y_hat):
    #由于有yi=0的情况，直接计算会得出无穷大的结果，所以需要处理一下
    #定义一个y_rev来表示y的倒数，y为0时y_rev也赋值为0（因为在项目中提到“ Any day and store with 0 sales is ignored in scoring.“
    y_rev = np.zeros(y.shape, dtype = float)
    n_zero = y != 0
    y_rev[n_zero] = 1./y[n_zero]
    rmspe = np.sqrt(np.mean(((y - y_hat)*y_rev)**2))
    return rmspe

#定义一个打分函数
def score(bst, dval):
    y_pred = bst.predict(dval)
#print(y_pred)
    score = rmspe(y_val, y_pred)
#print(score)
    print('The rmspe of the model on validation data set is {:.6f}'.format(score))
    pass

#来看看验证集上的结果
score(model, dval)

The rmspe of the model on validation data set is 0.254764


可以看到这个得分不高，所以下面进行参数调优，主要参考以下文章
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [105]:
#使用sklearn 的gridsearch来调试参数
from sklearn.model_selection import GridSearchCV
param_test = {'max_depth':[3, 5, 10], 'learning_rate':[0.1, 0.2, 0.3], 'n_estimators':range(100, 500, 100)}
gs = GridSearchCV(estimator = xgb.XGBRegressor(
    max_depth = 5, learning_rate = 0.3, n_estimators = 100, objective = 'reg:linear'), 
    param_grid = param_test, cv = 5)
gs.fit(features_pd, sales)
gs.grid_scores_
gs.best_params_
gs.best_score_



0.6011407596178118

In [107]:
print(gs.cv_results_)
print(gs.grid_scores_)
print(gs.best_params_)
print(gs.best_score_)

{'mean_fit_time': array([ 38.29500966,  73.33819566, 109.38071952, 148.69739513,
        60.38451915, 119.01575274, 178.62755346, 237.79561958,
       129.90490026, 255.53009043, 380.68921642, 506.24711437,
        36.43985925,  71.65543413, 106.95870748, 141.88957968,
        59.98844581, 118.56307244, 176.3666811 , 234.58779626,
       128.40726099, 254.86822767, 379.01939301, 504.59557786,
        36.37449689,  71.71901202, 106.60218821, 142.4205204 ,
        59.87384772, 118.55770693, 178.29352837, 237.13244195,
       128.11230216, 253.35254169, 378.36284375, 507.19486341]), 'std_fit_time': array([0.81137969, 1.24270424, 1.06789233, 3.64462602, 0.18730294,
       0.54766789, 0.97548006, 1.40017015, 1.54499309, 1.35011651,
       2.03860987, 1.95390807, 0.08487395, 0.39691678, 0.31383511,
       0.63004823, 0.36147024, 1.10875431, 0.94795429, 0.71441602,
       0.6326455 , 1.67959637, 2.32384139, 2.26955966, 0.12063387,
       0.27920965, 0.47108049, 1.06722675, 0.16054269, 0.61968



那么我们尝试用得到的最佳参数再训练一次

In [112]:
#首先设定参数

param = {'max_depth': 5, 'eta': 0.1, 'silent': 0, 'objective': 'reg:linear'}

#设定迭代次数

num_round = 100
model_op1 = train(param, dtrain, num_round)
score(model_op1, dval)



The training time of the model is: 84s
The rmspe of the model on validation data set is 0.368759


效果并不好，可能是过度强调泛化能力，而欠拟合造成的，尝试增加最大深度和迭代次数，并增加学习率

In [111]:
#首先设定参数

param = {'max_depth': 10, 'eta': 0.3, 'silent': 0, 'objective': 'reg:linear'}

#设定迭代次数

num_round = 300
model_op1 = train(param, dtrain, num_round)
score(model_op1, dval)


The training time of the model is: 513s
The rmspe of the model on validation data set is 0.120045


这个结果还不错！在测试集上跑跑看。

下面进行测试

In [113]:
dtest = xgb.DMatrix(features_test_pd)
sales_pred = model_op1.predict(dtest)
submission = pd.DataFrame({'Id':test_data['Id'], 'Sales': sales_pred})
submission.to_csv('xgboost_submit.csv', index = False)