这个文件专门用来测试异常值下已训练好的模型的稳定性。

In [6]:
import numpy as np
import xgboost as xgb
import pandas as pd
from IPython.display import display #方便对数据进行可视化
import time
import datetime
from matplotlib import pyplot
def rmspe(y, y_hat):
    #由于sales取了对数，这里要还原
    y = np.exp(y)
    y_hat = np.exp(y_hat)
    
    #由于有yi=0的情况，直接计算会得出无穷大的结果，所以需要处理一下
    #定义一个y_rev来表示y的倒数，y为0时y_rev也赋值为0（因为在项目中提到“ Any day and store with 0 sales is ignored in scoring.“）
    y_rev = np.zeros(y.shape, dtype = float)
    n_zero = y != 0
    y_rev[n_zero] = 1./y[n_zero]

    rmspe = np.sqrt(np.mean(((y - y_hat)*y_rev)**2))
    return rmspe


#自定义一个评价函数

def rmspe_feval(y_hat, dy):
    y = dy.get_label()
    rmspe_score = rmspe(y, y_hat)
    return 'rmspe', rmspe_score


#定义训练函数
def train(param, dtrain, dval, num_round, feval_c, stopping):
    train_time = time.clock()

    model = xgb.train(
        param, dtrain, num_round, feval = feval_c, 
        evals = [(dtrain, 'train'), (dval, 'val')], early_stopping_rounds = stopping, 
        verbose_eval = True
    )

    train_time = time.clock() - train_time

    print('The training time of the model is: {:.0f}s'.format(train_time))
    
    return model

#定义一个打分函数
def score(bst, dval):
    y_pred = bst.predict(dval)
#print(y_pred)
    y_val = dval.get_label()
    score = rmspe(y_val, y_pred)
#print(score)
    print('The rmspe of the model on validation data set is {:.6f}'.format(score))
    pass
# 导入训练数据
train_data = pd.read_csv(
    "train.csv", 
    low_memory=False)

store_features = pd.read_csv("store.csv")
#从头开始处理数据

# 导入训练数据
train_data = pd.read_csv(
    "train.csv", 
    low_memory=False)

store_features = pd.read_csv("store.csv")
# 导入测试数据
test_data = pd.read_csv("test.csv")

train_data['year'] = train_data['Date'].apply(lambda x: float(x.split('-')[0]))
train_data['month'] = train_data['Date'].apply(lambda x: float(x.split('-')[1]))
train_data['day'] = train_data['Date'].apply(lambda x: float(x.split('-')[2]))

test_data['year'] = test_data['Date'].apply(lambda x: float(x.split('-')[0]))
test_data['month'] = test_data['Date'].apply(lambda x: float(x.split('-')[1]))
test_data['day'] = test_data['Date'].apply(lambda x: float(x.split('-')[2]))

#此处增加一步，先将验证集划分出来，用训练集的最后2周作为验证集
#打开数据文件，很容易找到，2015/7/18-2015/7/31时间范围内的数据编号为1 - 15611

val_data = train_data[: 15610]
train_data = train_data[15610:]

#只使用Open和销售不为0的数据
val_data = val_data[(val_data['Open'] == 1) & (val_data['Sales'] != 0)]
train_data = train_data[(train_data['Open'] == 1) & (train_data['Sales'] != 0)]


#融合
train_data = pd.merge(train_data, store_features, on = 'Store')
val_data = pd.merge(val_data, store_features, on = 'Store')
test_data = pd.merge(test_data, store_features, on = 'Store')

train_data = train_data.drop('Customers', axis = 1)
sales_train = train_data['Sales']
features_train = train_data.drop('Sales', axis = 1)

val_data = val_data.drop('Customers', axis = 1)
sales_val = val_data['Sales']
features_val = val_data.drop('Sales', axis = 1)

features_test = test_data.drop('Id', axis = 1)

# 对字符串特征进行独热编码
category = ['StoreType', 'Assortment', 'StateHoliday', 'PromoInterval']


#首先将训练集、验证集、测试集合并，然后统一独热编码，然后再拆分为训练集、验证集、测试集

print('合并前：')
print(features_train.shape[0])
print(features_val.shape[0])
print(features_test.shape[0])

data_merge = pd.concat([features_train, features_val], ignore_index = True)
data_merge = pd.concat([data_merge, features_test], ignore_index = True)
data_merge_pd = pd.get_dummies(data_merge, columns = category)

#把时间数据转化为一年中的第几周

data_merge_pd['Date'] = pd.to_datetime(data_merge_pd['Date'])
data_merge_pd['weekofyear'] = data_merge_pd['Date'].dt.weekofyear
data_merge_pd = data_merge_pd.drop('Date', axis = 1)

# 需要把NA/NAN的数据转化成0
data_merge_pd.fillna(0, inplace = True)


#拆分

features_train = data_merge_pd[ : features_train.shape[0]]
features_val = data_merge_pd[features_train.shape[0] : (features_val.shape[0] + features_train.shape[0])]
features_test = data_merge_pd[ (features_val.shape[0] + features_train.shape[0]) : ]

print('合并后：')
print(features_train.shape[0])
print(features_val.shape[0])
print(features_test.shape[0])



#下面将sales做log处理

sales_train_log = np.log(sales_train) #因为去掉了为0的销售数据，所以不用+1了
sales_val_log = np.log(sales_val)

dtrain = xgb.DMatrix(features_train, label = sales_train_log)
dval = xgb.DMatrix(features_val, label = sales_val_log)



合并前：
830918
13420
41088
合并后：
830918
13420
41088


In [7]:
#载入模型

model  = xgb.Booster()
model.load_model('rossmann.model')
score(model, dval)

The rmspe of the model on validation data set is 0.108691


In [8]:
from random import randint
#我们把验证集中1%的数据的‘Day’这一项，改成一个随机的异常值
one_percent = int(0.01 * features_val.shape[0])
temp = np.array(features_val['day'])
for i in range(0, one_percent):
    index = randint(0, features_val.shape[0] - 1)
    temp[index] = randint(1, 10) * 100
    
#features_val = features_val.drop('day', axis = 1)
features_val['day'] = temp


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [9]:
dtrain = xgb.DMatrix(features_train, label = sales_train_log)
dval = xgb.DMatrix(features_val, label = sales_val_log)

In [10]:
score(model, dval)

The rmspe of the model on validation data set is 0.111902


In [13]:
#我们把验证集中10%的数据的‘Day’这一项，改成一个随机的异常值
one_percent = int(0.1 * features_val.shape[0])
temp = np.array(features_val['day'])
for i in range(0, one_percent):
    index = randint(0, features_val.shape[0] - 1)
    temp[index] = randint(1, 10) * 100
    
#features_val = features_val.drop('day', axis = 1)
features_val['day'] = temp



In [14]:
dtrain = xgb.DMatrix(features_train, label = sales_train_log)
dval = xgb.DMatrix(features_val, label = sales_val_log)
score(model, dval)

ValueError: feature_names mismatch: ['Store', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'year', 'month', 'day', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_a', 'Assortment_b', 'Assortment_c', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'PromoInterval_Feb,May,Aug,Nov', 'PromoInterval_Jan,Apr,Jul,Oct', 'PromoInterval_Mar,Jun,Sept,Dec', 'weekofyear'] ['Store', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'year', 'month', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_a', 'Assortment_b', 'Assortment_c', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'PromoInterval_Feb,May,Aug,Nov', 'PromoInterval_Jan,Apr,Jul,Oct', 'PromoInterval_Mar,Jun,Sept,Dec', 'weekofyear', 'day']