这个文件专门用来测试异常值下已训练好的模型的稳定性。

In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
from IPython.display import display #方便对数据进行可视化
from sklearn.model_selection import train_test_split
# Rebuild the model inputs from the raw CSV files.

# Load the training rows, the per-store attributes and the test rows.
train_data = pd.read_csv("train.csv", low_memory=False)
store_features = pd.read_csv("store.csv")
test_data = pd.read_csv("test.csv")

# Attach the store attributes to every row of both data sets.
train_data = pd.merge(train_data, store_features, on='Store')
test_data = pd.merge(test_data, store_features, on='Store')

# 'Customers' is removed from the training frame; 'Sales' is the target and
# is kept out of the feature frame.
train_data = train_data.drop(columns='Customers')
sales = train_data['Sales']
features = train_data.drop(columns='Sales')

# String-valued columns that get one-hot encoded.
category = ['StoreType', 'Assortment', 'PromoInterval', 'StateHoliday']

# Stack train and test features so that both receive the same set of dummy
# columns, encode once, and split the frame apart again further down.
features_test = test_data.drop(columns='Id')
data_merge = pd.concat([features, features_test], ignore_index=True)
data_merge_pd = pd.get_dummies(data_merge, columns=category)

# Missing values (NA/NaN) are replaced with 0.
data_merge_pd.fillna(0, inplace=True)

# Split the 'Date' string on '-' and store its three parts as float columns
# named 'year', 'month' and 'day' (in that order), then drop 'Date' itself.
for position, part_name in enumerate(('year', 'month', 'day')):
    data_merge_pd[part_name] = data_merge_pd['Date'].apply(
        lambda text, position=position: float(text.split('-')[position]))
data_merge_pd = data_merge_pd.drop(columns='Date')

# Undo the stacking: the first block of rows is the training set, the
# remainder is the test set.
n_train_rows = features.shape[0]
features_pd = data_merge_pd.iloc[:n_train_rows]
features_test_pd = data_merge_pd.iloc[n_train_rows:]

# Train on log(Sales + 1); the +1 avoids log(0) and makes the transformed
# target start at 0.
sales_log = np.log(sales + 1)

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    features_pd,
    sales_log,
    test_size=0.2,
    random_state=99,
)



In [2]:
# Wrap both splits in xgboost DMatrix objects; the validation one is what
# the scoring helper feeds to the booster.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

In [3]:
# Load the previously trained booster from disk.
model = xgb.Booster(model_file='rossmann.model')

def rmspe(y, y_hat):
    """Return the root mean square percentage error of ``y_hat`` against ``y``.

    Both arguments are expected in the transformed space ``log(sales + 1)``;
    they are mapped back with ``exp(.) - 1`` before the error is computed.

    Entries whose actual value is 0 would make the percentage error
    infinite, so their reciprocal is taken to be 0 (the competition states
    "Any day and store with 0 sales is ignored in scoring."). Such entries
    therefore contribute zero error, but they are still counted by the mean.
    """
    # Undo the log transform applied to the sales target.
    actual = np.exp(y) - 1
    predicted = np.exp(y_hat) - 1

    # Reciprocal of the actual values, left at 0 wherever the actual is 0.
    inverse_actual = np.zeros(actual.shape, dtype=float)
    nonzero = actual != 0
    inverse_actual[nonzero] = 1. / actual[nonzero]

    relative_error = (actual - predicted) * inverse_actual
    return np.sqrt(np.mean(relative_error ** 2))

# Scoring helper.
def score(bst, dval):
    """Print the predictions of ``bst`` on ``dval`` and the resulting RMSPE.

    ``bst`` must provide ``predict(dval)`` and ``dval`` must provide
    ``get_label()``. The raw predictions, the raw RMSPE value and a
    formatted summary line are printed; nothing is returned.
    """
    predictions = bst.predict(dval)
    print(predictions)
    labels = dval.get_label()
    error = rmspe(labels, predictions)
    print(error)
    print('The rmspe of the model on validation data set is {:.6f}'.format(error))

# Baseline: score the loaded model on the validation DMatrix before any
# outliers are injected into X_val.
score(model, dval)

[ 9.092271e+00  8.675215e+00 -2.974093e-03 ...  9.222454e+00  8.534106e+00
  9.355307e-03]
0.07235972307593926
The rmspe of the model on validation data set is 0.072360


In [4]:
from random import randint, sample

# Replace the 'day' feature of 1% of the validation rows with a random
# outlier: one of 100, 200, ..., 1000, which no real day-of-month can take.
one_percent = int(0.01 * X_val.shape[0])
temp = np.array(X_val['day'])
# sample() draws distinct row positions, so exactly `one_percent` rows are
# changed; independent randint() draws could pick the same row more than
# once and corrupt fewer rows than intended.
for index in sample(range(X_val.shape[0]), one_percent):
    temp[index] = randint(1, 10) * 100

# Swap the column through drop + re-add so that X_val is rebound to a new
# frame that carries the perturbed values.
X_val = X_val.drop('day', axis=1)
X_val['day'] = temp


In [5]:
# Rebuild the DMatrix objects from the current X_train / X_val so that the
# validation matrix reflects the present contents of X_val.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

In [6]:
# Score the same model again, now on the rebuilt validation DMatrix.
score(model, dval)

[ 9.092271e+00  8.675215e+00 -2.974093e-03 ...  9.222454e+00  8.534106e+00
  9.355307e-03]
0.07610700502446097
The rmspe of the model on validation data set is 0.076107


In [7]:
from random import sample

# Replace the 'day' feature of a further 1% of the validation rows with a
# random outlier (100, 200, ..., 1000) and score the model again. This works
# on the current X_val, so any outliers already present in it are kept.
# NOTE(review): the original comment here said 10%, but the factor below is
# 0.01 and the recorded score moves by about as much as for the first 1%
# step. If 10% was intended, change the factor to 0.10.
one_percent = int(0.01 * X_val.shape[0])
temp = np.array(X_val['day'])
# sample() draws distinct row positions within this step; independent
# randint() draws could pick the same row more than once. Positions that
# already hold an outlier may still be drawn again.
for index in sample(range(X_val.shape[0]), one_percent):
    temp[index] = randint(1, 10) * 100

# Swap the column through drop + re-add so that X_val is rebound to a new
# frame that carries the perturbed values.
X_val = X_val.drop('day', axis=1)
X_val['day'] = temp

# Rebuild the matrices from the updated frame and re-score.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
score(model, dval)

[ 9.092271e+00  8.675215e+00 -2.974093e-03 ...  9.222454e+00  8.534106e+00
  9.355307e-03]
0.07998797411602782
The rmspe of the model on validation data set is 0.079988
