In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the five competition tables: daily sales, the rows to predict,
# and the item / item-category / shop lookup tables.
train_set, validation_set, items_data, item_categories_data, shops_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/test.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
    )
)

In [None]:
# Preview the layout of the training data (first 10 rows).
train_set.head(n=10)

In [None]:
# Count missing values per column.
train_set.isna().sum()

In [None]:
# Box plots of the two numeric columns, stacked vertically, to eyeball outliers.
# Each panel's y-range runs from the column minimum to 1.2x its maximum.
ax1 = plt.subplot(2, 1, 1)
ax1.set_ylim(train_set.item_cnt_day.min(), train_set.item_cnt_day.max() * 1.2)
ax1.boxplot(x=train_set.item_cnt_day)
ax1.set_xlabel('item_cnt_day')

ax2 = plt.subplot(2, 1, 2)
ax2.set_ylim(train_set.item_price.min(), train_set.item_price.max() * 1.2)
ax2.boxplot(x=train_set.item_price)
ax2.set_xlabel('item_price')

In [None]:
# The box plots show outliers in both item_cnt_day and item_price.
# Per the author they make up only about 0.25% of the rows, so they are dropped:
# keep rows with price below 100000 and a daily count in [0, 1001).
train_set = train_set.loc[
    lambda df: (df['item_price'] < 100000)
    & (df['item_cnt_day'] >= 0)
    & (df['item_cnt_day'] < 1001)
]

In [None]:
# Build `combine`: for every month, the full cross product of the shops and
# the items that have at least one sale in that month.
#
# The grid is built with vectorised numpy calls instead of a triple nested
# Python loop that appended one tiny array per (month, shop, item) triple.
# Row order is unchanged: shops vary slowest, items fastest, within each month.
cols = ['date_block_num', 'shop_id', 'item_id']
monthly_grids = []
for month in range(34):
    sales = train_set[train_set.date_block_num == month]
    shop_ids = sales.shop_id.unique()
    item_ids = sales.item_id.unique()
    n_shops, n_items = len(shop_ids), len(item_ids)
    monthly_grids.append(np.column_stack([
        np.full(n_shops * n_items, month),  # constant month column
        np.repeat(shop_ids, n_items),       # s0, s0, ..., s1, s1, ...
        np.tile(item_ids, n_shops),         # i0, i1, ..., i0, i1, ...
    ]))
# Stack the per-month grids vertically into one frame.
combine = pd.DataFrame(np.vstack(monthly_grids), columns=cols)

In [None]:
# `grouped` holds the monthly sales total for every (item, shop, month)
# combination that actually occurs in the training data.
# Not every item is sold in every shop every month, so this is sparser
# than the full grid in `combine`.
grouped = (
    train_set
    .groupby(['item_id', 'shop_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)
grouped.head()

In [None]:
# Attach the monthly totals to the full grid. A combination with no match
# had no sales that month, so its count is filled with 0.
# The monthly count is then clipped to the range [0, 20].
combine = pd.merge(
    combine,
    grouped,
    how='left',
    on=['item_id', 'shop_id', 'date_block_num'],
)
combine['item_cnt_month'] = combine['item_cnt_month'].fillna(0).clip(lower=0, upper=20)
combine.head()

In [None]:
# Join the item table onto the grid (left join on item_id).
combine = combine.merge(items_data, how='left', on='item_id')

In [None]:
def ItemCatSplit(x):
    """Return the part of a category name before the first ' - ' separator.

    If the name contains no ' - ' separator, the whole name is returned
    unchanged.
    """
    head, _sep, _tail = x.partition(' - ')
    return head
# Split each category name to get its top-level category and store it
# as the new feature 'Cat'.
item_categories_data['Cat'] = item_categories_data['item_category_name'].map(ItemCatSplit)

In [None]:
def ShopNameSplit(x):
    """Return the text of a shop name up to (not including) its first space.

    A name without any space is returned unchanged.
    """
    location, _sep, _rest = x.partition(' ')
    return location
# Take the first word of each shop name as the shop's location and store it
# as the new feature 'Location'.
shops_data['Location'] = shops_data['shop_name'].map(ShopNameSplit)

In [None]:
# Join the category and shop tables onto the grid, then drop the free-text
# name columns; the result is the `train` table.
train = (
    combine
    .merge(item_categories_data, on='item_category_id', how='left')
    .merge(shops_data, on='shop_id', how='left')
    .drop(['item_category_name', 'item_name', 'shop_name'], axis=1)
)
train.head()

In [None]:
# Print the number of distinct values in every column of `train`
# (NaN, if present, counts as one value).
col = train.columns.tolist()
for feature in col:
    print('特征%s的种类有%d种' % (feature, train[feature].nunique(dropna=False)))

In [None]:
# Shape the competition test rows like the training table: join the same
# lookup tables, drop the free-text name columns, and tag every row with
# month index 34.
test = (
    validation_set
    .merge(items_data, how='left', on='item_id')
    .merge(item_categories_data, how='left', on='item_category_id')
    .merge(shops_data, how='left', on='shop_id')
    .drop(['item_name', 'item_category_name', 'shop_name'], axis=1)
    .assign(date_block_num=34)
)
test.head()

In [None]:
# Stack train and test into one table, `train_test`, so that feature
# engineering can be applied to both at once.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
# the row labels of `test` would duplicate labels already used by `train`.
train_test = pd.concat([train, test], ignore_index=True)
train_test.isnull().sum()

可以发现 train_test 表中的 item_cnt_month 和 ID 两列存在缺失值。  
这是因为训练集 train 中没有 ID 这一列，  
而测试集 test 中没有 item_cnt_month 这一列，合并后对应位置会被填充为 NaN。

In [None]:
# Convert the text labels 'Cat' and 'Location' into numeric ordinal codes.
# Columns are selected by name rather than by position, so the encoding
# cannot silently hit the wrong column if the column order ever changes.
# The double brackets keep the input 2-D, as fit_transform requires;
# ravel() flattens the (n, 1) result into a plain column.
oer = OrdinalEncoder()
oe = oer.fit_transform(train_test[['Cat']])
train_test['Cat'] = oe.ravel()
oe = oer.fit_transform(train_test[['Location']])
train_test['Location'] = oe.ravel()

In [None]:
# Feature shop_last_month_mean: each shop's mean monthly sales in the
# previous month, used as a feature of the current month.
# Shifting date_block_num by +1 lines month m's mean up with month m+1's rows;
# rows with no previous-month value get 0.
grouped = (
    train_test
    .groupby(['shop_id', 'date_block_num'])['item_cnt_month']
    .mean()
    .rename('shop_last_month_mean')
    .reset_index()
)
grouped['date_block_num'] += 1
train_test = pd.merge(train_test, grouped, how='left', on=['shop_id', 'date_block_num'])
train_test['shop_last_month_mean'] = train_test['shop_last_month_mean'].fillna(0)

In [None]:
# Feature item_last_month_mean: each item's mean monthly sales in the
# previous month, used as a feature of the current month.
# Shifting date_block_num by +1 lines month m's mean up with month m+1's rows;
# rows with no previous-month value get 0.
grouped = (
    train_test
    .groupby(['item_id', 'date_block_num'])['item_cnt_month']
    .mean()
    .rename('item_last_month_mean')
    .reset_index()
)
grouped['date_block_num'] += 1
train_test = pd.merge(train_test, grouped, how='left', on=['item_id', 'date_block_num'])
train_test['item_last_month_mean'] = train_test['item_last_month_mean'].fillna(0)

In [None]:
# Feature category_last_month_mean: each item category's mean monthly sales
# in the previous month.
# Shifting date_block_num by +1 lines month m's mean up with month m+1's rows;
# rows with no previous-month value get 0.
grouped = (
    train_test
    .groupby(['item_category_id', 'date_block_num'])['item_cnt_month']
    .mean()
    .rename('category_last_month_mean')
    .reset_index()
)
grouped['date_block_num'] += 1
train_test = pd.merge(train_test, grouped, how='left', on=['item_category_id', 'date_block_num'])
train_test['category_last_month_mean'] = train_test['category_last_month_mean'].fillna(0)

In [None]:
# Feature shop_3month_ago_mean: each shop's mean monthly sales three months
# earlier, used as a feature of the current month.
# Shifting date_block_num by +3 lines month m's mean up with month m+3's rows;
# rows with no value three months back get 0.
grouped = (
    train_test
    .groupby(['shop_id', 'date_block_num'])['item_cnt_month']
    .mean()
    .rename('shop_3month_ago_mean')
    .reset_index()
)
grouped['date_block_num'] += 3
train_test = pd.merge(train_test, grouped, how='left', on=['shop_id', 'date_block_num'])
train_test['shop_3month_ago_mean'] = train_test['shop_3month_ago_mean'].fillna(0)

In [None]:
# Feature item_3month_ago_mean: each item's mean monthly sales three months
# earlier, used as a feature of the current month.
# Shifting date_block_num by +3 lines month m's mean up with month m+3's rows;
# rows with no value three months back get 0.
grouped = (
    train_test
    .groupby(['item_id', 'date_block_num'])['item_cnt_month']
    .mean()
    .rename('item_3month_ago_mean')
    .reset_index()
)
grouped['date_block_num'] += 3
train_test = pd.merge(train_test, grouped, how='left', on=['item_id', 'date_block_num'])
train_test['item_3month_ago_mean'] = train_test['item_3month_ago_mean'].fillna(0)

In [None]:
# Feature category_3month_ago_mean: each item category's mean monthly sales
# three months earlier, used as a feature of the current month.
# Shifting date_block_num by +3 lines month m's mean up with month m+3's rows;
# rows with no value three months back get 0.
grouped = (
    train_test
    .groupby(['item_category_id', 'date_block_num'])['item_cnt_month']
    .mean()
    .rename('category_3month_ago_mean')
    .reset_index()
)
grouped['date_block_num'] += 3
train_test = pd.merge(train_test, grouped, how='left', on=['item_category_id', 'date_block_num'])
train_test['category_3month_ago_mean'] = train_test['category_3month_ago_mean'].fillna(0)

In [None]:
# `validation` is the final set to predict: the rows tagged with month 34.
validation = train_test.loc[train_test['date_block_num'] == 34]

In [None]:
# Hold out the last labelled month (33) for evaluation and train on every
# earlier month. The target and the submission ID are excluded from the
# feature matrices.
is_train = train_test['date_block_num'] < 33
is_holdout = train_test['date_block_num'] == 33
non_features = ['item_cnt_month', 'ID']

train_x = train_test.loc[is_train].drop(columns=non_features).values
test_x = train_test.loc[is_holdout].drop(columns=non_features).values
train_y = train_test.loc[is_train, 'item_cnt_month'].values
test_y = train_test.loc[is_holdout, 'item_cnt_month'].values

In [None]:
rmse_list = []  # collects one hold-out RMSE per max_depth candidate

In [None]:
# Sweep max_depth over 5..14 and record the hold-out RMSE of each tree,
# printing the wall-clock time of every fit.
# The accumulator is reset inside this cell so that re-running it cannot
# append to results left over from an earlier run, which would leave the list
# longer than the ten depths tried.
rmse_list = []
for md in range(5, 15):
    t = datetime.now()
    tree = DecisionTreeRegressor(max_depth=md, min_samples_leaf=5, random_state=42)
    tree.fit(train_x, train_y)
    pred_y = tree.predict(test_x)
    rmse = np.sqrt(mean_squared_error(test_y, pred_y))
    rmse_list.append(rmse)
    delta = datetime.now() - t
    print(f'用时{delta}，rmse为{rmse}')

In [None]:
# Hold-out RMSE against max_depth. Axis labels are set so the figure can be
# read on its own.
# In the author's run, max_depth=11 was the best value.
plt.plot(np.arange(5, 15), rmse_list)
plt.xlabel('max_depth')
plt.ylabel('RMSE')

In [None]:
rmse_list = []  # collects one hold-out RMSE per min_samples_leaf candidate

In [None]:
# Sweep min_samples_leaf over 1..9 with max_depth fixed at 11 and record the
# hold-out RMSE of each tree, printing the wall-clock time of every fit.
# The accumulator is reset inside this cell so that re-running it cannot
# append to results left over from an earlier run, which would leave the list
# longer than the nine values tried.
rmse_list = []
for msl in range(1, 10):
    t = datetime.now()
    tree = DecisionTreeRegressor(max_depth=11, min_samples_leaf=msl, random_state=42)
    tree.fit(train_x, train_y)
    pred_y = tree.predict(test_x)
    rmse = np.sqrt(mean_squared_error(test_y, pred_y))
    rmse_list.append(rmse)
    delta = datetime.now() - t
    print(f'用时{delta}，rmse为{rmse}')

In [None]:
# Hold-out RMSE against min_samples_leaf. Axis labels are set so the figure
# can be read on its own.
# In the author's run, min_samples_leaf=7 was the best value.
plt.plot(range(1, 10), rmse_list)
plt.xlabel('min_samples_leaf')
plt.ylabel('RMSE')

In [None]:
# Prepare the prediction set: keep the submission IDs aside, then strip the
# (empty) target and the ID so only the feature columns remain.
ID = validation.loc[:, 'ID']
validation = validation.drop(['item_cnt_month', 'ID'], axis=1)
validation_x = validation.values

In [None]:
# Refit on every labelled month (0..33) with the chosen hyper-parameters,
# then predict the month-34 rows.
labelled = train_test[train_test['date_block_num'] <= 33]
train_x = labelled.drop(columns=['item_cnt_month', 'ID']).values
train_y = labelled['item_cnt_month'].values

tree = DecisionTreeRegressor(max_depth=11, min_samples_leaf=7, random_state=42)
tree.fit(train_x, train_y)
Yhat = tree.predict(validation_x)

In [None]:
# Write the submission file: integer IDs paired with the predicted monthly
# counts, without the DataFrame index.
ID = ID.astype('int64')
Submission = pd.DataFrame(dict(ID=ID, item_cnt_month=Yhat))
Submission.to_csv('Submission for PFS.csv', index=False)