In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as plt 
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import lightgbm as lgbm
import warnings
warnings.filterwarnings("ignore")
from itertools import product # 生成笛卡尔积
from sklearn.preprocessing import LabelEncoder
import pickle

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **数据整理部分**

In [None]:
# Switch into the competition's input directory so every CSV can be read by its bare file name.
os.chdir("/kaggle/input/competitive-data-science-predict-future-sales")

# Training sales table; the remaining tables are read in the cells that follow.
sales_train = pd.read_csv("sales_train.csv")
sales_train.head()

In [None]:
# Load items.csv (path is relative to the current working directory); joined onto `train` via item_id later.
items = pd.read_csv("items.csv")

In [None]:
# Load shops.csv (path is relative to the current working directory); shop_name is parsed into features later.
shops = pd.read_csv("shops.csv")

In [None]:
# Load item_categories.csv (path is relative to the current working directory).
item_categories = pd.read_csv("item_categories.csv")

In [None]:
# Load test.csv (path is relative to the current working directory); its rows are appended to `train` later.
test = pd.read_csv("test.csv")

In [None]:
# import pandas_profiling

In [None]:
# Box plot of item_cnt_day to eyeball outliers.
# The series is passed explicitly as `x=` (as the price plot does): newer seaborn
# releases interpret a lone positional argument as `data`, which would draw the box
# along the other axis and make the x-limits below meaningless.
plt.figure(figsize=(10, 4))
plt.xlim(-100, 3000)
sns.boxplot(x=sales_train.item_cnt_day)
plt.show()

In [None]:
# Box plot of item_price; the x-axis spans from the minimum price to 110% of the maximum.
fig, ax = plt.subplots(figsize=(20, 4))
price = sales_train['item_price']
ax.set_xlim(price.min(), price.max() * 1.1)
sns.boxplot(x=price, ax=ax)
plt.show()

去除异常值

In [None]:
# Drop extreme outliers: keep rows priced below 100000 with a daily count below 1001.
price_ok = sales_train['item_price'] < 100000
count_ok = sales_train['item_cnt_day'] < 1001
sales_train = sales_train[price_ok & count_ok]

In [None]:
# Inspect rows with a negative price before they are filtered out.
negative_price_rows = sales_train.loc[sales_train['item_price'] < 0]
print(negative_price_rows)

In [None]:
# Inspect rows with a negative daily count before they are filtered out.
negative_count_rows = sales_train.loc[sales_train['item_cnt_day'] < 0]
print(negative_count_rows)

In [None]:
# Keep only rows with a strictly positive price and a non-negative daily count.
has_positive_price = sales_train['item_price'] > 0
has_valid_count = sales_train['item_cnt_day'] >= 0
sales_train = sales_train[has_positive_price & has_valid_count]

对商店数据集进行处理

In [None]:
# List the distinct shop ids present in the test set, in ascending order.
np.sort(test['shop_id'].unique())

In [None]:
# Rewrite selected shop ids in the training data: 0 -> 57, 1 -> 58, 11 -> 10, 40 -> 39.
# NOTE(review): presumably these pairs are the same physical shop under two ids - confirm against shops.csv.
shop_id_remap = {0: 57, 1: 58, 11: 10, 40: 39}
for old_id, new_id in shop_id_remap.items():
    sales_train.loc[sales_train['shop_id'] == old_id, 'shop_id'] = new_id

In [None]:
# Derive the shop's city and shop type from its name.
# Normalise the name first: lower-case, strip punctuation and digits, trim whitespace.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, so without it nothing would be removed. Raw strings
# avoid the invalid-escape warnings for `\w`, `\s` and `\d`.
shops['shop_name'] = (
    shops['shop_name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)

# The city is taken to be the first space-separated token of the cleaned name.
shops['shop_city'] = shops['shop_name'].str.partition(' ')[0]

# Shop type is the first marker found in the name. Order matters: longer markers
# are tested before the shorter ones they contain (e.g. 'мтрц' before 'трц').
shop_type_markers = ('мтрц', 'трц', 'трк', 'тц', 'тк')
shops['shop_type'] = shops['shop_name'].apply(
    lambda name: next((marker for marker in shop_type_markers if marker in name), 'others')
)
shops.head()

In [None]:
# Integer-encode the city and type strings; each column gets its own freshly fitted encoder.
for source_col in ('shop_city', 'shop_type'):
    encoder = LabelEncoder()
    shops[source_col + '_code'] = encoder.fit_transform(shops[source_col])
shops.head()

商品信息处理

In [None]:
# Split each category name on '-' and keep the pieces for later use.
name_parts = item_categories['item_category_name'].str.split('-')
item_categories['split'] = name_parts

# Subtype is the second piece when there is one; otherwise fall back to the first piece.
item_categories['item_subtype'] = name_parts.map(
    lambda parts: (parts[1] if len(parts) > 1 else parts[0]).strip()
)
print(item_categories['item_subtype'])
item_categories.head()

In [None]:
# Item type is the first piece of the split category name, with surrounding whitespace removed.
item_categories['item_type'] = [parts[0].strip() for parts in item_categories['split']]
print(item_categories['item_type'])

In [None]:
# Integer-encode the type and subtype strings; each column gets its own freshly fitted encoder.
for source_col in ('item_type', 'item_subtype'):
    encoder = LabelEncoder()
    item_categories[source_col + '_code'] = encoder.fit_transform(item_categories[source_col])

item_categories.head()

**数据增强**

收集每一个月中，对应的shop_id和item_id，然后对每个月的商品id和商店id生成笛卡尔积

In [None]:
# For every month, collect the shops and items that appear in that month and build
# their Cartesian product, so each (shop, item, month) combination gets a row.
index_cols = ['shop_id', 'item_id', 'date_block_num']
monthly_grids = []
for block_num in sales_train['date_block_num'].unique():
    in_month = sales_train['date_block_num'] == block_num
    month_shops = sales_train.loc[in_month, 'shop_id'].unique()
    month_items = sales_train.loc[in_month, 'item_id'].unique()
    grid = np.array(list(product(month_shops, month_items, [block_num])), dtype='int32')
    monthly_grids.append(grid)

# Stack the per-month grids into a single int32 frame.
df = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)

In [None]:
# Print the frame summary first, then leave head() as the final expression so the
# notebook actually renders it (a non-final bare expression is silently discarded).
df.info()
df.head()

In [None]:
# Aggregate daily counts into one total per (month, shop, item).
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
train = (
    sales_train
    .groupby(monthly_keys, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
train.head()

In [None]:
# Attach the monthly totals to the full shop x item x month grid.
train = df.merge(train, how='left', on=['shop_id', 'item_id', 'date_block_num'])

# Grid rows with no sales get 0; the target is then clipped into [0, 20] and stored compactly.
monthly_target = train['item_cnt_month'].fillna(0)
monthly_target = monthly_target.clip(0, 20)
train['item_cnt_month'] = monthly_target.astype(np.float16)

print(train)
train.info()

In [None]:
# Append the test rows as month 34 so they receive the same features as the training months.
test['date_block_num'] = 34
cols = ['date_block_num', 'shop_id', 'item_id']

# `keys=` is dropped: it had three entries for two frames and has no effect once
# `ignore_index=True` discards the concatenation index anyway.
train = pd.concat([train, test[cols]], ignore_index=True, sort=False)

# The appended rows have no target yet; fill the resulting NaNs with 0.
train = train.fillna(0)
train.head()

**数据融合**

In [None]:
# Left-join item, category and shop attributes onto every row.
item_lookup = items[['item_id', 'item_category_id']]
category_lookup = item_categories[['item_category_id', 'item_type_code', 'item_subtype_code']]
shop_lookup = shops[['shop_id', 'shop_city_code', 'shop_type_code']]

train = (
    train
    .merge(item_lookup, on=['item_id'], how='left')
    .merge(category_lookup, on=['item_category_id'], how='left')
    .merge(shop_lookup, on=['shop_id'], how='left')
)
train.head()

In [None]:
# Downcast the id/code columns to the smallest integer types used by this notebook to save memory.
compact_dtypes = {
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
    'shop_city_code': np.int8,
    'shop_type_code': np.int8,
    'item_category_id': np.int8,
    'item_type_code': np.int8,
    'item_subtype_code': np.int8,
}
train = train.astype(compact_dtypes)

In [None]:
# Print the column dtypes and memory usage of `train` to check the result of the downcast.
train.info()

In [None]:
# Count missing target values. `isnull()` is an alias of `isna()`, so a single check suffices.
print(train['item_cnt_month'].isna().sum())

**添加月份**

In [None]:
# Month-of-cycle in 1..12, derived from the running month index.
# NOTE(review): assumes date_block_num 0 corresponds to a January - confirm against the raw dates.
month_offset = train['date_block_num'] % 12
train['month'] = month_offset + 1
train.head()

In [None]:
# train.to_csv('/kaggle/working/data1.csv', index=False)

**添加价格特征**

In [None]:
# Average price of each item over the whole training period.
group = sales_train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

train = pd.merge(train, group, on=['item_id'], how='left')
# float32 rather than float16: float16 overflows to inf above 65504, while the
# outlier filter earlier keeps prices up to 100000, and its coarse precision
# distorts the relative price deltas computed from these columns.
train['item_avg_item_price'] = train['item_avg_item_price'].astype(np.float32)

# Average price of each item within each month.
group = sales_train.groupby(['date_block_num', 'item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

train = pd.merge(train, group, on=['date_block_num', 'item_id'], how='left')
train['date_item_avg_item_price'] = train['date_item_avg_item_price'].astype(np.float32)

# 产生延迟信息，lags代表延迟几个月，col代表延迟的信息列名称
def lag_feature(df, lags, col):
    """Attach lagged copies of `col` to `df`, one new column per entry in `lags`.

    For each lag `k`, the value of `col` for the same (shop_id, item_id) at
    `date_block_num - k` is left-joined in as a column named `<col>_lag_<k>`.
    Rows with no matching earlier record get NaN.

    Parameters:
        df: DataFrame with `date_block_num`, `shop_id`, `item_id` and `col` columns.
        lags: iterable of integer offsets, in months.
        col: name of the column to lag.

    Returns:
        A new DataFrame with the lag columns appended; the input frame is not modified.
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[join_keys + [col]]
    for lag in lags:
        lagged_name = col + '_lag_' + str(lag)
        # Move each record `lag` months forward so it lines up with the row it should feed.
        lagged = base.rename(columns={col: lagged_name}).assign(
            date_block_num=lambda frame: frame['date_block_num'] + lag
        )
        df = df.merge(lagged, on=join_keys, how='left')
    return df

# Lag the monthly average price by these month offsets.
lags = [1, 2, 3, 4, 5, 6, 12]
train = lag_feature(train, lags, 'date_item_avg_item_price')

# Relative deviation of each lagged monthly price from the item's overall average price.
for lag in lags:
    lagged_price = train[f'date_item_avg_item_price_lag_{lag}']
    baseline_price = train['item_avg_item_price']
    train[f'delta_price_lag_{lag}'] = (lagged_price - baseline_price) / baseline_price

def select_trend(row):
    """Return the first usable price delta for a row, scanning lags in order.

    Walks the module-level `lags` list and returns the first
    `delta_price_lag_<k>` value that is present (not NaN) and non-zero.
    Returns 0 when no lag has such a value.

    Parameters:
        row: mapping/Series exposing a `delta_price_lag_<k>` entry for every k in `lags`.
    """
    for i in lags:
        delta = row['delta_price_lag_' + str(i)]
        # NaN is truthy in Python, so a plain `if delta:` would return a missing
        # first lag instead of falling through to the next one; rule it out explicitly.
        if pd.notna(delta) and delta != 0:
            return delta
    return 0

# Price trend of the item over the past months: one delta per row, chosen by select_trend.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the
# column selection, which is a chained in-place call that is not guaranteed to
# write through to `train` (and does not under pandas Copy-on-Write).
train['delta_price_lag'] = (
    train.apply(select_trend, axis=1)
    .astype(np.float16)
    .fillna(0)
)

# Drop the intermediate price columns; only the combined trend feature is kept.
features_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_' + str(i)]
    features_to_drop += ['delta_price_lag_' + str(i)]

train = train.drop(columns=features_to_drop)
print(train)
train.info()

**保存数据**

In [None]:
# Persist the engineered feature table so the modelling cells can reload it from disk.
train.to_pickle('/kaggle/working/data_simple.pkl')

# **模型训练**

In [None]:
# Load the feature table written by the data-preparation section.
data = pd.read_pickle('/kaggle/working/data_simple.pkl')

In [None]:
# Time-based split: months before 33 train, month 33 validates, month 34 is the test set.
target_col = 'item_cnt_month'
is_train = data['date_block_num'] < 33
is_valid = data['date_block_num'] == 33
is_test = data['date_block_num'] == 34

X_train = data.loc[is_train].drop(columns=[target_col])
Y_train = data.loc[is_train, target_col]
X_valid = data.loc[is_valid].drop(columns=[target_col])
Y_valid = data.loc[is_valid, target_col]
X_test = data.loc[is_test].drop(columns=[target_col])

In [None]:
# Wrap the train/validation splits in LightGBM datasets.
train_data = lgbm.Dataset(data=X_train, label=Y_train)
valid_data = lgbm.Dataset(data=X_valid, label=Y_valid)

# Hyper-parameters. The boosting-round budget and the early-stopping patience are
# supplied through `params`.
# NOTE(review): `bagging_fraction` looks inert without a non-zero `bagging_freq` -
# confirm against the LightGBM parameter docs before relying on it.
params = {"objective" : "regression",
          "metric" : "rmse",
          'n_estimators':10000,
          'early_stopping_rounds':50,
          "num_leaves" : 2**7-1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,
          "feature_fraction" : 0.3,
          "bagging_seed" : 0
         }

# Train the model. Evaluation results are logged every 1000 rounds through the
# `log_evaluation` callback; the `verbose_eval` keyword was removed from
# `lgbm.train` in LightGBM 4.0.
lgbm_model = lgbm.train(params,
                        train_data,
                        valid_sets=[train_data, valid_data],
                        callbacks=[lgbm.log_evaluation(period=1000)])

# Save the trained model.
with open('/kaggle/working/lgbm_model0.pkl', 'wb') as handle:
    pickle.dump(lgbm_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved model if it exists on disk.
# The path must match the one written by the training cell; the previous
# '.pkl.pkl' suffix never matched, so the reload was silently skipped.
# NOTE: pickle.load executes arbitrary code - only load files this notebook produced.
model_path = '/kaggle/working/lgbm_model0.pkl'
if os.path.isfile(model_path):
    with open(model_path, 'rb') as fin:
        lgbm_model = pickle.load(fin)

In [None]:
# Predict the month-34 rows and clip the predictions into [0, 20], the same range as the training target.
raw_predictions = lgbm_model.predict(X_test)
Y_test = np.clip(raw_predictions, 0, 20)

# Build the submission: one row per test row, identified by the test frame's index.
submission = pd.DataFrame({"ID": test.index, "item_cnt_month": Y_test})

submission.to_csv('/kaggle/working/lgbm_submission0.csv', index=False)