In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

import gc
import time
import pickle
import seaborn as sns
from tqdm import tqdm
from itertools import product
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn import preprocessing 
from xgboost import plot_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

def plot_features(booster, figsize):
    """Draw a feature-importance chart for ``booster`` on a fresh figure.

    Args:
        booster: Fitted model or booster accepted by ``plot_importance``.
        figsize: ``(width, height)`` forwarded to ``plt.subplots``.

    Returns:
        The value returned by ``plot_importance`` for the axes it drew on.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_plot = plot_importance(booster=booster, ax=axis)
    return importance_plot

def downcast_dtypes(df):
    """Shrink wide numeric columns of ``df`` in place to save memory.

    ``float64`` columns are converted to ``float16`` and ``int64``/``int32``
    columns to ``int16``. A column is converted only when every value fits in
    the narrower type; otherwise it is left untouched, because a blind cast
    would silently wrap integers around or turn large floats into ``inf``.

    ``float16`` keeps only about three significant decimal digits, so the
    values of converted float columns are rounded.

    Args:
        df: DataFrame to modify; the converted columns are replaced in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    float_limit = np.finfo(np.float16).max
    int_limits = np.iinfo(np.int16)

    def _float_fits(col):
        # NaN / inf survive the cast unchanged, so only finite values matter.
        values = col.to_numpy()
        finite = values[np.isfinite(values)]
        return finite.size == 0 or np.abs(finite).max() <= float_limit

    def _int_fits(col):
        return col.empty or (
            col.min() >= int_limits.min and col.max() <= int_limits.max
        )

    float_cols = [
        c for c in df if df[c].dtype == "float64" and _float_fits(df[c])
    ]
    int_cols = [
        c for c in df if df[c].dtype in ["int64", "int32"] and _int_fits(df[c])
    ]

    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float16)
    if int_cols:
        df[int_cols] = df[int_cols].astype(np.int16)
    return df

# Show the Python version of the running kernel as the cell's output.
import sys
sys.version_info

## 读入保存好的数据

In [None]:
# Load the previously saved feature table.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
DATA_PATH = '/kaggle/input/all-datta5/data2.pkl'
data = pd.read_pickle(DATA_PATH)

# Disabled experiment: restrict `data` to a hand-picked subset of columns.
# Only the first four names were active; the rest were commented out as well.
# data = data[[
#     'date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
# #     'shop_city', 'shop_name1', 'shop_type',
# #     'name_1', 'name_2', 'name_3',
# #     'item_type', 'item_subtype', 'item_category_id',
# #     'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
# #     'date_block_num_avg_item_cnt_lag_1',
# #     'date_block_num_and_item_id_avg_item_cnt_lag_1',
# #     'date_block_num_and_item_id_avg_item_cnt_lag_2',
# #     'date_block_num_and_item_id_avg_item_cnt_lag_3',
# #     'date_block_num_and_shop_id_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_id_avg_item_cnt_lag_2',
# #     'date_block_num_and_shop_id_avg_item_cnt_lag_3',
# #     'date_block_num_and_shop_city_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_name1_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_type_avg_item_cnt_lag_1',
# #     'date_block_num_and_item_category_id_avg_item_cnt_lag_1',
# #     'date_block_num_and_item_type_avg_item_cnt_lag_1',
# #     'date_block_num_and_item_subtype_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_id_and_item_id_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_id_and_name_1_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_id_and_name_2_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_id_and_name_3_avg_item_cnt_lag_1',
# #     'date_block_num_and_shop_id_and_item_category_id_avg_item_cnt_lag_1',
# #     'delta_price_lag', 'item_shop_last_sale', 'item_last_sale',
# #     'item_first_sale', 'year', 'month', 'days'
# ]]

# Quick look at the table: its dimensions, then the first three rows.
print(data.shape)
data.head(3)

In [None]:
# Average monthly sales per shop (shop_mean_months): for each month, the total
# of `item_cnt_month` divided by the number of distinct shops seen that month.
a = []
for month in range(3, 34):
    month_rows = data[data.date_block_num == month]  # rows of this month only
    shop_count = len(month_rows.shop_id.unique())    # distinct shops this month
    if shop_count == 0:
        # No rows for this month: skip it rather than append 0/0 = NaN, which
        # would also leave `a` with more entries than there are months to plot.
        continue
    # Sum only the target column (not every column of the frame) and do it in
    # float64 so a narrow float dtype cannot overflow or lose precision.
    total_sales = month_rows['item_cnt_month'].astype(np.float64).sum()
    a.append(total_sales / shop_count)
print(a)

In [None]:
# Number of monthly averages collected in `a` (checked before plotting them).
len(a)

In [None]:
import matplotlib.pyplot as plt
# NOTE(review): originally added "for Chinese text support". This star import
# floods the namespace with numpy/pyplot names and can shadow existing ones; it
# is kept only because other cells may rely on it -- consider removing it.
from pylab import *

# One x value per entry of `a`. The monthly series ends at month 33, so the
# first month is derived from the list length instead of being hard-coded;
# this keeps x and y the same length however many months `a` contains.
months = range(34 - len(a), 34)

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.plot(months, a, marker='o', mec='r', mfc='w', label='shop_mean_month')
ax.legend()                       # show the legend
ax.set_xlabel('month')            # x-axis label
ax.set_ylabel("shop_mean_month")  # y-axis label
ax.set_title("shop_mean_month")   # title

plt.show()

In [None]:
# Weekday-count features: for every month (date_block_num) count how many days
# fall on each of the seven weekday slots week0..week6, then join the counts
# onto `data`.
# Distinct local names are used on purpose so this cell does not overwrite the
# list `a` built by the monthly-average cell.
DAYS_PER_MONTH = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]  # February fixed at 28
week_cols = ['week0', 'week1', 'week2', 'week3', 'week4', 'week5', 'week6']

month_rows = []
weekday = 2      # weekday slot of the very first day covered
block_num = 0    # running month index, 0..35 over the three years
for _year in range(3):
    for n_days in DAYS_PER_MONTH:
        counts = [0] * 7
        for _ in range(n_days):
            counts[weekday] += 1
            weekday = (weekday + 1) % 7   # wrap from slot 6 back to slot 0
        month_rows.append(counts + [block_num])
        block_num += 1

weekarr = pd.DataFrame(np.vstack(month_rows), columns=week_cols + ['date_block_num'])
data = pd.merge(data, weekarr, on=['date_block_num'], how='left')  # attach the counts
del weekarr
gc.collect()

In [None]:
# Dataset split, part 1: feature matrix for all rows plus the label vectors.
TARGET = 'item_cnt_month'
block = data.date_block_num

X_zong = data.drop(columns=[TARGET])      # every row, label column removed
Y_train = data.loc[block < 33, TARGET]    # training labels: months before 33
Y_valid = data.loc[block == 33, TARGET]   # validation labels: month 33

# Drop the full table (and the helper column reference) to free memory.
del data, block
gc.collect()

In [None]:
from sklearn.preprocessing import MinMaxScaler  # min-max normalisation
minMax = MinMaxScaler()

# Everything except the trailing seven columns (week0..week6) is scaled.
feature_cols = X_zong.columns[:-7]

# fit_transform already returns one 2-D array, so it is wrapped directly;
# passing it through np.vstack would split it into per-row pieces and stack
# them again, costing time and memory without changing a single value.
X_zong_std = pd.DataFrame(
    minMax.fit_transform(X_zong.iloc[:, :-7]),
    columns=feature_cols,
    index=X_zong.index,
)

# Downcast to save memory, then assign by column label. Label assignment
# replaces the columns outright, so the narrower dtype is kept regardless of
# how positional `.iloc` assignment treats dtypes in the installed pandas.
X_zong[feature_cols] = downcast_dtypes(X_zong_std)

# The seven excluded columns hold small day counts; int8 is enough for them.
week_cols = ['week0', 'week1', 'week2', 'week3', 'week4', 'week5', 'week6']
X_zong[week_cols] = X_zong[week_cols].astype(np.int8)

del X_zong_std
gc.collect()

In [None]:
# Dataset split, part 2: cut the feature matrix by month.
# `date_block_num` has been min-max scaled, so the month boundaries are taken
# from the column itself instead of a hard-coded rounded float: the largest
# value is the test month and the next one down is the validation month.
# Scaling is monotonic, so the month order is preserved.
# NOTE(review): assumes the month just below the latest one is month 33, which
# is how Y_train / Y_valid were cut -- confirm against the data.
block = X_zong.date_block_num
test_block = block.max()
valid_block = block[block < test_block].max()

X_train = X_zong[block < valid_block]
X_valid = X_zong[block == valid_block]
X_test = X_zong[block == test_block]
del X_zong, block
gc.collect()

## 模型的训练--XGBoost（后续可尝试 lightgbm、catboost，以及集成（堆叠））

In [None]:
# Hyper-parameters; adjust freely.
xgb_params = dict(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5,
    colsample_bytree=0.9,
    subsample=0.8,
    eta=0.1,
    seed=1,
)
# RMSE is reported on both the training and the validation set.
eval_pairs = [(X_train, Y_train), (X_valid, Y_valid)]

ts = time.time()

model = XGBRegressor(**xgb_params)

# NOTE(review): newer xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit() -- confirm
# against the installed version.
model.fit(
    X_train,
    Y_train,
    eval_metric="rmse",
    eval_set=eval_pairs,
    verbose=True,
    early_stopping_rounds=10,
)

# Elapsed training time in seconds (shown as the cell output).
time.time() - ts

In [None]:
# Plot the trained model's feature importances on a 10x14 figure.
plot_features(model, (10,14))

## 保存为自己所需要的数据

In [None]:
# Predict the test rows and clamp every prediction to the range [0, 20].
Y_test = np.clip(model.predict(X_test), 0, 20)

# test.csv supplies the submission IDs.
# NOTE(review): assumes the rows of X_test are in the same order as the IDs in
# test.csv -- confirm.
test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')
test = test.set_index('ID')

submission_columns = {
    "ID": test.index,
    "item_cnt_month": Y_test,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('xgb_submission1.csv', index=False)