sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.   
test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.   
sample_submission.csv - a sample submission file in the correct format.   
items.csv - supplemental information about the items/products.   
item_categories.csv  - supplemental information about the items categories.   
shops.csv- supplemental information about the shops.

教训，将数据类型改为int8会导致负数的出现

<style>
code, kbd, pre, samp {
    font-family:'consolas', Lucida Console, SimSun, Fira Code, Monaco !important;
    font-size: 11pt !important;
}

div.output_area pre {
    font-family: 'consolas', Lucida Console, SimSun, Fira Code, Monaco !important;
    font-size:  10pt !important;
}

div.output_area img, div.output_area svg {
    background-color: #FFFFFF !important;
}
</style>

In [1]:
import numpy as np
import pandas as pd
import time
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
from sklearn.preprocessing import LabelEncoder
import gc
from time_series_pipeline import *
from tqdm import tqdm
from scipy import stats
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn import preprocessing, metrics
from sklearn.model_selection import TimeSeriesSplit
from itertools import product

items, shops, cats, train, test = load_data()

items has 22170 rows and 3 columns
shops has 60 rows and 2 columns
cats has 84 rows and 2 columns
train has 2935849 rows and 6 columns
test has 214200 rows and 2 columns
loading data costs 1.33 seconds


In [2]:
def data_transform(items, shops, cats, train, test):
    '''data transformation
    '''
    start = time.time()
    train = train[(train['item_price'] < 300000 ) & (train['item_cnt_day'] < 1000)]
    train = train[train['item_price'] > 0]
    median = train[(train['shop_id'] == 32)&(train['item_id'] == 2973)&(train['date_block_num'] == 4)&(train['item_price'] > 0)].item_price.median()
    train.loc[train['item_price'] < 0, 'item_price'] = median
    train.loc[train['item_cnt_day'] < 1, 'item_cnt_day'] = 0
    train.loc[train['shop_id'] == 0, 'shop_id'] = 57
    test.loc[test['shop_id'] == 0, 'shop_id'] = 57
    # Якутск ТЦ "Центральный"
    train.loc[train['shop_id'] == 1, 'shop_id'] = 58
    test.loc[test['shop_id'] == 1, 'shop_id'] = 58
    # Жуковский ул. Чкалова 39м²
    train.loc[train['shop_id'] == 10, 'shop_id'] = 11
    test.loc[test['shop_id'] == 10, 'shop_id'] = 11
    test['id'] = test['shop_id'].astype(str) + '_' + test['item_id'].astype(str)

    shops.loc[shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
    shops['city'] = shops['shop_name'].str.split(' ').transform(lambda x: x[0])
    shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'
    shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
    shops = shops[['shop_id','city_code']]

    cats['split'] = cats['item_category_name'].str.split('-')
    cats['type'] = cats['split'].transform(lambda x: x[0].strip())
    cats['type_code'] = LabelEncoder().fit_transform(cats['type']) # 类型
    cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
    cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype']) # 子类型
    cats = cats[['item_category_id','type_code', 'subtype_code']]
    items.drop(['item_name'], axis = 1, inplace = True)
    ##################### 数据增强
    matrix = [] 
    cols = ['date_block_num','shop_id','item_id']
    for i in range(34):
        sales = train[train.date_block_num==i]
        matrix.append(np.array(list(product([i], 
                                        sales.shop_id.unique(), 
                                        sales.item_id.unique())), 
                                        dtype = 'int16'))
    
    matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
    matrix.sort_values(cols,inplace = True)
    matrix['id'] = matrix['shop_id'].astype(str) + '_' + matrix['item_id'].astype(str)
    ###########加入测试集
    test['date_block_num'] = 34
    test['date_block_num'] = test['date_block_num'].astype(np.int8)
    test['shop_id'] = test['shop_id'].astype(np.int8)
    test['item_id'] = test['item_id'].astype(np.int16)
    matrix = pd.concat([matrix, test], ignore_index = True, sort = False)
    #matrix.fillna(0, inplace = True)
    # 将日数据汇总为月数据
    df = pd.DataFrame() 
    grouped = train.groupby(['date_block_num','shop_id','item_id'])
    df['item_cnt_month'] = grouped['item_cnt_day'].sum()
    df.reset_index(inplace = True) 
    matrix = pd.merge(matrix, df, on = cols, how = 'left')
    matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) 
                                .astype(np.float16))
    
    matrix = pd.merge(matrix, shops, on = ['shop_id'], how = 'left')
    matrix = pd.merge(matrix, items, on = ['item_id'], how = 'left')
    matrix = pd.merge(matrix, cats, on = ['item_category_id'], how = 'left')
    
    grouped = train.groupby(['date_block_num','shop_id','item_id'])['item_price'].mean()
    grouped = pd.DataFrame(grouped)
    grouped.reset_index(inplace = True)
    matrix = pd.merge(matrix, grouped, on = ['date_block_num','shop_id','item_id'], how = 'left')
    matrix['item_price'] = matrix.groupby(['id'])['item_price'].transform(lambda x: x.fillna(x.median()))
    matrix['item_price'] = matrix['item_price'].astype(np.float32)
    del cats, grouped, items, sales, shops, test, train
    gc.collect()
    print('data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    print('The program costs %.2f seconds'%(time.time() - start))
    return matrix

In [3]:
df = data_transform(items, shops, cats, train, test)
del items, shops, cats, train
gc.collect()
df

data has 1609122 rows and 4 columns
The program costs 447.85 seconds


Unnamed: 0,date_block_num,shop_id,item_id,id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_price
0,0,2,19,2_19,0.0,0,40,11,4,
1,0,2,27,2_27,1.0,0,19,5,10,2499.0
2,0,2,28,2_28,0.0,0,30,8,55,
3,0,2,29,2_29,0.0,0,23,5,16,
4,0,2,32,2_32,0.0,0,40,11,4,149.0
...,...,...,...,...,...,...,...,...,...,...
11127999,34,45,18454,45_18454,0.0,20,55,13,2,199.0
11128000,34,45,16188,45_16188,0.0,20,64,14,42,
11128001,34,45,15757,45_15757,0.0,20,55,13,2,199.0
11128002,34,45,19648,45_19648,0.0,20,40,11,4,


In [4]:
df.sort_values(by = ['shop_id','item_id','date_block_num'], inplace = True)
df.reset_index(inplace = True)

In [5]:
#df.reset_index(inplace=True)
def groupby_shift(df, col, groupcol, shift_n, fill_na = np.nan):
    '''
    apply fast groupby shift
    df: data 
    col: column need to be shift 
    shift: n
    fill_na: na filled value
    '''
    rown = df.groupby(groupcol).size().cumsum()
    rowno = list(df.groupby(groupcol).size().cumsum()) # 获取每分组第一个元素的index
    lagged_col = df[col].shift(shift_n) # 不分组滚动
    na_rows = [i for i in range(shift_n)] # 初始化为缺失值的index
    #print(na_rows)
    for i in rowno:
        if i == rowno[len(rowno)-1]: # 最后一个index直接跳过不然会超出最大index
            continue 
        else:
            new = [i + j for j in range(shift_n)] # 将每组最开始的shift_n个值变成nan
            na_rows.extend(new) # 加入列表
    na_rows = list(set(na_rows)) # 去除重复值
    na_rows = [i for i in na_rows if i <= len(lagged_col) - 1] # 防止超出最大index
    #print(na_rows)
    lagged_col.iloc[na_rows] = fill_na # 变成nan
    return lagged_col

start = time.time()

df['lag_1'] = groupby_shift(df, 'item_cnt_month', ['shop_id','item_id'], 1)
df['lag_2'] = groupby_shift(df, 'item_cnt_month', ['shop_id','item_id'], 2)
df['lag_3'] = groupby_shift(df, 'item_cnt_month', ['shop_id','item_id'], 3)
df['lag_6'] = groupby_shift(df, 'item_cnt_month', ['shop_id','item_id'], 6)
df['lag_12'] = groupby_shift(df, 'item_cnt_month', ['shop_id','item_id'], 12)

#df['shift_3_roll_avg_3'] = df['shift_3'].rolling(3).mean().astype(np.float32)
#df['shift_3_roll_avg_6'] = df['shift_3'].rolling(6).mean().astype(np.float32)
#df['shift_12_roll_avg_6'] = df['shift_12'].rolling(6).mean().astype(np.float32)

df['price_shift_1'] = groupby_shift(df, 'item_price', ['shop_id','item_id'], 1)
df['price_shift_2'] = groupby_shift(df, 'item_price', ['shop_id','item_id'], 2)
df['price_shift_3'] = groupby_shift(df, 'item_price', ['shop_id','item_id'], 3)
df['price_shift_6'] = groupby_shift(df, 'item_price', ['shop_id','item_id'], 6)
df['price_shift_12'] = groupby_shift(df, 'item_price', ['shop_id','item_id'], 12)

'''
df['mon_avg_item_cnt'] = groupby_shift(df, 'item_cnt_month', 'date_block_num', 1)
df['mon_avg_item_cnt'] = df.groupby(['date_block_num'])['mon_avg_item_cnt'].transform(lambda x: x.mean())

group = ['date_block_num', 'item_id']
df['mon_item_avg_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_item_avg_1'] = df.groupby(group)['mon_item_avg_1'].transform(lambda x: x.mean())
df['mon_item_avg_2'] = groupby_shift(df, 'item_cnt_month', group, 2)
df['mon_item_avg_2'] = df.groupby(group)['mon_item_avg_2'].transform(lambda x: x.mean())
df['mon_item_avg_6'] = groupby_shift(df, 'item_cnt_month', group, 6)
df['mon_item_avg_6'] = df.groupby(group)['mon_item_avg_6'].transform(lambda x: x.mean())

group = ['date_block_num', 'shop_id']
df['mon_shop_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_shop_1'] = df.groupby(group)['mon_shop_1'].transform(lambda x: x.mean())

group = ['date_block_num', 'shop_id', 'item_category_id']
df['mon_shop_item_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_shop_item_1'] = df.groupby(group)['mon_shop_item_1'].transform(lambda x: x.mean())

group = ['date_block_num', 'shop_id', 'subtype_code']
df['mon_shop_sub_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_shop_sub_1'] = df.groupby(group)['mon_shop_sub_1'].transform(lambda x: x.mean())

group = ['date_block_num', 'city_code']
df['mon_city_avg_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_city_avg_1'] = df.groupby(group)['mon_city_avg_1'].transform(lambda x: x.mean())

group = ['date_block_num', 'item_id', 'city_code']
df['mon_item_city_avg_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_item_city_avg_1'] = df.groupby(group)['mon_item_city_avg_1'].transform(lambda x: x.mean())

group = ['date_block_num', 'type_code']
df['mon_type_avg_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_type_avg_1'] = df.groupby(group)['mon_type_avg_1'].transform(lambda x: x.mean())

group = ['date_block_num', 'subtype_code']
df['mon_subtype_avg_1'] = groupby_shift(df, 'item_cnt_month', group, 1)
df['mon_subtype_avg_1'] = df.groupby(group)['mon_subtype_avg_1'].transform(lambda x: x.mean())
'''
#df['shift6_rolling6_mean'] = df.groupby(['id'])['item_cnt_month'].transform(lambda x: x.shift(6).rolling(6).mean())
#df['shift2_rolling2_mean'] = df.groupby(['id'])['item_cnt_month'].transform(lambda x: x.shift(2).rolling(2).mean())
#df['shift3_rolling1_mean'] = df.groupby(['id'])['item_cnt_month'].transform(lambda x: x.shift(3).rolling(1).mean())   
#df['shift3_rolling2_mean'] = df.groupby(['id'])['item_cnt_month'].transform(lambda x: x.shift(3).rolling(2).mean())
#df['shift6_rolling6_mean'] = df.groupby(['id'])['item_cnt_month'].transform(lambda x: x.shift(6).rolling(6).mean())
print('The program costs %.2f seconds'%(time.time() - start))

The program costs 80.54 seconds


In [6]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

group = df.groupby(['date_block_num']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_cnt']
group.reset_index(inplace = True)

df = pd.merge(df, group, on = ['date_block_num'], how = 'left')
df['date_cnt'] = df['date_cnt'].astype(np.float16)
df = lag_feature(df, [1], 'date_cnt')
df.drop(['date_cnt'], axis = 1, inplace = True)

In [7]:
group = df.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item']
group.reset_index(inplace = True)

df = pd.merge(df, group, on = ['date_block_num','item_id'], how = 'left')
df['date_item'] = df['date_item'].astype(np.float16)
df = lag_feature(df, [1,2,3,6,12], 'date_item')
df.drop(['date_item'], axis = 1, inplace = True)

group = df.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop']
group.reset_index(inplace=True)

df = pd.merge(df, group, on=['date_block_num','shop_id'], how='left')
df['date_shop'] = df['date_shop'].astype(np.float16)
df = lag_feature(df, [1,2,3,6,12], 'date_shop')
df.drop(['date_shop'], axis = 1, inplace = True)

group = df.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_cat']
group.reset_index(inplace=True)

df = pd.merge(df, group, on=['date_block_num','item_category_id'], how='left')
df['date_cat'] = df['date_cat'].astype(np.float16)
df = lag_feature(df, [1], 'date_cat')
df.drop(['date_cat'], axis = 1, inplace=True)

In [8]:
def fill_na(df):
    for col in df.columns:
        if ('_lag_' in col):
            df[col].fillna(0, inplace=True) 
        if ('price' in col):
            df[col] = df[col].transform(lambda x: x.fillna(x.median()))         
    return df

df = fill_na(df)

In [9]:
df[df['id'] == '59_22088']

Unnamed: 0,index,date_block_num,shop_id,item_id,id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_price,lag_1,lag_2,lag_3,lag_6,lag_12,price_shift_1,price_shift_2,price_shift_3,price_shift_6,price_shift_12,date_cnt_lag_1,date_item_lag_1,date_item_lag_2,date_item_lag_3,date_item_lag_6,date_item_lag_12,date_shop_lag_1,date_shop_lag_2,date_shop_lag_3,date_shop_lag_6,date_shop_lag_12,date_cat_lag_1
11127151,365149,0,59,22088,59_22088,5.0,30,83,19,64,79.0,,,,,,299.0,299.0,299.0,299.0,299.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11127152,740875,1,59,22088,59_22088,0.0,30,83,19,64,79.0,5.0,,,,,79.0,299.0,299.0,299.0,299.0,0.3479,7.289062,0.0,0.0,0.0,0.0,0.247437,0.0,0.0,0.0,0.0,3.054688
11127153,1122767,2,59,22088,59_22088,5.0,30,83,19,64,79.0,0.0,5.0,,,,79.0,79.0,299.0,299.0,299.0,0.325195,4.871094,7.289062,0.0,0.0,0.0,0.22998,0.247437,0.0,0.0,0.0,1.535156
11127154,1497438,3,59,22088,59_22088,3.0,30,83,19,64,79.0,5.0,0.0,5.0,,,79.0,79.0,79.0,299.0,299.0,0.356201,5.628906,4.871094,7.289062,0.0,0.0,0.238037,0.22998,0.247437,0.0,0.0,3.488281
11127155,1872691,4,59,22088,59_22088,6.0,30,83,19,64,79.0,3.0,5.0,0.0,,,79.0,79.0,79.0,299.0,299.0,0.275879,4.195312,5.628906,4.871094,0.0,0.0,0.169312,0.238037,0.22998,0.0,0.0,2.199219
11127156,2262678,5,59,22088,59_22088,8.0,30,83,19,64,79.0,6.0,3.0,5.0,,,79.0,79.0,79.0,299.0,299.0,0.265625,4.355469,4.195312,5.628906,0.0,0.0,0.150391,0.169312,0.238037,0.0,0.0,2.320312
11127157,2649305,6,59,22088,59_22088,7.0,30,83,19,64,79.0,8.0,6.0,3.0,5.0,,79.0,79.0,79.0,79.0,299.0,0.283691,5.675781,4.355469,4.195312,7.289062,0.0,0.185303,0.150391,0.169312,0.247437,0.0,2.576172
11127158,3010705,7,59,22088,59_22088,4.0,30,83,19,64,79.0,7.0,8.0,6.0,0.0,,79.0,79.0,79.0,79.0,299.0,0.277344,5.542969,5.675781,4.355469,4.871094,0.0,0.197266,0.185303,0.150391,0.22998,0.0,1.728516
11127159,3365128,8,59,22088,59_22088,4.0,30,83,19,64,79.0,4.0,7.0,8.0,5.0,,79.0,79.0,79.0,79.0,299.0,0.317383,3.511719,5.542969,5.675781,5.628906,0.0,0.244019,0.197266,0.185303,0.238037,0.0,1.655273
11127160,3734916,9,59,22088,59_22088,0.0,30,83,19,64,79.0,4.0,4.0,7.0,3.0,,79.0,79.0,79.0,79.0,299.0,0.308594,2.15625,3.511719,5.542969,4.195312,0.0,0.226929,0.244019,0.197266,0.169312,0.0,1.544922


In [10]:
df.columns

Index(['index', 'date_block_num', 'shop_id', 'item_id', 'id', 'item_cnt_month',
       'city_code', 'item_category_id', 'type_code', 'subtype_code',
       'item_price', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_12',
       'price_shift_1', 'price_shift_2', 'price_shift_3', 'price_shift_6',
       'price_shift_12', 'date_cnt_lag_1', 'date_item_lag_1',
       'date_item_lag_2', 'date_item_lag_3', 'date_item_lag_6',
       'date_item_lag_12', 'date_shop_lag_1', 'date_shop_lag_2',
       'date_shop_lag_3', 'date_shop_lag_6', 'date_shop_lag_12',
       'date_cat_lag_1'],
      dtype='object')

In [11]:

#group = ['item_id','date_block_num']
#df.sort_values(by = group, inplace = True)
#df
#df['mon_item_avg_1'] = groupby_shift(df, 'item_cnt_month',  'item_id', 1)

In [12]:
'''
df['price_shift_1'] = groupby_shift(df, 'item_price', 'id', 1)
df['price_shift_2'] = groupby_shift(df, 'item_price', 'id', 2)
df['price_shift_3'] = groupby_shift(df, 'item_price', 'id', 3)
df['price_shift_6'] = groupby_shift(df, 'item_price', 'id', 6)
df['price_shift_12'] = groupby_shift(df, 'item_price', 'id', 12)
'''

"\ndf['price_shift_1'] = groupby_shift(df, 'item_price', 'id', 1)\ndf['price_shift_2'] = groupby_shift(df, 'item_price', 'id', 2)\ndf['price_shift_3'] = groupby_shift(df, 'item_price', 'id', 3)\ndf['price_shift_6'] = groupby_shift(df, 'item_price', 'id', 6)\ndf['price_shift_12'] = groupby_shift(df, 'item_price', 'id', 12)\n"

In [13]:

'''
def recode_na(df, cols):
    #recode na value by grouped average
    #
    for i in tqdm(cols):
        df[i] = df[i].transform(lambda x: x.fillna(x.median()))
    return df

colz = ['shift_1', 'shift_2', 'shift_3', 'shift_6', 'shift_12',
       'shift_3_roll_avg_3', 'shift_3_roll_avg_6', 'shift_12_roll_avg_6',
       'price_lag_1', 'price_lag_2', 'price_lag_3', 'price_lag_6',
       'price_lag_12']

df = recode_na(df, colz)
'''
    
df['month'] = df['date_block_num'] % 12
df = df[df['date_block_num'] > 11]
print('data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))

data has 6639294 rows and 34 columns


In [14]:
df.columns

Index(['index', 'date_block_num', 'shop_id', 'item_id', 'id', 'item_cnt_month',
       'city_code', 'item_category_id', 'type_code', 'subtype_code',
       'item_price', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_12',
       'price_shift_1', 'price_shift_2', 'price_shift_3', 'price_shift_6',
       'price_shift_12', 'date_cnt_lag_1', 'date_item_lag_1',
       'date_item_lag_2', 'date_item_lag_3', 'date_item_lag_6',
       'date_item_lag_12', 'date_shop_lag_1', 'date_shop_lag_2',
       'date_shop_lag_3', 'date_shop_lag_6', 'date_shop_lag_12',
       'date_cat_lag_1', 'month'],
      dtype='object')

In [19]:
features = ['date_block_num',
            'month',
            'shop_id',
            'item_id',
            'city_code', 
            'item_category_id', 
            'type_code', 
            'subtype_code',
            'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_12',
       'price_shift_1', 'price_shift_2', 'price_shift_3', 'price_shift_6',
       'price_shift_12', 'date_cnt_lag_1', 'date_item_lag_1',
       'date_item_lag_2', 'date_item_lag_3', 'date_item_lag_6',
       'date_item_lag_12', 'date_shop_lag_1', 'date_shop_lag_2',
       'date_shop_lag_3', 'date_shop_lag_6', 'date_shop_lag_12',
       'date_cat_lag_1']
cat_features = ['month', 'shop_id','item_id','city_code', 'item_category_id', 'type_code', 'subtype_code']

https://catboost.ai/docs/concepts/parameter-tuning.html
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html

In [72]:
def train_catboost(df):
    '''train a catboost
    '''
    df.sort_values(['date_block_num','shop_id','item_id'], inplace = True)
    x_train = df[df['date_block_num'] < 34]
    y_train = x_train['item_cnt_month'].astype(np.float32)
    test = df[df['date_block_num'] == 34]
    
    folds = TimeSeriesSplit(n_splits = 3) # use TimeSeriesSplit cv
    splits = folds.split(x_train, y_train)
    val_pred = np.zeros(len(x_train))
    test_pred = np.zeros(len(test))
    for fold, (trn_idx, val_idx) in enumerate(splits):
        print(f'Training fold {fold + 1}')
          
        train_set = x_train.iloc[trn_idx][features]
        y_tra = y_train.iloc[trn_idx]
        val_set = x_train.iloc[val_idx][features]
        y_val = y_train.iloc[val_idx]

        model = CatBoostRegressor(iterations = 1500,
                              learning_rate = 0.03,
                              depth = 5,
                              loss_function = 'RMSE',
                              eval_metric = 'RMSE',
                              random_seed = 42,
                              bagging_temperature = 0.3,
                              od_type = 'Iter',
                              metric_period = 50,
                              od_wait = 28)
        model.fit(train_set, y_tra, 
              eval_set = (val_set, y_val),
              use_best_model = True, 
              cat_features = cat_features,
              verbose = 50)
        
        val_pred[val_idx] = model.predict(x_train.iloc[val_idx][features]) # prediction
        #test_pred += model.predict(test[features]) / 3 # calculate mean prediction value of 3 models
        print('-' * 50)
        print('\n')
    test_pred = model.predict(test[features])  
    val_rmse = np.sqrt(metrics.mean_squared_error(y_train, val_pred))
    print('Our out of folds rmse is {:.4f}'.format(val_rmse))
    return test_pred

def train_lightgbm(df):
    '''train a lightgbm
    '''
    df.sort_values(['date_block_num','shop_id','item_id'], inplace = True)
    x_train = df[df['date_block_num'] < 34]
    y_train = x_train['item_cnt_month'].astype(np.float32)
    test = df[df['date_block_num'] == 34]
    
    folds = TimeSeriesSplit(n_splits = 3) # use TimeSeriesSplit cv
    splits = folds.split(x_train, y_train)
    val_pred = np.zeros(len(x_train))
    test_pred = np.zeros(len(test))
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'rmse', # loss function
        'seed': 225,
        'learning_rate': 0.01,
        'lambda': 0.4, # l2 regularization
        'reg_alpha': 0.4, # l1 regularization
        'max_depth': 4, # max depth of decision trees
        'num_leaves': 68, #  number of leaves
        'bagging_fraction': 0.7, # bootstrap sampling
        'bagging_freq' : 1,
        'colsample_bytree': 0.7 # feature sampling
    }
    for fold, (trn_idx, val_idx) in enumerate(splits):
        print(f'Training fold {fold + 1}')
        
        train_set = lgb.Dataset(x_train.iloc[trn_idx][features], 
                                y_train.iloc[trn_idx], 
                                categorical_feature = cat_features)
        
        val_set = lgb.Dataset(x_train.iloc[val_idx][features], 
                              y_train.iloc[val_idx], 
                              categorical_feature = cat_features)

        model = lgb.train(params, train_set, 
                          num_boost_round = 1500, 
                          early_stopping_rounds = 100, 
                          valid_sets = [val_set], 
                          verbose_eval = 50)
        
        val_pred[val_idx] = model.predict(x_train.iloc[val_idx][features]) # prediction
        test_pred += model.predict(test[features]) / 3 # calculate mean prediction value of 3 models
        print('-' * 50)
        print('\n')
    #test_pred = model.predict(test[features])     
    val_rmse = np.sqrt(metrics.mean_squared_error(y_train, val_pred))
    print('Our out of folds rmse is {:.4f}'.format(val_rmse))
    return test_pred

In [None]:
test_pred = train_lightgbm(df)
#test_pred_lgb = train_catboost(df)

Training fold 1




Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 1.0406
[100]	valid_0's rmse: 0.97855
[150]	valid_0's rmse: 0.951921
[200]	valid_0's rmse: 0.937652
[250]	valid_0's rmse: 0.930796
[300]	valid_0's rmse: 0.927815
[350]	valid_0's rmse: 0.925509
[400]	valid_0's rmse: 0.924499
[450]	valid_0's rmse: 0.923843
[500]	valid_0's rmse: 0.923265
[550]	valid_0's rmse: 0.92245
[600]	valid_0's rmse: 0.921944
[650]	valid_0's rmse: 0.921622
[700]	valid_0's rmse: 0.921101
[750]	valid_0's rmse: 0.921039
[800]	valid_0's rmse: 0.921003
[850]	valid_0's rmse: 0.920358
[900]	valid_0's rmse: 0.920399
[950]	valid_0's rmse: 0.920236
[1000]	valid_0's rmse: 0.920545
Early stopping, best iteration is:
[933]	valid_0's rmse: 0.920004
--------------------------------------------------


Training fold 2
Training until validation scores don't improve for 100 rounds


In [70]:
def make_output(test_pred):
    '''make prediction
    '''
    test  = pd.read_csv('../data/test.csv')
    test.sort_values(['shop_id','item_id'], inplace = True)
    submission = pd.DataFrame({'ID': test['ID'],
                              'item_cnt_month': test_pred.clip(0,20)})
    #submission = pd.DataFrame({'ID': range(0,len(test_pred)),'item_cnt_month': test_pred.clip(0,20)})
    print(submission.head(15))
    submission.to_csv('../output/cat_submission.csv', index = False)
    return submission
submission = make_output(test_pred_lgb)

          ID  item_cnt_month
22987  22987        0.147674
20994  20994        0.208000
20995  20995        0.199253
22492  22492        0.176417
22491  22491        0.147674
22490  22490        0.147674
22489  22489        0.147674
22091  22091        0.147674
24827  24827        0.147674
22488  22488        0.147674
23484  23484        0.147674
22419  22419        0.176417
20928  20928        0.147674
20996  20996        0.162454
24963  24963        0.162454


In [69]:
submission['item_cnt_month'].isna().value_counts()

False    214200
Name: item_cnt_month, dtype: int64

In [64]:
len(submission)

214200

```
Index               67512848
date                67512848
date_block_num       8439106
shop_id              8439106
item_id             16878212
item_price          33756424
item_cnt_day         8439106
city_code            8439106
item_category_id     8439106
type_code            8439106
subtype_code         8439106
lag_1               67512848
dtype: int64
```

```
Training fold 1
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 1.1757494	test: 1.1575727	best: 1.1575727 (0)	total: 1.43s	remaining: 35m 40s
50:	learn: 0.8640184	test: 0.9677260	best: 0.9677260 (50)	total: 48.5s	remaining: 22m 57s
100:	learn: 0.7881810	test: 0.9398471	best: 0.9398471 (100)	total: 1m 40s	remaining: 23m 10s
150:	learn: 0.7679923	test: 0.9313066	best: 0.9313066 (150)	total: 2m 47s	remaining: 24m 53s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9302290542
bestIteration = 158

Shrink model to first 159 iterations.
--------------------------------------------------


Training fold 2
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 1.1660820	test: 1.2842634	best: 1.2842634 (0)	total: 2.38s	remaining: 59m 29s
50:	learn: 0.8826853	test: 1.0427823	best: 1.0427823 (50)	total: 1m 40s	remaining: 47m 28s
100:	learn: 0.7966804	test: 1.0309959	best: 1.0309959 (100)	total: 2m 51s	remaining: 39m 35s
150:	learn: 0.7745665	test: 1.0159513	best: 1.0159513 (150)	total: 4m 40s	remaining: 41m 48s
200:	learn: 0.7637740	test: 1.0119380	best: 1.0118339 (199)	total: 6m 27s	remaining: 41m 42s
250:	learn: 0.7561012	test: 1.0092657	best: 1.0092657 (250)	total: 8m 26s	remaining: 42m 1s
300:	learn: 0.7493929	test: 1.0079956	best: 1.0075532 (291)	total: 10m 20s	remaining: 41m 11s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.007553196
bestIteration = 291

Shrink model to first 292 iterations.
--------------------------------------------------


Training fold 3
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 1.2064768	test: 1.0777938	best: 1.0777938 (0)	total: 3.54s	remaining: 1h 28m 33s
50:	learn: 0.9318607	test: 0.9035508	best: 0.9035508 (50)	total: 2m 22s	remaining: 1h 7m 20s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9035508379
bestIteration = 50

Shrink model to first 51 iterations.
--------------------------------------------------


Our out of folds rmse is 1.0239
```

In [None]:
的
df['lag_t1'] = df.groupby(['shop_id','item_id'])['item_cnt_day'].transform(lambda x: x.shift(30))



In [None]:
date = df.groupby(['shop_id','item_id'])['date']
date = pd.DataFrame(date)
date

In [None]:
df['lag_t1_rolling'] = df.groupby(['shop_id','item_id'])['item_cnt_day'].transform(lambda x: x.shift(30).rolling(30).mean())

In [None]:
df['lag_3'].isna().value_counts()

In [None]:
df['lag_t7'] = df.groupby(['shop_id', 'item_id'])['item_cnt_day'].transform(lambda x: x.shift(7))
df
df['lag_t7'].isna().value_counts()

测试集是34个月内某些商店和某些物品的乘积。 有5100个商品 * 42个商店 = 214200对。 与训练集相比，有363件新商品。 因此，对于测试集中的大多数项目，目标值应为零。 另一方面，训练集仅包含过去出售或退回的货币对。 主要思想是计算月销售额，并在一个月内将每个唯一对的零销售额扩展为零。 这样，训练数据将类似于测试数据。

In [None]:
df

In [None]:
data = pd.read_pickle('../data/data.pkl')
data
# 6639294 rows × 40 columns

In [None]:
del data
gc.collect()

In [None]:
data.columns
features = [
    'date_block_num',
    'shop_id',
    'item_id',
    #'item_cnt_month',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_shop_type_avg_item_cnt_lag_1',
    'date_shop_subtype_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    'date_type_avg_item_cnt_lag_1',
    'date_subtype_avg_item_cnt_lag_1',
    'delta_price_lag',
    'month',
    'days',
    'item_shop_last_sale',
    'item_last_sale',
    'item_shop_first_sale',
    'item_first_sale',
]
cat_features = ['date_block_num',
                'month', 
                'shop_id',
                'item_id',
                'city_code',
                'item_category_id',
                'type_code', 
                'subtype_code']
#data
#data['id'] = data['shop_id'].astype(str) + '_' + test_indicate['item_id'].astype(str)
data.sort_values(['date_block_num','shop_id','item_id'],inplace = True)

In [None]:
'''
x_train = data[data['date_block_num'] < 34]
y_train = x_train['item_cnt_month'].astype(np.float32)
test = data[data['date_block_num'] == 34]

#need_to_remove = ['item_cnt_day','city_code','item_category_id',
                 # 'type_code','subtype_code', 'shop_id', 'item_id', 'id']
#features = [i for i in list(df.columns) if i not in need_to_remove]
#n_fold = 3 #3 for timely purpose of the kernel
folds = TimeSeriesSplit(n_splits = 3) # use TimeSeriesSplit cv
splits = folds.split(x_train, y_train)
val_pred = np.zeros(len(x_train))
test_pred = np.zeros(len(test))
for fold, (trn_idx, val_idx) in enumerate(splits):
    print(f'Training fold {fold + 1}')
          
    train_set = x_train.iloc[trn_idx][features]
    y_tra = y_train.iloc[trn_idx]
    val_set = x_train.iloc[val_idx][features]
    y_val = y_train.iloc[val_idx]

    model = CatBoostRegressor(iterations = 500,
                              learning_rate = 0.05,
                              depth = 6,
                              eval_metric = 'RMSE',
                              random_seed = 42,
                              bagging_temperature = 0.2,
                              od_type = 'Iter',
                              metric_period = 50,
                              od_wait = 20)
    model.fit(train_set, y_tra, 
              eval_set = (val_set, y_val),
              use_best_model = True, 
              cat_features = cat_features,
              verbose = 50)
        
    val_pred[val_idx] = model.predict(x_train.iloc[val_idx][features]) # prediction
    test_pred += model.predict(test[features]) / 3 # calculate mean prediction value of 3 models
    print('-' * 50)
    print('\n')
          
val_rmse = np.sqrt(metrics.mean_squared_error(y_train, val_pred))
print('Our out of folds rmse is {:.4f}'.format(val_rmse))
'''

In [None]:
test_pred_lgb = train_lightgbm(data)