In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.figure as figure

In [None]:
# Load the raw competition tables.

# Daily sales history and the (shop, item) pairs to predict.
train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

# Shop table, plus a second copy read from the "eng-translation" dataset.
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
shops_e = pd.read_csv('../input/predict-future-sales-eng-translation/shops.csv')

# Item table, plus its "eng-translation" counterpart.
items= pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
items_e= pd.read_csv('../input/predict-future-sales-eng-translation/items.csv')

# Item-category table, plus its "eng-translation" counterpart.
cats= pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
cats_e= pd.read_csv('../input/predict-future-sales-eng-translation/categories.csv')

In [None]:
# Remove outliers: keep only rows priced below 100000 with a daily count below 1001.
is_inlier = (train.item_price < 100000) & (train.item_cnt_day < 1001)
train = train[is_inlier]

In [None]:
# Repair rows whose item_price is negative: overwrite them with the median positive
# price of item 2973 in shop 32 during month block 4.
reference_sales = train[
    (train.date_block_num == 4)
    & (train.shop_id == 32)
    & (train.item_id == 2973)
    & (train.item_price > 0)
]
median = reference_sales.item_price.median()
has_negative_price = train.item_price < 0
train.loc[has_negative_price, 'item_price'] = median
# Display any rows that still have a negative price (none expected after the fix).
train[train.item_price < 0]

## 店舗情報についての特徴量生成

In [None]:
# Shops listed under duplicate names get a single ID (old ID -> surviving ID),
# applied to both the training and the test frame.
duplicate_shop_ids = {0: 57, 1: 58, 10: 11, 39: 40}
for old_id, new_id in duplicate_shop_ids.items():
    for frame in (train, test):
        frame.loc[frame.shop_id == old_id, 'shop_id'] = new_id

In [None]:
# Extract the city of each shop: the first space-separated token of shop_name.
from sklearn.preprocessing import LabelEncoder

shops['city'] = shops['shop_name'].str.split(' ').str[0]
# One city token carries a stray leading '!'; normalise it.
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
# Integer-encode the city names.
city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city'])
shops

In [None]:
# Population information: reuse the city_info.pkl file that another author had already compiled.
# NOTE(review): read_pickle executes arbitrary code embedded in the file -- only load
# pickles from a source you trust.
city_info = pd.read_pickle('../input/predict-future-sales/city_info.pkl')

In [None]:
# Attach approximate coordinates to every row of city_info (values are positional, so
# they rely on the row order of city_info; missing entries are NaN).
# np.nan is used instead of np.NAN: the upper-case alias was removed in NumPy 2.0.
# NOTE(review): the numbers stored under 'longtitude' (45-62) look like latitudes and the
# ones under 'latitude' (29-130) look like longitudes. The column names are kept because
# later cells and place.csv use them -- confirm that place.csv follows the same convention.
city_info['longtitude'] = (np.nan, 45, 55, 50, 59, 52, np.nan, 56, 56, 54, 55, 56, 52, 56, 56, 56, 55, 55, 47, 60, 53, 56, 61, 56, 57, 55, 56, 55, 62, 58)
city_info['latitude'] = (np.nan, 40, 37, 45, 40, 39, np.nan, 38, 48, 35, 39, 83, 36, 37, 38, 44, 83, 73, 39, 29, 50, 38, 73, 85, 65, 56, 37, 37, 130, 40)

In [None]:
# Copy the per-city attributes onto the shop table by looking up each shop's city
# in city_info.
for feature in ('city_size', 'longtitude', 'latitude'):
    shops[feature] = shops['city'].map(city_info[feature])
shops

In [None]:
# Russia is divided into six reference locations; each shop later receives the weather
# data of the closest one (source: http://www.data.jma.go.jp/gmd/cpd/monitor/index.html).
place = pd.read_csv('../input/predict-future-sale-climate/place.csv', index_col = 0)
place # coordinates of the locations the weather data was collected for

In [None]:
# For every shop, find the weather location with the smallest Euclidean distance in
# (longtitude, latitude) space and record its 1-based position in `place`.
place_coords = list(zip(place['longtitude'], place['latitude']))
place_ids = []
for shop_lon, shop_lat in zip(shops['longtitude'], shops['latitude']):
    distances = [
        np.sqrt((shop_lon - place_lon) ** 2 + (shop_lat - place_lat) ** 2)
        for place_lon, place_lat in place_coords
    ]
    place_ids.append(distances.index(min(distances)) + 1)

In [None]:
# Store the nearest weather-location id on each shop row (place_ids is in shop row order).
shops['place_ID'] = place_ids
shops

In [None]:
# Shops whose "city" token is really a sales channel (online shop / digital store /
# outbound sales) get binary channel flags instead of a location.
# The flags are built with vectorised comparisons and .loc: the previous chained form
# shops[col][mask] = value raises SettingWithCopyWarning and is silently ignored when
# pandas copy-on-write is active.
shops['Internet'] = shops['city'].isin(['Интернет-магазин', 'Цифровой']).astype('int64')
shops['Visit'] = (shops['city'] == 'Выездная').astype('int64')

# These channel shops have no physical location, so their weather place_ID is set to 0.
shops.loc[(shops['Internet'] == 1) | (shops['Visit'] == 1), 'place_ID'] = 0

In [None]:
# Keep only the columns needed downstream and replace missing values with 0.
# fillna returns a fresh frame here; calling fillna(inplace=True) on the column
# selection triggered SettingWithCopyWarning.
shop_feature_cols = ['shop_id', 'city_code', 'city_size', 'longtitude', 'latitude', 'place_ID', 'Internet', 'Visit']
shops = shops[shop_feature_cols].fillna(0)
shops

In [None]:
# Look at the English shop names: lower-case them and replace every character that is
# not a word character, digit or whitespace with a space.
shops_e['shop_name'] = shops_e['shop_name'].str.lower()
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would leave the names untouched.
shops_e['shop_name'] = shops_e['shop_name'].str.replace(r'[^\w\d\s]', ' ', regex=True)
shops_e

In [None]:
# Shops whose English name contains "mall", "center" or "mega" are treated as large stores.
shops_e['shop_type'] = 'normal'
shops_e.loc[shops_e['shop_name'].str.contains(r'mall|center|mega'), 'shop_type'] = 'mall'
# shop_id 20 is a special store.
shops_e.loc[shops_e['shop_id'].isin([20]), 'shop_type'] = 'sale'

# Transfer the flags to `shops` by shop_id. The previous code applied a boolean mask
# built on shops_e to shops through chained indexing, which both relied on the two
# frames sharing row labels and raised SettingWithCopyWarning.
mall_shop_ids = shops_e.loc[shops_e['shop_type'] == 'mall', 'shop_id']
sale_shop_ids = shops_e.loc[shops_e['shop_type'] == 'sale', 'shop_id']
shops['mall'] = shops['shop_id'].isin(mall_shop_ids).astype('int64')
shops['sale'] = shops['shop_id'].isin(sale_shop_ids).astype('int64')

In [None]:
# Collapse the channel/size flags into one shop_type code:
# 0 = default, 1 = Internet, 2 = Visit or sale, 3 = mall.
# Later assignments win, so the order below is significant.
# .loc replaces the chained form shops[col][mask] = value, which raises
# SettingWithCopyWarning and is ignored under pandas copy-on-write.
shops['shop_type'] = 0
shops.loc[shops['Internet'] == 1, 'shop_type'] = 1
shops.loc[shops['Visit'] == 1, 'shop_type'] = 2
shops.loc[shops['sale'] == 1, 'shop_type'] = 2
shops.loc[shops['mall'] == 1, 'shop_type'] = 3

In [None]:
# Keep only the columns that are used from here on.
# .copy() makes the result an independent frame, so that adding columns to it in later
# cells does not raise SettingWithCopyWarning.
shops = shops[['shop_id', 'city_code', 'city_size', 'longtitude', 'latitude', 'place_ID', 'shop_type']].copy()
shops

In [None]:
# Inspect when shops opened and closed, based on their total units sold per month block.

# One row per shop, one column per month block. aggfunc='sum' replaces np.sum, which
# newer pandas versions deprecate as an aggregation callable.
sales_by_shop_id = train.pivot_table(index='shop_id', values='item_cnt_day',
                                     columns='date_block_num', aggfunc='sum', fill_value=0)
# String column labels allow the label slices ('0':'5', '5':) used below.
sales_by_shop_id.columns = sales_by_shop_id.columns.map(str)
# reset_index yields a proper 'shop_id' column; the previous code wrote into
# columns.values in place, which bypasses pandas' immutable Index.
sales_by_shop_id = sales_by_shop_id.rename_axis(None, axis=1).reset_index()

# Shops with no sales at all from month 0 up to and including month i.
for i in range(3,34):
    print('Not exists in month',i,sales_by_shop_id['shop_id'][sales_by_shop_id.loc[:,'0':str(i)].sum(axis=1)==0].unique())

# Shops with no sales at all from month i onwards.
for i in range(3,28):
    print('Shop is outdated for month',i,sales_by_shop_id['shop_id'][sales_by_shop_id.loc[:,str(i):].sum(axis=1)==0].unique())

In [None]:
# Record the month block in which each shop opened / closed; -10 means "not set".
# The values are applied with a shop_id -> block mapping instead of one chained
# assignment per shop (shops[col][mask] = value raises SettingWithCopyWarning and is
# ignored under pandas copy-on-write).
opening_blocks = {5: 1, 17: 5, 33: 19, 34: 18, 36: 33, 40: 14, 48: 15, 49: 11}
closing_blocks = {8: 2, 13: 19, 17: 24, 23: 3, 27: 30, 29: 28, 30: 25, 32: 6,
                  33: 26, 43: 24, 49: 32, 54: 27, 55: 4}
shops['opening_block'] = shops['shop_id'].map(opening_blocks).fillna(-10).astype('int64')
shops['closing_block'] = shops['shop_id'].map(closing_blocks).fillna(-10).astype('int64')
shops

## カテゴリ変数について特徴量生成を行う

In [None]:
# Split the category name on '-': the first part becomes `type`, the second part
# becomes `subtype` (falling back to the first part when there is no second one).
name_parts = cats_e['category_name'].str.split('-')
cats_e['split'] = name_parts
cats_e['type'] = [parts[0].strip() for parts in name_parts]
cats_e['subtype'] = [(parts[1] if len(parts) > 1 else parts[0]).strip() for parts in name_parts]

In [None]:
# Merge a few near-duplicate labels after looking at the data.
# .loc replaces the chained form cats_e[col][mask] = value, which raises
# SettingWithCopyWarning and is ignored under pandas copy-on-write.
payment_card_labels = ['Payment cards', 'Payment Cards (Cinema, Music, Games)']
cats_e.loc[cats_e['type'].isin(payment_card_labels), 'type'] = 'Payment Cards'
cats_e.loc[cats_e['category_id'] == 38, 'subtype'] = 'BluRay3D'
cats_e.loc[cats_e['category_id'] == 39, 'subtype'] = 'BluRay4K'

In [None]:
# Integer-encode `type` and `subtype` with a fresh LabelEncoder each.
for source_col in ('type', 'subtype'):
    cats_e[f'{source_col}_code'] = LabelEncoder().fit_transform(cats_e[source_col])
cats_e

In [None]:
# No further category features are derived. Reduce the category table to
# ['item_category_id', 'type_code', 'subtype_code'].
cats_e = (
    cats_e[['category_id','type_code', 'subtype_code']]
    .rename(columns = {'category_id': 'item_category_id'})
)

## 商品情報を調べる

In [None]:
# Normalise the English item names before deriving name-based features.
items_e['item_name'] = items_e['item_name'].str.lower()
# Strip literal dots. The previous call, Series.replace('.', ''), only replaces values
# that are exactly '.', so it never touched dots inside a name.
items_e['item_name'] = items_e['item_name'].str.replace('.', '', regex=False)
# Replace punctuation and common stop words with a space. regex=True is required:
# since pandas 2.0 str.replace treats the pattern as a literal string by default.
for pattern in [r'[^\w\d\s\.]', r'\bthe\b', r'\bin\b', r'\bis\b',
                r'\bfor\b', r'\bof\b', r'\bon\b', r'\band\b',
                r'\bto\b', r'\bwith\b' , r'\byo\b']:
    items_e['item_name'] = items_e['item_name'].str.replace(pattern, ' ', regex=True)
# Drop single-character words.
items_e['item_name'] = items_e['item_name'].str.replace(r'\b.\b', ' ', regex=True)
# Whitespace-free variant used for the prefix/suffix features.
items_e['item_name_no_space'] = items_e['item_name'].str.replace(' ', '', regex=False)
items_e

In [None]:
# Items with similar names may sell similarly, so take the leading 3/5/8 characters and
# the trailing 5 characters of the whitespace-free English name.
# Values are assigned positionally, so `items` and `items_e` must share row order.
compact_names = items_e['item_name_no_space'].tolist()
for width in (3, 5, 8):
    items[f'item_name_first{width}_e'] = [name[:width] for name in compact_names]
items['item_name_last5_e'] = [name[-5:] for name in compact_names]

In [None]:
# Label-encode each name prefix/suffix column with its own LabelEncoder.
for name_col in ('item_name_first3_e', 'item_name_first5_e',
                 'item_name_first8_e', 'item_name_last5_e'):
    items[name_col] = LabelEncoder().fit_transform(items[name_col].values)

In [None]:
# The raw item name is no longer needed.
items = items.drop(columns=['item_name'])

## trainデータに関して特徴量生成を行う

In [None]:
# Determine, for each item, the day of its first sale, counted as
# day-of-year + 365 * (year - 2013).
train['date'] = pd.to_datetime(train.date, format='%d.%m.%Y')
sale_day = train['date'].dt.dayofyear + 365 * (train['date'].dt.year - 2013)
train['first_sale_day'] = sale_day.groupby(train['item_id']).transform('min').astype('int16')
train

In [None]:
# For later use: the running day number (counted like first_sale_day) of the first day
# of every month block between 2013-01 and 2015-11.
daily = pd.DataFrame({'date': pd.date_range(start='2013-01-01', end='2015-11-30')})
daily = daily.assign(
    month=daily['date'].dt.month,
    year=daily['date'].dt.year - 2013,
)
daily['date_block_num'] = daily['year']*12 + daily['month'] - 1
daily['first_day_of_month'] = daily['date'].dt.dayofyear + 365 * daily['year']
dates = (
    daily.groupby(['date_block_num', 'month', 'year'])
    .agg({'first_day_of_month': 'min'})
    .reset_index()
)
dates

In [None]:
# Build the training grid: for every month block 0-33, all combinations of the shops
# and items that had at least one sale in that month.

from itertools import product

monthly_grids = []
for block in range(34):
    month_sales = train[train.date_block_num == block]
    grid = product([block], month_sales.shop_id.unique(), month_sales.item_id.unique())
    monthly_grids.append(np.array(list(grid), dtype='int16'))

# Stack the monthly grids into a single frame with named, compactly typed columns.
cols = ['date_block_num','shop_id','item_id']
matrix = pd.DataFrame(np.vstack(monthly_grids), columns=cols)
for key_col, key_dtype in (('date_block_num', np.int8), ('shop_id', np.int8), ('item_id', np.int16)):
    matrix[key_col] = matrix[key_col].astype(key_dtype)
matrix = matrix.sort_values(cols)

In [None]:
# Revenue of each sales row (price x units); per the original note, this ended up unused.
train['shop_sale'] = train['item_price'] * train['item_cnt_day']
train

In [None]:
# Aggregate to one row per (month, shop, item): units sold, first sale day and revenue.
sale_number = (
    train.groupby(['date_block_num','shop_id','item_id'])
    .agg(
        item_cnt_month=('item_cnt_day', 'sum'),
        first_sale_day=('first_sale_day', 'first'),
        shop_sale=('shop_sale', 'sum'),
    )
    .reset_index()
)
sale_number.head()

In [None]:
# Append the test rows to the grid as month block 34.
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# keys=cols was dropped: it supplied three keys for two frames and has no effect on the
# result when ignore_index=True; newer pandas versions reject the length mismatch.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
# Columns that exist only in `test` are NaN for the training rows; fill them with 0.
matrix = matrix.fillna(0)

In [None]:
# Left-join the monthly aggregates onto the grid using the key columns in `cols`.
matrix = matrix.merge(sale_number, on=cols, how='left')
# Grid rows without any sale get a monthly count of 0 (no clipping happens in this cell).
monthly_count = matrix['item_cnt_month'].fillna(0)
matrix['item_cnt_month'] = monthly_count.astype(np.float16)
matrix = matrix.fillna(0)
matrix.info()

In [None]:
# Add the shop, item and category features to the grid.
matrix = (
    matrix
    .merge(shops, on=['shop_id'], how='left')
    .merge(items, on=['item_id'], how='left')
    .merge(cats_e, on=['item_category_id'], how='left')
)

matrix.head()

In [None]:
# Downcast columns to compact dtypes.
# Fixes relative to the previous casts (int8 holds only -128..127, float16 tops out at 65504):
#   * first_sale_day -> int16: it is day-of-year + 365 * years and is later set to 1035.
#   * latitude / longtitude -> int16: the 'latitude' values include 130.
#   * ID -> int32: presumably far more than 127 test rows -- TODO confirm the range.
#   * shop_sale -> float32: monthly revenue sums of prices that may be as high as 100000.
#   * city_size -> float32: its scale is not visible here; float32 avoids a possible overflow.
column_dtypes = {
    'city_code': np.int8,
    'item_category_id': np.int8,
    'first_sale_day': np.int16,
    'ID': np.int32,
    'type_code': np.int8,
    'subtype_code': np.int8,
    'longtitude': np.int16,
    'latitude': np.int16,
    'city_size': np.float32,
    'shop_type': np.int8,
    'place_ID': np.int8,
    'shop_sale': np.float32,
    'opening_block': np.int8,
    'closing_block': np.int8,
    'item_name_first3_e': np.int16,
    'item_name_first5_e': np.int16,
    'item_name_first8_e': np.int16,
    'item_name_last5_e': np.int16,
}
for column, dtype in column_dtypes.items():
    matrix[column] = matrix[column].astype(dtype)

# info() prints its report itself and returns None, so it is not wrapped in print().
matrix.info()

In [None]:
# Attach the running day number of each month block's first day, for later use.
matrix = pd.merge(matrix, dates[['date_block_num', 'first_day_of_month']], on=['date_block_num'], how='left')

In [None]:
# Feature: number of days between the item's first sale and the start of the row's month.
# Spread each item's first sale day to all of its rows (rows without sales hold 0).
matrix['first_sale_day'] = matrix.groupby('item_id')['first_sale_day'].transform('max').astype('int16')
# Items without any recorded sale get the sentinel day 1035.
matrix.loc[matrix['first_sale_day']==0, 'first_sale_day'] = 1035
# Vectorised max(difference, 0); the previous per-row Python max() over a zip was slow.
days_on_sale = (matrix['first_day_of_month'] - matrix['first_sale_day']).clip(lower=0)
# int16 instead of int8: the difference can exceed 1000 days, which overflows int8.
matrix['prev_days_on_sale'] = days_on_sale.astype(np.int16)
matrix = matrix.drop(columns=['first_day_of_month'])
matrix

In [None]:
# Months elapsed since the item first appears in the grid.
first_item_month = matrix.groupby('item_id')['date_block_num'].transform('min')
matrix['item_month'] = (matrix['date_block_num'] - first_item_month).astype('int8')
# The same, but for groups of items that share a name prefix/suffix code.
for affix in ('first3', 'first5', 'first8', 'last5'):
    first_group_month = matrix.groupby(f'item_name_{affix}_e')['date_block_num'].transform('min')
    matrix[f'item_name_{affix}_age'] = (matrix['date_block_num'] - first_group_month).astype('int8')

In [None]:
# Load the calendar table and derive the month block index it corresponds to.
calendar = pd.read_csv('../input/predict-future-sales/calendar.csv')

months_since_2013 = 12 * (calendar['year'] - 2013) + (calendar['month'] - 1)
calendar['date_block_num'] = months_since_2013
calendar

In [None]:
# Count of non-working days: mdays minus wdays. The year column is then dropped.
calendar = (
    calendar
    .assign(hdays=calendar['mdays'] - calendar['wdays'])
    .drop(columns=['year'])
)
calendar

In [None]:
# Load the climate table (first CSV column is used as the index).
climate = pd.read_csv('../input/predict-future-sale-climate/climate.csv', index_col = 0)
climate

In [None]:
# Add the calendar features to the grid, keyed by month block.
matrix = matrix.merge(calendar, on=['date_block_num'], how='left')

# Downcast the calendar columns.
for calendar_col in ('month', 'mdays', 'hdays', 'wdays'):
    matrix[calendar_col] = matrix[calendar_col].astype(np.int8)

matrix.info()

In [None]:
# Add the climate features, keyed by calendar month and weather location.
matrix = matrix.merge(climate, on=['month', 'place_ID'], how='left')

for climate_col in ('month_T', 'month_R'):
    matrix[climate_col] = matrix[climate_col].astype(np.float16)

matrix

In [None]:
# Remove columns that are not needed any more.
matrix = matrix.drop(columns=['wdays', 'ID', 'shop_sale'])

In [None]:
# Flag table: the first month block in which each item appears in the grid.
first_item_block = (
    matrix.groupby(['item_id'])['date_block_num'].min()
    .reset_index()
    .assign(item_first_sold=1)
)

# First month block (after block 0) in which each (shop, item) pair appears in the grid.
# NOTE(review): the filter is on date_block_num > 0, not on actual sales -- confirm intent.
after_first_block = matrix[matrix['date_block_num'] > 0]
first_shop_item_buy_block = (
    after_first_block.groupby(['shop_id', 'item_id'])['date_block_num'].min()
    .reset_index()
)
first_shop_item_buy_block['first_date_block_num'] = first_shop_item_buy_block['date_block_num']

In [None]:
# item_first_sold is 1 on the rows of an item's first month block (NaN elsewhere for now).
matrix = pd.merge(matrix, first_item_block[['item_id', 'date_block_num', 'item_first_sold']], on=['item_id', 'date_block_num'], how='left')
matrix = pd.merge(matrix, first_shop_item_buy_block[['item_id', 'shop_id', 'first_date_block_num']], on=['item_id', 'shop_id'], how='left')

# Pairs without a first block get 100, which is never "before" any real month block.
# Assigning the result back replaces column.fillna(inplace=True), which acts on a
# temporary object and has no effect under pandas copy-on-write.
matrix['first_date_block_num'] = matrix['first_date_block_num'].fillna(100)
matrix['shop_item_sold_before'] = (matrix['first_date_block_num'] < matrix['date_block_num']).astype('int8')
matrix = matrix.drop(columns=['first_date_block_num'])

In [None]:
# Fill the missing flags with 0 and store both as int8.
# The result is assigned back; column.fillna(inplace=True) acts on a temporary object
# and has no effect under pandas copy-on-write.
matrix['item_first_sold'] = matrix['item_first_sold'].fillna(0).astype('int8')
matrix['shop_item_sold_before'] = matrix['shop_item_sold_before'].fillna(0).astype('int8')

## 集約系特徴量

In [None]:
# Aggregation features, created now so that lag features can be derived from them later.
# Mean price of each item over the whole training period.
group = (
    train.groupby(["item_id"])["item_price"].mean()
    .reset_index()
    .rename(columns={"item_price": "item_avg_item_price"})
)

matrix = matrix.merge(group, on = ["item_id"], how = "left")
matrix["item_avg_item_price"] = matrix["item_avg_item_price"].astype(np.float16)
matrix

In [None]:
# Mean price of each item within each month block.
month_item_keys = ["date_block_num", "item_id"]
group = (
    train.groupby(month_item_keys)["item_price"].mean()
    .reset_index()
    .rename(columns={"item_price": "date_item_avg_item_price"})
)
matrix = matrix.merge(group, on = month_item_keys, how = "left")
matrix["date_item_avg_item_price"] = matrix["date_item_avg_item_price"].astype(np.float16)

In [None]:
# Median monthly count of each item across all grid rows of the month.
month_item_keys = ["date_block_num", "item_id"]
group = (
    matrix.groupby(month_item_keys)["item_cnt_month"].median()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_avg_cnt_all_shops"})
)
matrix = matrix.merge(group, on = month_item_keys, how = "left")
matrix["item_avg_cnt_all_shops"] = matrix["item_avg_cnt_all_shops"].astype(np.float16)

In [None]:
# Median monthly count per (month, item category, shop).
month_cat_shop_keys = ['date_block_num', 'item_category_id', 'shop_id']
group = (
    matrix.groupby(month_cat_shop_keys)["item_cnt_month"].median()
    .reset_index()
    .rename(columns={"item_cnt_month": "date_avg_cnt_category"})
)
matrix = matrix.merge(group, on = month_cat_shop_keys, how = "left")
matrix["date_avg_cnt_category"] = matrix["date_avg_cnt_category"].astype(np.float16)

In [None]:
# Median monthly count per (month, item category), over all shops.
month_cat_keys = ['date_block_num', 'item_category_id']
group = (
    matrix.groupby(month_cat_keys)["item_cnt_month"].median()
    .reset_index()
    .rename(columns={"item_cnt_month": "date_avg_cnt_category_all_shops"})
)
matrix = matrix.merge(group, on = month_cat_keys, how = "left")
matrix["date_avg_cnt_category_all_shops"] = matrix["date_avg_cnt_category_all_shops"].astype(np.float16)

In [None]:
# Median monthly count per (month, item category, shop), restricted to rows where the
# item is in its first month (item_month == 0).
month_cat_shop_keys = ['date_block_num', 'item_category_id', 'shop_id']
first_month_rows = matrix[matrix['item_month'] == 0]
group = (
    first_month_rows.groupby(month_cat_shop_keys)['item_cnt_month'].median()
    .reset_index()
    .rename(columns={'item_cnt_month': 'date_avg_cnt_first_month'})
)
matrix = matrix.merge(group, on = month_cat_shop_keys, how = 'left')
matrix['date_avg_cnt_first_month'] = matrix['date_avg_cnt_first_month'].astype(np.float16)

In [None]:
# Median monthly count per (month, item category) over all shops, restricted to rows
# where the item is in its first month (item_month == 0).
month_cat_keys = ['date_block_num', 'item_category_id']
first_month_rows = matrix[matrix['item_month'] == 0]
group = (
    first_month_rows.groupby(month_cat_keys)['item_cnt_month'].median()
    .reset_index()
    .rename(columns={'item_cnt_month': 'date_avg_cnt_first_month_all_shops'})
)
matrix = matrix.merge(group, on = month_cat_keys, how = 'left')
matrix['date_avg_cnt_first_month_all_shops'] = matrix['date_avg_cnt_first_month_all_shops'].astype(np.float16)

## Lag系の特徴量

In [None]:
# Per-shop lag features.
def lag_feature(df, lags, col):
    """Append lagged copies of `col`, matched per (shop, item).

    For every lag `n` in `lags` a column named `<col>_lag_<n>` is added. It holds the
    value `col` had for the same shop and item `n` month blocks earlier, or NaN when
    no such row exists. The input frame is not modified; the merged frame is returned.
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    source = df[join_keys + [col]]
    for lag in lags:
        lagged = source.rename(columns={col: f'{col}_lag_{lag}'})
        # Moving the past rows forward by `lag` months lines them up with their targets.
        lagged['date_block_num'] += lag
        df = df.merge(lagged, on=join_keys, how='left')
    return df

In [None]:
# Lag features shared by all shops.
def lag_feature_all_shops(df, lags, col):
    """Append lagged copies of `col`, matched per item only (shop is ignored).

    For every lag `n` in `lags` a column named `<col>_lag_noshop<n>` is added, holding
    the value `col` had for the same item `n` month blocks earlier (NaN when there is
    no such row). The input frame is not modified; the merged frame is returned.

    `col` is expected to be constant within each (date_block_num, item_id) group.
    The lookup table keeps one row per group (the first one), because joining the
    per-shop rows directly would multiply the rows of `df` by the number of shops.
    """
    keys = ['date_block_num', 'item_id']
    tmp = df[keys + [col]].drop_duplicates(subset=keys)
    for i in lags:
        shifted = tmp.copy()
        # Rename the value column for this lag.
        shifted.columns = keys + [col+'_lag_noshop'+str(i)]
        # Moving the past rows forward by `i` months lines them up with their targets.
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=keys, how='left')
    return df

In [None]:
# Lag features taken from the item whose ID is shifted by one.
def lag_feature_item1(df, lags, col):
    """Append lagged copies of `col` taken from the neighbouring item ID.

    For every lag `n` in `lags` a column named `<col>_lag_item1<n>` is added. For a row
    with item `k` it holds the value `col` had for item `k + 1` in the same shop,
    `n` month blocks earlier (NaN when no such row exists). The input frame is not
    modified; the merged frame is returned.
    """
    join_keys = ['date_block_num', 'item_id', 'shop_id']
    source = df[join_keys + [col]]
    for lag in lags:
        lagged = source.rename(columns={col: f'{col}_lag_item1{lag}'})
        lagged['date_block_num'] += lag
        # Relabel each row with the preceding item ID so item k receives item k+1's value.
        lagged['item_id'] -= 1
        df = df.merge(lagged, on=join_keys, how='left')
    return df

In [None]:
# Feature capturing whether an item's price recently moved up or down: the relative
# deviation of a past month's average price from the item's overall average price.
lags = [1, 2, 3]
matrix = lag_feature( matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix["change_price_lag_" + str(i) ] = (matrix["date_item_avg_item_price_lag_" + str(i)]- matrix["item_avg_item_price"] )/ matrix["item_avg_item_price"]

import math

def select_trends(row) :
    """Return the first non-NaN change_price_lag_<n> of `row` (n taken from `lags`), else 0.

    Row-wise reference implementation. It is kept so the name stays available, but the
    column below is computed with an equivalent vectorised expression because
    DataFrame.apply(axis=1) over the full matrix is very slow.
    """
    for i in lags:
        if (not math.isnan(row["change_price_lag_" + str(i)])):
            return row["change_price_lag_" + str(i)]
    return 0

# Most recent available price change: lag 1, falling back to lag 2, then lag 3, then 0.
change_price_lag = matrix["change_price_lag_" + str(lags[0])]
for i in lags[1:]:
    change_price_lag = change_price_lag.where(change_price_lag.notna(), matrix["change_price_lag_" + str(i)])
matrix["change_price_lag"] = change_price_lag.fillna(0).astype( np.float16 )

In [None]:
# Remove the intermediate price columns and lag columns that are no longer needed.
price_scratch_cols = [
    'item_avg_item_price',
    'date_item_avg_item_price',
    'date_item_avg_item_price_lag_1',
    'date_item_avg_item_price_lag_2',
    'date_item_avg_item_price_lag_3',
    'change_price_lag_1',
    'change_price_lag_2',
    'change_price_lag_3',
]
matrix = matrix.drop(columns=price_scratch_cols)

In [None]:
# Add item_cnt_month from 1, 2 and 3 months earlier as features.
matrix = lag_feature(matrix, [1,2,3], 'item_cnt_month')

In [None]:
# Mean price per (shop, item, month); combinations without sales get 0.
index_cols = ['shop_id', 'item_id', 'date_block_num']
group = train.groupby(index_cols)['item_price'].mean().rename("avg_shop_price").reset_index()
matrix = matrix.merge(group, on=index_cols, how='left')
shop_price = matrix['avg_shop_price'].fillna(0)
matrix['avg_shop_price'] = shop_price.astype(np.float16)

In [None]:
# Mean price per (item, month) over all shops; combinations without sales get 0.
index_cols = ['item_id', 'date_block_num']
group = train.groupby(['date_block_num','item_id'])['item_price'].mean().rename("avg_item_price").reset_index()
matrix = matrix.merge(group, on=index_cols, how='left')
item_price = matrix['avg_item_price'].fillna(0)
matrix['avg_item_price'] = item_price.astype(np.float16)

In [None]:
# Indicator of whether this shop is expensive for the item: the relative deviation of
# the shop's mean price from the all-shop mean price in the same month.
# Undefined ratios (0 / 0 when there were no sales) are replaced by 0. The result is
# assigned directly; column.fillna(inplace=True) acts on a temporary object and has no
# effect under pandas copy-on-write.
matrix['item_shop_price_avg'] = (
    (matrix['avg_shop_price'] - matrix['avg_item_price']) / matrix['avg_item_price']
).fillna(0)

In [None]:
# Compute the 1-3 month lag features of the shop price indicator.
matrix = lag_feature(matrix, [1, 2, 3], 'item_shop_price_avg')

In [None]:
# Remove the current-month price columns; only their lagged versions are kept.
current_price_cols = ['avg_shop_price', 'avg_item_price', 'item_shop_price_avg']
matrix = matrix.drop(columns=current_price_cols)
matrix

## ターゲットエンコーディングからラグ特徴量を作成する

In [None]:
# Target encoding per (month, item): the median monthly count, used only through its
# 1-3 month lags.
enc_keys = ['date_block_num', 'item_id']
item_id_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_target_enc").reset_index()
matrix = matrix.merge(item_id_target_mean, on=enc_keys, how='left')
matrix['item_target_enc'] = matrix['item_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_target_enc')
matrix = matrix.drop(columns=['item_target_enc'])

In [None]:
# Target encoding per (month, item, city): the median monthly count, used only through
# its 1-3 month lags.
enc_keys = ['date_block_num', 'item_id', 'city_code']
item_city_id_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_city_target_enc").reset_index()
matrix = matrix.merge(item_city_id_target_mean, on=enc_keys, how='left')
matrix['item_city_target_enc'] = matrix['item_city_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_city_target_enc')
matrix = matrix.drop(columns=['item_city_target_enc'])

In [None]:
# Target encoding per (month, item, shop): the median monthly count, used only through
# its 1-3 month lags.
enc_keys = ['date_block_num', 'item_id', 'shop_id']
item_shop_id_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_shop_target_enc").reset_index()
matrix = matrix.merge(item_shop_id_target_mean, on=enc_keys, how='left')
matrix['item_shop_target_enc'] = matrix['item_shop_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_shop_target_enc')
matrix = matrix.drop(columns=['item_shop_target_enc'])

In [None]:
# Target encoding per (month, shop, item category): the median monthly count, used only
# through its 1-3 month lags.
enc_keys = ['date_block_num', 'shop_id', 'item_category_id']
item_category_id_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_category_target_enc").reset_index()
matrix = matrix.merge(item_category_id_target_mean, on=enc_keys, how='left')
matrix['item_category_target_enc'] = matrix['item_category_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_category_target_enc')
matrix = matrix.drop(columns=['item_category_target_enc'])

In [None]:
# Target encoding per (month, item category) over all shops: the median monthly count,
# used only through its 1-3 month lags.
enc_keys = ['date_block_num', 'item_category_id']
item_category_all_shops_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_category_all_shops_target_enc").reset_index()
matrix = matrix.merge(item_category_all_shops_target_mean, on=enc_keys, how='left')
matrix['item_category_all_shops_target_enc'] = matrix['item_category_all_shops_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_category_all_shops_target_enc')
matrix = matrix.drop(columns=['item_category_all_shops_target_enc'])

In [None]:
# Target encoding per (month, shop, subtype code): the median monthly count, used only
# through its 1-3 month lags.
enc_keys = ['date_block_num', 'shop_id', 'subtype_code']
item_subtype_code_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_subtype_target_enc").reset_index()
matrix = matrix.merge(item_subtype_code_target_mean, on=enc_keys, how='left')
matrix['item_subtype_target_enc'] = matrix['item_subtype_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_subtype_target_enc')
matrix = matrix.drop(columns=['item_subtype_target_enc'])

In [None]:
# Target encoding per (month, subtype code) over all shops: the median monthly count,
# used only through its 1-3 month lags.
enc_keys = ['date_block_num', 'subtype_code']
item_subtype_all_shops_target_mean = matrix.groupby(enc_keys)['item_cnt_month'].median().rename("item_subtype_all_shops_target_enc").reset_index()
matrix = matrix.merge(item_subtype_all_shops_target_mean, on=enc_keys, how='left')
matrix['item_subtype_all_shops_target_enc'] = matrix['item_subtype_all_shops_target_enc'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'item_subtype_all_shops_target_enc')
matrix = matrix.drop(columns=['item_subtype_all_shops_target_enc'])

In [None]:
# Restricted to rows flagged item_first_sold == 1: the median monthly count per
# (month, item category), used only through its 1-3 month lags.
enc_keys = ['date_block_num', 'item_category_id']
new_item_rows = matrix[matrix['item_first_sold'] == 1]
item_first_id_target_mean = new_item_rows.groupby(enc_keys)['item_cnt_month'].median().rename("new_item_category_avg").reset_index()
matrix = matrix.merge(item_first_id_target_mean, on=enc_keys, how='left')
matrix['new_item_category_avg'] = matrix['new_item_category_avg'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1,2,3], 'new_item_category_avg')
matrix = matrix.drop(columns=['new_item_category_avg'])

In [None]:
# Restricted to rows flagged item_first_sold == 1: the median monthly count per
# (month, item category, shop), used only through its 1-3 month lags.
enc_keys = ['date_block_num', 'item_category_id', 'shop_id']
new_item_rows = matrix[matrix['item_first_sold'] == 1]
item_first_shop_id_target_mean = new_item_rows.groupby(enc_keys)['item_cnt_month'].median().rename("new_item_shop_category_avg").reset_index()
matrix = matrix.merge(item_first_shop_id_target_mean, on=enc_keys, how='left')
matrix['new_item_shop_category_avg'] = matrix['new_item_shop_category_avg'].fillna(0).astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], 'new_item_shop_category_avg')
matrix = matrix.drop(columns=['new_item_shop_category_avg'])

In [None]:
# Compute 1- and 2-month lags of the aggregation features created earlier.
aggregate_cols = [
    'item_avg_cnt_all_shops',
    'date_avg_cnt_category',
    'date_avg_cnt_category_all_shops',
    'date_avg_cnt_first_month',
    'date_avg_cnt_first_month_all_shops',
]
for aggregate_col in aggregate_cols:
    matrix = lag_feature(matrix, [1, 2], aggregate_col)

In [None]:
# Items whose IDs differ by one are assumed to be similar, so add the neighbouring
# item's monthly count at lags 1-3.
matrix = lag_feature_item1(matrix, [1, 2, 3], 'item_cnt_month')

In [None]:
#商品名のラベルエンコーディングの結果は非常にカテゴリの数が多く、処理できないためバイナリエンコーディングを行う
import category_encoders as ce

def binary_encode(df, letters, cols):
    """Binary-encode the column `item_name_<letters>` and append the bit columns to `df`.

    The generated column `item_name_<letters>_0` is removed, and the columns
    `item_name_<letters>_1` .. `item_name_<letters>_<cols - 1>` are cast to int8.
    `cols` therefore has to match the number of columns the encoder produces.
    Returns the extended frame; the source column itself is kept.
    """
    source_col = f'item_name_{letters}'
    encoder = ce.BinaryEncoder(cols=[source_col], return_df=True)
    encoded = encoder.fit_transform(df[source_col])
    df = pd.concat([df, encoded], axis=1)
    del df[f'{source_col}_0']
    bit_cols = [f'{source_col}_{bit}' for bit in range(1, cols)]
    df[bit_cols] = df[bit_cols].astype('int8')
    return df

# Binary-encode the 5-character prefix and suffix codes, then drop all label-encoded
# name columns.
for name_part in ('first5_e', 'last5_e'):
    matrix = binary_encode(matrix, name_part, 14)
label_encoded_name_cols = ['item_name_first3_e', 'item_name_first8_e', 'item_name_first5_e', 'item_name_last5_e']
matrix = matrix.drop(columns=label_encoded_name_cols)

In [None]:
# The un-lagged aggregation features would leak the target, so remove them.
leaky_aggregate_cols = [
    'item_avg_cnt_all_shops',
    'date_avg_cnt_category',
    'date_avg_cnt_category_all_shops',
    'date_avg_cnt_first_month',
    'date_avg_cnt_first_month_all_shops',
]
matrix = matrix.drop(columns=leaky_aggregate_cols)

In [None]:
# Finally, clip the target variable to the range [0, 20].
matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0,20)

In [None]:
# Print the column dtypes and memory usage of the final feature matrix.
matrix.info()

In [None]:
# Persist the feature matrix so that training can restart from this point.
matrix.to_pickle('data_210407_1.pkl')

In [None]:
# Reload the feature matrix written by the previous cell.
import pandas as pd
matrix = pd.read_pickle('data_210407_1.pkl')

In [None]:
# Split by month block: 3-32 for training, 33 for validation, 34 (test rows) for prediction.
month_block = matrix.date_block_num
# Training data
train_rows = matrix[(month_block < 33) & (month_block > 2)]
X_train = train_rows.drop(['item_cnt_month'], axis=1)
Y_train = train_rows['item_cnt_month']
# Validation data
valid_rows = matrix[month_block == 33]
X_valid = valid_rows.drop(['item_cnt_month'], axis=1)
Y_valid = valid_rows['item_cnt_month']
# Test data
X_test = matrix[month_block == 34].drop(['item_cnt_month'], axis=1)

In [None]:
# Train a LightGBM regressor on the feature matrix, validating on month block 33.
import lightgbm as lgb
feature_names = X_train.columns.tolist()

# Training hyper-parameters (RMSE objective and metric, fixed seed).
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 1023,
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'min_data_in_leaf': 10,
    'seed': 1,
    'verbose': 1,
    'force_row_wise' : True
}

# Columns handed to LightGBM as categorical features.
categorical_feature_names = [ 
                            'item_category_id',
                            'date_block_num', 
                            'city_code',
                            'type_code',
                            'subtype_code', 
                            'shop_id',
                            'shop_type',
                            'month',
                            ]

# The categorical columns are not declared on the Dataset objects; they are passed to
# lgb.train below instead.
lgb_train = lgb.Dataset(X_train[feature_names], Y_train, categorical_feature=None)
lgb_eval  = lgb.Dataset(X_valid[feature_names], Y_valid, categorical_feature=None, reference=lgb_train)

# evals_result receives the metric history of both validation sets.
# NOTE(review): verbose_eval, evals_result and early_stopping_rounds are believed to be
# removed from lgb.train in LightGBM 4.0 in favour of callbacks -- confirm against the
# installed version before upgrading.
evals_result = {}
gbm = lgb.train(
        params, 
        lgb_train,
        num_boost_round = 6000,
        valid_sets = (lgb_train, lgb_eval), 
        feature_name = feature_names,
        categorical_feature = categorical_feature_names,
        verbose_eval = 100, 
        evals_result = evals_result,
        early_stopping_rounds = 50)

In [None]:
# Plot up to 50 features ranked by gain-based importance (the trailing ';' hides the repr).
lgb.plot_importance(
    gbm, 
    max_num_features=50, 
    importance_type='gain', 
    figsize=(12,8));

In [None]:
# Predict month block 34 and clip to the same [0, 20] range as the training target.
Y_test = gbm.predict(X_test[feature_names]).clip(0, 20)
# Reload the untouched test file to obtain the submission IDs.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

# Predictions are matched to the test rows by position, so the lengths must agree
# (a join that duplicated rows upstream would otherwise go unnoticed).
assert len(Y_test) == len(test), "prediction count does not match the number of test rows"

# Use the ID column itself rather than the positional index of the frame.
submission = pd.DataFrame({
    "ID": test["ID"],
    "item_cnt_month": Y_test
})
submission.to_csv('lgb_submission_0407_1.csv', index=False)