In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a new figure.

    Parameters
    ----------
    booster
        Passed through unchanged as ``plot_importance(booster=...)``.
    figsize
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    Whatever ``plot_importance`` returns for the freshly created axes.
    """
    _figure, importance_ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    chart = plot_importance(booster=booster, ax=importance_ax)
    return chart

import time
import sys
import gc
import pickle
sys.version_info

plt.style.use('default')

In [None]:
!pip install plotly

In [None]:
import plotly.graph_objects as go

In [None]:

# Directory holding the competition CSV files; change it here only.
INPUT_DIR = '../input/competitive-data-science-predict-future-sales'

items = pd.read_csv(f'{INPUT_DIR}/items.csv')
shops = pd.read_csv(f'{INPUT_DIR}/shops.csv')
cats = pd.read_csv(f'{INPUT_DIR}/item_categories.csv')
train = pd.read_csv(f'{INPUT_DIR}/sales_train.csv')
# Use the ID column as the index so it does not have to be dropped later.
test = pd.read_csv(f'{INPUT_DIR}/test.csv').set_index('ID')

# items
* item_name=the product name
* item_id=link to the train
* item_category_id=link to the train and cats


* item_name =製品名
* item_id =train.csvへのリンク
* item_category_id =train.csvとcats.csvへのリンク

In [None]:

# Preview the first rows of the items table.
items.head()

In [None]:
# Number of items in each category, largest category first.
category_counts = items['item_category_id'].value_counts().sort_values(ascending=False)

# One bar per category id; the cell's last expression renders the figure.
data = go.Bar(x=category_counts.index, y=category_counts)
layout = go.Layout(showlegend=False)
fig = go.Figure(data=data, layout=layout)
fig.update_layout(title='The number of items each categories')

# shops

* shop_name=Each shop_name starts with the city name.
* shop_id=link to the items and train



* shop_name =各shop_nameは都市名で始まります。
* shop_id =train.csv,items.csvへのリンク

In [None]:
# Preview the first rows of the shops table.
shops.head()

# cats

* item_category_name=category name
* item_category_id=link to the items and train


* item_category_name =カテゴリ名
* item_category_id =items.csv,train.csvへのリンク

In [None]:
# Distinct category ids referenced by items.csv, followed by a preview of
# the category table itself.
n_item_categories = items['item_category_id'].nunique(dropna=False)
print('item_category counts=', n_item_categories)
cats.head()

# train

* date=The date on which the item was sold.
* date_block_num=The sequential number of the month.
* shop_id=This number links to shops.csv.
* item_id=This number links to items.csv.
* item_price=The price of the item. It differs from shop to shop.
* item_cnt_day=How many items were sold on that day. Negative numbers are returns.



* date =アイテムが販売された日付。
* date_block_num =月の数。
* shop_id =この番号はshop.csvにリンクしています。
* item_id =この番号はitems.csvにリンクしています。
* item_price =アイテムの価格。ショップごとに異なります。
* item_cnt_day = 1日に販売されたアイテムの数。負の数は返品です。

In [None]:
# Split the 'day.month.year' date string into its three components.
date_parts = train['date'].str.split('.', expand=True)
date_parts.columns = ['day', 'month', 'year']

# Add a combined year+month key; `data` stays bound because a later cell
# reads data['year_month'].
data = date_parts.assign(year_month=date_parts['year'] + date_parts['month'])

# Attach only the year_month key to the sales rows and display the result.
train_concat = pd.concat((train, data.loc[:, ['year_month']]), axis=1)
train_concat

train starts at '2013-01' and ends at '2015-10', so the dataset covers 34 months.
This is the same as the number of distinct date_block_num values.

trainは「2013-01」から始まり「2015-10」で終わります。 そして、34か月がデータセットにあります。
34か月はdate_block_numの数と同じです。

In [None]:
# Read the month keys from train_concat instead of the scratch variable
# `data`: several plotting cells rebind `data` to a plotly trace, so this
# cell would fail if it were re-run after any of them.
months = train_concat['year_month'].unique()
print(months)
print('length of months=', len(months))
print('The number of date_block_num=', len(train['date_block_num'].unique()))

Next, I will check how many (shop_id, item_id) pairs there are.

次に、shop_idとitem_idのペアがいくつあるかを確認します。

In [None]:
# Average the numeric columns for every (shop_id, item_id) pair.
# numeric_only=True keeps the string `date` column out of the mean; without
# it pandas >= 2.0 raises a TypeError instead of silently skipping it.
data_mean = train.groupby(['shop_id', 'item_id'], as_index=False).mean(numeric_only=True)
data_mean

The number of (shop_id, item_id) pairs is 424123.

shop_idとitem_idのペアの数は424123です。



In [None]:
# Distinct item ids listed in items.csv versus those that appear in train.csv.
n_items_catalogue = len(items['item_id'].unique())
n_items_train = len(train['item_id'].unique())
print('counts of items (items.csv-train.csv) =', n_items_catalogue - n_items_train)

# Two bars, two fixed colours. The previous version coloured them from 100
# unseeded random numbers, so the chart changed on every run.
data = go.Bar(
    x=['items of items.csv', 'items of train.csv'],
    y=[n_items_catalogue, n_items_train],
    marker=dict(color=['firebrick', 'orange']),
)

layout = go.Layout(showlegend=False)
fig = go.Figure(data, layout)
fig.update_layout(title='The number of items_id')


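Compared with train.csv, 363 items are new. For new items that appear in the test set, the target value should be zero.

train.csvに比べて363アイテムが新品です。 新品のアイテムはテストセットの項目について、Target値はゼロである必要があります。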

Next, let's see how the price of an item differs from shop to shop,
for example for item_id = 40, 41, 43, 45.

店ごとに値段が違うのを見てみましょう、
item_id = 40,41,43,45の場合。

In [None]:
# For a few sample items, show how many shops share each mean price.
for item_id in (40, 41, 43, 45):
    # data_mean holds one row per (shop_id, item_id) with the mean price.
    item_rows = data_mean.loc[data_mean['item_id'] == item_id]
    price_counts = item_rows['item_price'].value_counts().sort_values(ascending=False)

    price_bar = go.Bar(x=price_counts.index, y=price_counts)
    axis_layout = go.Layout(
        showlegend=False,
        xaxis=dict(title='price'),
        yaxis=dict(title='counts'),
    )
    price_fig = go.Figure(data=price_bar, layout=axis_layout)
    price_fig.update_layout(title='items_id=' + str(item_id))
    price_fig.show()

# test

In [None]:
# Preview the first rows of the test table.
test.head()

# Null

In [None]:
# Per-column null counts for each input table, separated by a rule.
tables_to_check = (
    ('train.csv', train),
    ('items.csv', items),
    ('shops.csv', shops),
    ('cats.csv', cats),
)
for table_label, table in tables_to_check:
    print(table_label)
    print(table.isnull().sum())
    print('_______')

Great! There are no null values.

# Outliers

In [None]:
# Summary statistics of each input table; the single-letter names are kept
# so that anything relying on them afterwards still finds them.
t, i, s, c = (table.describe() for table in (train, items, shops, cats))

summaries = zip(('train.csv', 'items.csv', 'shops.csv', 'cats.csv'), (t, i, s, c))
for table_label, summary in summaries:
    print(table_label)
    print(summary)
    print('_______')

There are items with strange prices and sales. After detailed exploration I decided to remove items with price > 100000 and sales > 1001 (1000 is ok).

価格や売り上げがおかしい商品があります。 詳細な調査の結果、価格が100000を超え、売上が1001を超えるアイテムを削除することにしました（1000で問題ありません）。

Reference: [feature-engineering-xgboost](https://www.kaggle.com/dlarionov/feature-engineering-xgboost)

In [None]:
# Box plot of the daily sales counts on a fixed x-range.
cnt_fig, cnt_ax = plt.subplots(figsize=(10, 4))
cnt_ax.set_xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day, ax=cnt_ax)

# Box plot of the prices, with 10% head-room above the maximum price.
price_fig, price_ax = plt.subplots(figsize=(10, 4))
price_ax.set_xlim(train.item_price.min(), train.item_price.max() * 1.1)
sns.boxplot(x=train.item_price, ax=price_ax)

In [None]:
# Keep only rows with a price below 100000 and a daily count below 1001.
# .copy() makes `train` an independent frame, so the `train.loc[...] = ...`
# assignments in later cells do not raise SettingWithCopyWarning.
is_plausible = (train['item_price'] < 100000) & (train['item_cnt_day'] < 1001)
train = train[is_plausible].copy()

There is one item with price below zero. Fill it with median.

価格がゼロ未満のアイテムが1つあります。 中央値で埋めます。

In [None]:
# Overwrite every negative price with the median of the price column.
median_price = train['item_price'].median()
has_negative_price = train['item_price'] < 0
train.loc[has_negative_price, 'item_price'] = median_price

# Submission

In [None]:
# Load the sample submission to see the expected output format.
submission = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/sample_submission.csv'
)
submission.head()

## Target=item_cnt_month

Target is the item_cnt_month.
I need to change the item_cnt_day to item_cnt_month.

ターゲットはitem_cnt_month(月間売上高)です。
item_cnt_dayをitem_cnt_monthに変更する必要があります。


Let's look at the number of sales for each shop and item in one month.

次に、各ショップとアイテムの1か月間の売上高を見てみましょう。

In [None]:
# Build the monthly totals from the current `train` frame. `train_concat`
# was created before the outlier rows were removed and the negative price
# was replaced, so aggregating it would bring those rows back in.
date_parts = train['date'].str.split('.', expand=True)  # 0=day, 1=month, 2=year
train_monthly = train.assign(year_month=date_parts[2] + date_parts[1])

# numeric_only=True keeps the string `date` column out of the sum.
# NOTE(review): item_price and date_block_num are summed as well, as in the
# original; only item_cnt_month is a meaningful total.
data_b = (
    train_monthly
    .groupby(['shop_id', 'item_id', 'year_month'], as_index=False)
    .sum(numeric_only=True)
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

Several shops are duplicates of each other (according to their names). Fix the train and test sets.
いくつかの店は（その名前によると）重複です。 trainとtestを修正します。

In [None]:
# Duplicate shop id -> id it is merged into.
duplicate_shop_ids = {
    0: 57,   # Yakutsk, Ordzhonikidze 56
    1: 58,   # Yakutsk, "Tsentralny" shopping centre
    10: 11,  # Zhukovsky, Chkalova St. 39 m²
}

# Apply the same remapping to both the training and the test rows.
for sales_frame in (train, test):
    for duplicate_id, kept_id in duplicate_shop_ids.items():
        sales_frame.loc[sales_frame.shop_id == duplicate_id, 'shop_id'] = kept_id

# Shops/Cats/Items preprocessing

I need to derive additional codes from shop_name and item_category_name,
because they may turn out to be important features.

shop_nameとcategory_nameからさらにコードを作成する必要があります。

これらは重要な機能である可能性があるためです。

In [None]:
# Derive a numeric city code from shop_name.
# This one name is rewritten first so that its first space-separated token
# becomes 'СергиевПосад' rather than 'Сергиев'.
is_sergiev_posad = shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"'
shops.loc[is_sergiev_posad, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

# The city is the first space-separated token of the shop name.
shops['city'] = shops['shop_name'].str.split(' ').str[0]

# Fold the '!'-prefixed spelling into the plain city name.
has_bang_prefix = shops['city'] == '!Якутск'
shops.loc[has_bang_prefix, 'city'] = 'Якутск'

city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city'])

# Only the id and the encoded city are kept from here on.
shops = shops.loc[:, ['shop_id', 'city_code']]
shops.head()

In [None]:
# Derive type and subtype codes from item_category_name, which is split on '-'.
name_parts = cats['item_category_name'].str.split('-')
cats['split'] = name_parts

# Type: the text before the first '-', trimmed.
cats['type'] = name_parts.str[0].str.strip()

# Subtype: the text after the first '-', trimmed; names without a '-' fall
# back to their type.
cats['subtype'] = name_parts.str[1].str.strip().fillna(cats['type'])

cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])

# Keep only the id and the two encoded columns.
cats = cats.loc[:, ['item_category_id', 'type_code', 'subtype_code']]

cats.head()

In [None]:
# Drop the free-text item name. errors='ignore' lets the cell be re-run
# without a KeyError once the column is already gone.
items = items.drop(columns=['item_name'], errors='ignore')

I'm continuing to write the notebook.