## ライブラリやデータのインポート

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
# NOTE: `dirname` stays bound to the last directory visited once the loop
# ends, and later cells in this notebook reference it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pandas_profiling

In [None]:
from pandas_profiling import ProfileReport

In [None]:
# Locate the data directory explicitly. `dirname` is only the leftover loop
# variable of the file-listing cell, i.e. the *last* directory os.walk
# visited, which is the dataset directory only by coincidence.
data_dir = next(
    (
        root
        for root, _, files in os.walk('/kaggle/input')
        if 'sales_train.csv' in files
    ),
    dirname,  # fall back to the previous behaviour if the search finds nothing
)

items = pd.read_csv(os.path.join(data_dir, 'items.csv'))
shops = pd.read_csv(os.path.join(data_dir, 'shops.csv'))
cats = pd.read_csv(os.path.join(data_dir, 'item_categories.csv'))
train = pd.read_csv(os.path.join(data_dir, 'sales_train.csv'))
# set index to ID to avoid dropping it later
test = pd.read_csv(os.path.join(data_dir, 'test.csv')).set_index('ID')


### File descriptions

- sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
- test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.
- sample_submission.csv - a sample submission file in the correct format.
- items.csv - supplemental information about the items/products.
- item_categories.csv - supplemental information about the items categories.
- shops.csv - supplemental information about the shops.

### Data fields

- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category
- item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
- item_price - current price of an item
- date - date in format dd/mm/yyyy
- date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
- item_name - name of item
- shop_name - name of shop
- item_category_name - name of item category


## データの確認（まずは基本となるsales_train.csvから）

In [None]:
# データの閲覧
train.head()

In [None]:
train.tail()

In [None]:
# 型の確認
train.info()

In [None]:
# date: convert from object dtype to datetime before further processing
# (an object column also makes pandas-profiling slow).
# The dates are documented as day-first (dd/mm/yyyy), so state that explicitly;
# otherwise ambiguous values such as 02/01/2013 are parsed month-first.
train["date"] = pd.to_datetime(train["date"], dayfirst=True)

In [None]:
# data量の確認
# pandas profilingは重いので少し時間がかかりそう
train.shape

In [None]:
# NULLチェック
train.isnull().sum()

In [None]:
# shop数
len(train["shop_id"].unique())

In [None]:
# Number of records per shop, ascending.
# size() counts rows directly; count().iloc[:, 0] counted the non-null values
# of whichever column happened to come first.
train.groupby("shop_id").size().sort_values()

In [None]:
# item数
len(train["item_id"].unique())

In [None]:
# 統計量の確認
train.describe()

In [None]:
## Some rows have a negative item_price: show every such row, not only those equal to -1
train[train["item_price"] < 0]

## Train/Testの差をみる

In [None]:
# Compare the shop_id values present in test and in train:
# print both distinct counts, the size of the overlap, then the overlap itself.
test_shops = test["shop_id"].unique()
train_shops = train["shop_id"].unique()
intersection = set(train_shops) & set(test_shops)
print(len(test_shops), len(train_shops), len(intersection), sep="\n")
print(intersection)


In [None]:
# Compare the item_id values present in test and in train:
# print both distinct counts and the size of the overlap.
train_items = train["item_id"].unique()
test_items = test["item_id"].unique()
intersection = set(train_items) & set(test_items)
print(len(test_items), len(train_items), len(intersection), sep="\n")

In [None]:
# Drop every train row whose shop_id or item_id does not appear in the test
# data (both conditions applied in a single boolean mask).
train = train[
    train["shop_id"].isin(test_shops) & train["item_id"].isin(test_items)
]

## ただし、データを残したほうが精度が上がる可能性もあるし、目的によっては残したほうがよい可能性もある
## ここでは扱いやすさを重視してデータをdropする。

In [None]:
# 2935849 -> 1224439 なので大きく減少
train.shape

## Pandas Profiling

In [None]:
profile = ProfileReport(train.sample(frac=0.01), title="Pandas Profiling Report")

In [None]:
profile

## データ確認（全部マージ）

In [None]:
items.head()

In [None]:
shops.head()

In [None]:
shops

In [None]:
cats.head()

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
cats

- Аксессуары: アクセサリー
- Игровые консоли: ゲーム機
- Игры: ゲーム
- Карты оплаты: 支払いカード
- Кино: 映画
- Книги: 書籍
- Музыка: 音楽
- Подарки: プレゼント
- Программы: プログラム
- Служебные: サービス
- Элементы питания: バッテリー

In [None]:
def merge_data(train):
    """Left-join the shop, item and category tables onto a sales-like frame.

    The module-level ``shops``, ``items`` and ``cats`` frames are merged onto
    ``train`` in that order, keyed on ``shop_id``, ``item_id`` and
    ``item_category_id`` respectively. The merged frame is returned; the
    argument itself is not modified.
    """
    lookups = (
        (shops, "shop_id"),
        (items, "item_id"),
        (cats, "item_category_id"),
    )
    merged = train
    for table, key in lookups:
        merged = merged.merge(table, how="left", on=key)
    return merged
    

In [None]:
train_merged = merge_data(train)
test_merged = merge_data(test)

In [None]:
train_merged.head()

In [None]:
def fix_duplicates(train):
    """Fold duplicated shops onto a single shop_id (used for train and test).

    Several shops are duplicates of each other according to their names. The
    ``shop_id`` column of ``train`` is rewritten in place and the same frame
    is returned.
    """
    # duplicate shop_id -> shop_id it is folded into
    duplicate_to_kept = {
        0: 57,   # Якутск Орджоникидзе, 56
        1: 58,   # Якутск ТЦ "Центральный"
        10: 11,  # Жуковский ул. Чкалова 39м²
    }
    for duplicate_id, kept_id in duplicate_to_kept.items():
        train.loc[train.shop_id == duplicate_id, 'shop_id'] = kept_id
    return train


In [None]:
train_merged = fix_duplicates(train_merged)
test_merged = fix_duplicates(test_merged)

## Pandas Profiling for Merged Data

In [None]:
# データ量が多いのでサンプリング
train_merged_sample = train_merged.sample(frac=0.1)
train_merged_sample.shape

In [None]:
profile_merged = ProfileReport(train_merged_sample, title="Pandas Profiling Report")

In [None]:
profile_merged

## 特徴量エンジニアリング

In [None]:
# Build time-series (calendar) features
def make_date_features(train):
    """Add ``year`` and ``month`` columns derived from ``train["date"]``.

    If the ``date`` column still holds text it is first converted to
    datetime, parsed day-first. A column that is already datetime is used as
    is. The frame is modified in place and also returned.

    Raises:
        TypeError: if ``date`` is neither datetime nor text, since no
            features can be derived from it.
    """
    dates = train["date"]
    # The previous check `dtype == "datetime"` never matched a datetime64
    # column, so already-converted frames got no features and None came back.
    if not pd.api.types.is_datetime64_any_dtype(dates):
        is_text = (
            pd.api.types.is_object_dtype(dates)
            or pd.api.types.is_string_dtype(dates)
        )
        if not is_text:
            raise TypeError(
                "Please check dtype of date column: {0}".format(dates.dtype)
            )
        # dayfirst=True keeps ambiguous values such as 02.01.2013 from being
        # read month-first.
        train["date"] = pd.to_datetime(dates, dayfirst=True)

    train["year"] = train["date"].dt.year
    train["month"] = train["date"].dt.month
    #train["day"] = train.date.dt.day
    #train["dayofweek"] = train.date.dt.dayofweek
    return train
   


In [None]:
# Revenue of each daily record, then the distribution of that revenue summed
# per (month, shop, item). The column is selected before summing so that only
# sales_day is aggregated; summing every column first also tries to add up the
# date and text columns, which newer pandas rejects.
train_merged["sales_day"] = train_merged["item_price"] * train_merged["item_cnt_day"]
train_merged.groupby(["date_block_num", "shop_id", "item_id"])["sales_day"].sum().hist(bins=1000, log=True)

In [None]:
# Total revenue per month; select the column first so only it is summed
train_merged.groupby("date_block_num")["sales_day"].sum()

In [None]:
# Plot total revenue per month; select the column first so only it is summed
train_merged.groupby("date_block_num")["sales_day"].sum().plot()

In [None]:
# Units sold per month; select the column first so only it is summed
train_merged.groupby("date_block_num")["item_cnt_day"].sum()

In [None]:
# Plot units sold per month; select the column first so only it is summed
train_merged.groupby("date_block_num")["item_cnt_day"].sum().plot()

In [None]:
# Plot the mean item price per month; select the column first so the mean is
# not attempted on the text columns as well
train_merged.groupby("date_block_num")["item_price"].mean().plot()

In [None]:
train_merged.head()

## 月毎,shop毎,item毎集計

In [None]:
def make_monthly_sales(sales):
    """Aggregate sales rows per (date_block_num, shop_id, item_id).

    For every group the result holds the earliest and latest ``date``, the
    mean ``item_price`` and the summed ``item_cnt_day``. The returned frame is
    indexed by the three grouping keys and has two-level column labels of the
    form (source column, aggregation).
    """
    group_keys = ["date_block_num", "shop_id", "item_id"]
    aggregations = {
        "date": ["min", "max"],
        "item_price": "mean",
        "item_cnt_day": "sum",
    }
    grouped = sales.groupby(group_keys)[list(aggregations)]
    return grouped.agg(aggregations)

In [None]:
train.head()

In [None]:
train_month = make_monthly_sales(train)

In [None]:
train_month.head()

In [None]:
def get_converted_multi_columns(df, *, to_snake_case=True):
    """Return flat names built from the first two levels of ``df.columns``.

    With ``to_snake_case`` truthy the two parts are joined by an underscore
    (``("date", "min")`` -> ``"date_min"``); otherwise the second part is
    capitalized and appended directly (``"dateMin"``).
    """
    def _snake(label):
        return label[0] + "_" + label[1]

    def _camel(label):
        return label[0] + label[1].capitalize()

    join_label = _snake if to_snake_case else _camel
    return list(map(join_label, df.columns.values))


In [None]:
# The aggregation left both the rows and the columns with nested (multi-level)
# labels, so flatten them back into a plain table: first give the columns
# single-string names, then move the group keys out of the index.
flat_cols = get_converted_multi_columns(train_month)
train_month.columns = flat_cols
train_month = train_month.reset_index()


In [None]:
train_month.head()

In [None]:
print(train.shape)
print(train_month.shape)

In [None]:
# NOTE(review): this merges the daily `train` frame again; the monthly
# `train_month` table built above is not used from here on. Confirm whether
# `train_month` was meant to be merged and profiled instead.
train_merged = merge_data(train)

In [None]:
train_merged = fix_duplicates(train_merged)

In [None]:
train_merged

In [None]:
profile_month = ProfileReport(train_merged, title="Pandas Profiling Report")

In [None]:
profile_month