# 0. Load Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install py7zr
import py7zr

In [None]:
import py7zr
from subprocess import check_output

# Unpack every 7z archive found under the input directory into the working
# directory. Files that are not 7z archives are skipped instead of being
# handed to py7zr, and the context manager makes sure each archive is closed
# even when extraction fails part-way.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        if not py7zr.is_7zfile(archive_path):
            continue
        with py7zr.SevenZipFile(archive_path, mode='r') as archive:
            archive.extractall(path="/kaggle/working")

# List what ended up in the working directory.
print(check_output(["ls", "../working"]).decode("utf8"))

In [None]:
def _read_table(stem):
    """Read ``../working/<stem>.csv`` into a DataFrame with pandas' default options."""
    return pd.read_csv(f"../working/{stem}.csv")


# One DataFrame per extracted CSV file.
df_train = _read_table("train")
df_test = _read_table("test")
df_sub = _read_table("sample_submission")
df_stores = _read_table("stores")
df_items = _read_table("items")
df_trans = _read_table("transactions")
df_oil = _read_table("oil")
df_holiday = _read_table("holidays_events")

# 1. Basic EDA

## 1) train.csv

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
df_train.head()

In [None]:
df_train.tail()

In [None]:
# Per-column summary of df_train: a header with the column name, the number
# of distinct values, then the frequency of each value (missing values get
# their own row because dropna=False).
train_columns = df_train.columns.tolist()

for column_name in train_columns:
    column = df_train[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False, sort=True, dropna=False))

## 2) test.csv
- Test data, with the date, store_nbr, item_nbr combinations that are to be predicted, along with the onpromotion information.
- NOTE: The test data has a small number of items that are not contained in the training data. Part of the exercise will be to predict sales of new items based on similar products.
- The public / private leaderboard split is based on time. All items in the public split are also included in the private split.


In [None]:
df_test.shape

In [None]:
df_test.info()

In [None]:
df_test.head()

In [None]:
df_test.tail()

In [None]:
# Per-column summary of df_test: a header with the column name, the number
# of distinct values, then the frequency of each value (missing values get
# their own row because dropna=False).
test_columns = df_test.columns.tolist()

for column_name in test_columns:
    column = df_test[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False, sort=True, dropna=False))

In [None]:
# Inspect the test rows that are flagged as on promotion

df_test[df_test['onpromotion'] == True ]

## 3) sample_submission.csv

In [None]:
df_sub.head()

## 4) stores.csv
- Store metadata, including city, state, type, and cluster.
- cluster is a grouping of similar stores.


In [None]:
df_stores.shape

In [None]:
df_stores.head()

In [None]:
df_stores.tail()

In [None]:
# Per-column summary of df_stores. The first column is skipped on purpose
# (the slice starts at position 1), so only the remaining columns are shown.
stores_columns = df_stores.columns.tolist()

for column_name in stores_columns[1:]:
    column = df_stores[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False))


In [None]:
df_stores['city'].nunique()

## 5) items.csv

- Item metadata, including family, class, and perishable.
- NOTE: Items marked as perishable have a score weight of 1.25; otherwise, the weight is 1.0.



In [None]:
df_items.shape

In [None]:
df_items.info()

In [None]:
df_items.head()

In [None]:
df_items.tail()

In [None]:
# Per-column summary of df_items: a header with the column name, the number
# of distinct values, then the frequency of each value.
item_columns = df_items.columns.tolist()

for column_name in item_columns:
    column = df_items[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False))

In [None]:
# Check which families the perishable items mostly belong to

df_items[df_items['perishable'] == 1]['family'].value_counts(normalize=False)

## 6) transactions.csv

- The count of sales transactions for each date, store_nbr combination. Only included for the training data timeframe.



In [None]:
df_trans.shape

In [None]:
df_trans.head()

In [None]:
df_trans.tail()

In [None]:
df_trans['transactions'].describe()

In [None]:
# Per-column summary of df_trans: a header with the column name, the number
# of distinct values, then the frequency of each value.
trans_columns = df_trans.columns.tolist()

for column_name in trans_columns:
    column = df_trans[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False))

## 7) oil.csv

- Daily oil price. Includes values during both the train and test data timeframe. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)



In [None]:
df_oil.shape

In [None]:
df_oil.info()

In [None]:
df_oil.head()

In [None]:
df_oil.tail()

In [None]:
# Per-column summary of df_oil: a header with the column name, the number
# of distinct values, then the frequency of each value (missing values get
# their own row because dropna=False).
oil_columns = df_oil.columns.tolist()

for column_name in oil_columns:
    column = df_oil[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False, sort=True, dropna=False))

In [None]:
# Show the dates whose oil price is missing.
# A missing price is a float NaN, which never compares equal to the string
# 'NaN' (or to any other value), so the rows have to be selected with isna().

df_oil[df_oil['dcoilwtico'].isna()]

## 8) holidays_events.csv
- Holidays and Events, with metadata
- NOTE: Pay special attention to the transferred column. A holiday that is transferred officially falls on that calendar day, but was moved to another date by the government. A transferred day is more like a normal day than a holiday. To find the day that it was actually celebrated, look for the corresponding row where type is Transfer. For example, the holiday Independencia de Guayaquil was transferred from 2012-10-09 to 2012-10-12, which means it was celebrated on 2012-10-12. Days that are type Bridge are extra days that are added to a holiday (e.g., to extend the break across a long weekend). These are frequently made up by the type Work Day which is a day not normally scheduled for work (e.g., Saturday) that is meant to payback the Bridge.
- Additional holidays are days added to a regular calendar holiday, for example, as typically happens around Christmas (making Christmas Eve a holiday).


In [None]:
df_holiday.shape

In [None]:
df_holiday.info()

In [None]:
df_holiday.head(50)

In [None]:
df_holiday.tail()

In [None]:
df_holiday[df_holiday['type'] == 'Bridge']

In [None]:
df_holiday[df_holiday['type'] == 'Work Day']

In [None]:
# Per-column summary of df_holiday: a header with the column name, the number
# of distinct values, then the frequency of each value (missing values get
# their own row because dropna=False).
holiday_columns = df_holiday.columns.tolist()

for column_name in holiday_columns:
    column = df_holiday[column_name]
    print("***", column_name, "***")
    print(column.nunique(), '개')
    print(column.value_counts(normalize=False, sort=True, dropna=False))

# 2. Basic Visualization

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

## 1) stores

In [None]:
df_stores.head()

In [None]:
# Stores distribution across cities

fig, ax = plt.subplots()
fig.set_size_inches(8, 8)
ax = sns.countplot(y=df_stores['city'], data=df_stores) 

- Quito, Guayaquil 시에 압도적으로 많은 수의 상점이 위치해있음을 확인할 수 있음 

In [None]:
# Stores distribution across states

fig, ax = plt.subplots()
fig.set_size_inches(8, 8)
ax = sns.countplot(y=df_stores['state'], data=df_stores) 

- Quito, Guayaquil시를 포함하는 Pichincha, Guayas 주에 상점이 많이 분포되어 있음

In [None]:
# Different types of stores

fig, ax = plt.subplots()
fig.set_size_inches(10, 7)
ax = sns.countplot(x="type", data=df_stores, palette="Set2")

- 상점의 개수는 D > C > A > B > E 순으로 분포되어 있음

In [None]:
# Types of stores across cities

ct = pd.crosstab(df_stores.city, df_stores.type)
ct

In [None]:
ct.plot.bar(figsize = (12, 6), stacked=True)
plt.legend(title = 'type')

plt.show()

In [None]:
# Types of stores across states

ct = pd.crosstab(df_stores.state, df_stores.type)

ct.plot.bar(figsize = (12, 6), stacked=True)
plt.legend(title = 'type')

plt.show()

- C와 D class 상점들은 다양한 주에 포진되어 있음
- Guayas주는 유일하게 모든 등급의 상점이 위치해 있음
- Pichincha 주는 가장 많은 상점이 분포해 있으나, E등급의 상점은 입점해 있지 않음

In [None]:
df_stores.sort_values(by=['state'])

In [None]:
# Distribution of different clusters

fig, ax = plt.subplots()
fig.set_size_inches(12, 7)
ax = sns.countplot(x="cluster", data=df_stores)

In [None]:
# Types against clusters

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever one is installed.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# Number of stores for every (type, cluster) pair.
# size() counts every row of a group, whereas count() skips NaN values.
type_cluster = df_stores.groupby(['type','cluster']).size()
type_cluster

In [None]:
type_cluster.unstack().plot(kind='bar',stacked=True, colormap= 'PuBu', figsize=(13,11),  grid=False)
plt.title('Stacked Barplot of Store types and their cluster distribution', fontsize=18)
plt.ylabel('Count of clusters in a particular store type', fontsize=16)
plt.xlabel('Store type', fontsize=16)
plt.show()

In [None]:
# cluster of stores across the different cities

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever one is installed.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# Number of stores for every (city, cluster) pair, drawn as one stacked bar per city.
city_cluster = df_stores.groupby(['city','cluster']).store_nbr.size()
city_cluster.unstack().plot(kind='bar',stacked=True, colormap= 'viridis', figsize=(13,11),  grid=False)
plt.title('Stacked Barplot of Store cluster opened for each city')
plt.ylabel('Count of stores for a particular city')
plt.show()

## 2) items

In [None]:
df_items.head()

In [None]:
# Distribution of various families of items

fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
ax = sns.countplot(y = "family", data = df_items)

In [None]:
# Distribution of perishable goods by family

ct = pd.crosstab(df_items.family, df_items.perishable)
ct.plot.bar(figsize = (12, 7), stacked=True)
plt.legend(title='perishable')
plt.show()

In [None]:
# Distribution of the number of unique classes per family of items.

xc = df_items.groupby(['family'])['class'].nunique()
xc

In [None]:
xc2 = df_items.groupby(['family'])['class'].size()
xc2

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(12, 6)
xc.plot.bar(color='skyblue')
plt.show()

## 3) transactions

In [None]:
df_trans.head()

In [None]:
# Total transaction volume per store.
# Only 'transactions' is aggregated: a bare .sum() would also try to add up
# the 'date' strings (string concatenation on pandas 2.x), which is
# meaningless and wasteful.

amount = df_trans.groupby(['store_nbr'])[['transactions']].sum()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
ax = sns.barplot(x = amount.index, y= "transactions", data = amount)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 75, fontsize = 9)

In [None]:
# Attach the store metadata to the per-store totals, then list the stores
# from the highest to the lowest total transaction volume.
amount_trans = amount.merge(df_stores, how='left', left_on='store_nbr', right_on='store_nbr')
amount_trans.sort_values(by=['transactions'], ascending=False)

In [None]:
# Number of recorded rows per store (non-null count of each column).

number = df_trans.groupby(['store_nbr']).count()
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=number.index, y="transactions", data=number, ax=ax)
# Tilt the store labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=75, fontsize=9)

In [None]:
# Attach the store metadata to the per-store row counts, then list the
# stores from the fewest to the most recorded rows.
number_trans = number.merge(df_stores, how='left', left_on='store_nbr', right_on='store_nbr')
number_trans.sort_values(by=['transactions'])

In [None]:
# Total transaction volume per date (summed over all stores).
# Only 'transactions' is aggregated: a bare .sum() would also add up the
# store_nbr identifiers, which has no meaning.

amount_date = df_trans.groupby(['date'])[['transactions']].sum()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
ax = sns.barplot(x = amount_date.index, y= "transactions", data = amount_date)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 75, fontsize = 9)

In [None]:
amount_date

In [None]:
# Split the date strings into their year / month / day parts

# Work on a copy so df_trans itself keeps its original columns.
re_trans = df_trans.copy()
# Each element becomes a list of the '-'-separated pieces of the date string.
dates = re_trans['date'].str.split('-')
print(dates)

In [None]:
# Add year / month / day columns (kept as strings) taken from the split date
# parts, then keep only the columns used afterwards, in this order.
re_trans = re_trans.assign(
    year=dates.str.get(0),
    month=dates.str.get(1),
    day=dates.str.get(2),
)[['year','month','day','store_nbr','transactions']]
re_trans

In [None]:
# Row(s) with the largest daily transaction count
df_trans[df_trans['transactions'] == df_trans['transactions'].max()]

In [None]:
# Dates that appear among the 100 rows with the most transactions
top_trans = df_trans.nlargest(100, 'transactions')
print(top_trans.date.unique())

In [None]:
# Row(s) with the smallest daily transaction count
df_trans[df_trans['transactions'] == df_trans['transactions'].min()]

In [None]:
# Dates that appear among the 100 rows with the fewest transactions.
# nsmallest mirrors the nlargest call used for the busiest rows and avoids
# sorting the whole frame just to keep its first 100 rows.
lower_transe = df_trans.nsmallest(100, 'transactions')
print(lower_transe.date.unique())

In [None]:
# Change in total transaction volume per year.
# Only 'transactions' is aggregated: re_trans also holds string-valued
# month/day columns and store_nbr identifiers, and a bare .sum() would try to
# add those up as well.

year_amount = re_trans.groupby(['year'])[['transactions']].sum()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
ax = sns.barplot(x = year_amount.index, y= "transactions", data = year_amount)
ax.set_xticklabels(ax.get_xticklabels(), fontsize = 9)

In [None]:
year_amount[['transactions']]

In [None]:
re_trans

In [None]:
# Monthly trend of transaction volume (x = year / hue = month) -- this cell has no plotting code yet

## 4) oil

In [None]:
df_oil.head()

In [None]:
# Check for null values with the missingno library

import missingno as msno

msno.matrix(df_oil)

In [None]:
# Date(s) on which the oil price reached its maximum
df_oil[df_oil['dcoilwtico'] == df_oil['dcoilwtico'].max()]

In [None]:
# Date(s) on which the oil price reached its minimum
df_oil[df_oil['dcoilwtico'] == df_oil['dcoilwtico'].min()]

In [None]:
# ax = sns.boxplot(x=df_oil["dcoilwtico"])
sns.boxplot(x = "dcoilwtico",  data = df_oil)
plt.show()

In [None]:
df_oil['dcoilwtico'].describe()

In [None]:
# Oil price trend over time.
# 'date' is loaded from the CSV as plain strings; plotting those directly
# treats every date as a separate category on the x-axis. Converting to
# datetimes gives a proper, readable time axis.
sns.lineplot(x=pd.to_datetime(df_oil['date']), y=df_oil['dcoilwtico'])

## 5) holidays_events

In [None]:
df_holiday.head()

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 6)
ax = sns.countplot( y="type", data=df_holiday, palette="RdBu")

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever one is installed.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Number of entries for every (locale_name, type) pair, drawn as one stacked bar per locale.
holiday_local_type = df_holiday.groupby(['locale_name', 'type']).size()
holiday_local_type.unstack().plot(kind='bar',stacked=True, colormap= 'magma_r', figsize=(12,10),  grid=False)
plt.title('Stacked Barplot of locale name against event type')
plt.ylabel('Count of entries')
plt.show()

## 6) train

In [None]:
df_train.head()

In [None]:
# Total unit sales per store.
# Only 'unit_sales' is aggregated: a bare .sum() over df_train would add up
# every other column as well (e.g. item_nbr), which is meaningless and wasteful.
amount_store = df_train.groupby(['store_nbr'])[['unit_sales']].sum()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
ax = sns.barplot(x = amount_store.index, y= "unit_sales", data = amount_store)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 75, fontsize = 9)

In [None]:
# Attach the store metadata to the per-store sales totals, then list the
# stores from the highest to the lowest total unit sales.
amount_sales = amount_store.merge(df_stores, how='left', left_on='store_nbr', right_on='store_nbr')
amount_sales.sort_values(by=['unit_sales'], ascending=False)

In [None]:
# Total unit sales per item.
# Only 'unit_sales' is aggregated: a bare .sum() over df_train would add up
# every other column as well (e.g. store_nbr), which is meaningless and wasteful.

amount_item = df_train.groupby(['item_nbr'])[['unit_sales']].sum()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
ax = sns.barplot(x = amount_item.index, y= "unit_sales", data = amount_item)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 75, fontsize = 9)

In [None]:
# Top 10 items by total unit sales

# Attach the item metadata to the per-item totals before ranking.
item_sales = amount_item.merge(df_items, how='left', left_on='item_nbr', right_on='item_nbr')
item_sales.sort_values(by=['unit_sales'], ascending=False).head(10)

In [None]:
# Bottom 10 items by total unit sales

# Attach the item metadata to the per-item totals before ranking.
item_sales = amount_item.merge(df_items, how='left', left_on='item_nbr', right_on='item_nbr')
item_sales.sort_values(by=['unit_sales'], ascending=True).head(10)

In [None]:
set(df_train['onpromotion'])

In [None]:
df_train.onpromotion.hist()