In [None]:
from subprocess import check_output
# List the files shipped with the Instacart dataset to see what is available.
input_listing = check_output(["ls", "../input/instacart-market-basket-analysis"])
print(input_listing.decode("utf8"))

In [None]:
import os
import zipfile

# Unpack every zip archive found under the dataset's input directory into the
# writable working directory.
for dirname, _, filenames in os.walk('/kaggle/input/instacart-market-basket-analysis/'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # Skip anything that is not a zip archive instead of crashing with BadZipFile.
        if not zipfile.is_zipfile(archive_path):
            continue
        # The context manager closes the archive even if extraction raises.
        with zipfile.ZipFile(archive_path, mode='r') as archive:
            archive.extractall(path="/kaggle/working")

# List the working directory to confirm which files were extracted.
print(check_output(["ls", "../working"]).decode("utf8"))

In [None]:
import pandas as pd

# Read each extracted CSV into its own DataFrame; target names and paths are
# paired by position, and the files are read in the order listed.
(
    order_products_train,
    order_products_prior,
    orders,
    products,
    aisles,
    departments,
    sample_submission,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../working/order_products__train.csv",
        "../working/order_products__prior.csv",
        "../working/orders.csv",
        "../working/products.csv",
        "../working/aisles.csv",
        "../working/departments.csv",
        "../working/sample_submission.csv",
    )
)

print('*** df 생성 완료 ***')

In [None]:
# Display the orders table loaded from orders.csv as the cell result.
orders

In [None]:
# Show the row(s) of the orders table whose order_id is 1.
orders.loc[orders['order_id'] == 1]

In [None]:
# Show the train-set product rows that belong to order_id 1.
order_products_train.loc[order_products_train['order_id'] == 1]

In [None]:
# Distinct values present in the eval_set column of orders.
orders['eval_set'].unique()

In [None]:
# Keep only the orders whose eval_set is 'train' or 'prior'.
filter_orders = orders[orders['eval_set'].isin(['train', 'prior'])]
# Confirm which eval_set values remain after filtering.
filter_orders['eval_set'].unique()

In [None]:
# Number of distinct orders kept by the eval_set filter ...
print(filter_orders['order_id'].nunique())
# ... next to the combined distinct-order counts of the two order-product
# tables (shown as the cell result) so the two figures can be compared.
order_products_train['order_id'].nunique() + order_products_prior['order_id'].nunique()

## Merge data set : products + aisles + departments

In [None]:
# Attach aisle and department rows to every product via left joins, then put
# the columns in a fixed order.
merged_products = (
    products
    .merge(aisles, on='aisle_id', how='left')
    .merge(departments, on='department_id', how='left')
    .reindex(columns=['product_id','product_name','aisle_id','aisle','department_id','department'])
)
merged_products.head()

## Merge data set : order_products_train + order_products_prior

In [None]:
# Stack the train and prior order-product rows into one table.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based labels and the result carries duplicate index values.
train_prior = pd.concat([order_products_train, order_products_prior], ignore_index=True)
train_prior

## Merge data set : train_prior + merged_products

In [None]:
# Left-join the product / aisle / department columns onto every order-product row.
# NOTE: this rebinds train_prior, so the cell is meant to be run once per kernel session.
train_prior = train_prior.merge(merged_products, on='product_id', how='left')
train_prior

In [None]:
# Drop the name merged_products; its columns were already joined into train_prior above.
del merged_products

## Merge data set : train_prior + filter_orders

In [None]:
# Print the (rows, columns) of the filtered orders table, then show the
# shape of train_prior as the cell result, ahead of merging the two.
print(filter_orders.shape)
train_prior.shape

In [None]:
# Outer-join the order-product rows with the order-level columns on order_id.
merged_df = pd.merge(train_prior, filter_orders, on='order_id', how='outer')
merged_df

In [None]:
# Drop the names of the raw input tables that are no longer used below.
del (
    order_products_train,
    order_products_prior,
    orders,
    products,
    aisles,
    departments,
    sample_submission,
)

In [None]:
# Per-column count of missing values in the merged table.
print(merged_df.isna().sum())
# Column dtypes, non-null counts and memory usage summary.
merged_df.info()

In [None]:
# Number of distinct orders present in the merged table.
merged_df['order_id'].nunique()

In [None]:
# Memory errors kept occurring on the full table, so for now only about
# one third of the data (the first 10,000,000 rows) is used.

merged_df = merged_df.head(n=10_000_000)

## 불필요한 칼럼 제거 

In [None]:
# Remove every column listed here from the merged table.
unused_columns = [
    'add_to_cart_order',
    'reordered',
    'user_id',
    'eval_set',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'product_id',
    'product_name',
    'aisle_id',
    'aisle',
    'department_id',
]
filtered = merged_df.drop(columns=unused_columns)
filtered

In [None]:
# One-hot encode the remaining table with pandas' default column selection.
onehot_filtered = pd.get_dummies(data=filtered)
onehot_filtered

In [None]:
# Collapse the one-hot rows to one row per order by summing the indicator
# columns. groupby(...).sum() returns a new frame rather than modifying in
# place, so the result must be assigned back; otherwise the aggregation is
# discarded and later cells keep working on the per-row table.
onehot_filtered = onehot_filtered.groupby(by=['order_id']).sum()
onehot_filtered

## 상관계수

In [None]:
# Pairwise Pearson correlation (the pandas default) between the columns of the one-hot table.
df = onehot_filtered.corr(method='pearson')
df

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import seaborn as sns   
import numpy as np

# Set the figure size.
fig, ax = plt.subplots(figsize=(25, 20))

# Build a triangular mask: True on the upper triangle (diagonal included),
# False on the lower triangle.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.zeros_like(df, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw the heatmap on the axes created above.
sns.heatmap(df,
            cmap='RdYlBu_r',
            annot=True,                # print the actual value in each cell
            mask=mask,                 # cells where the mask is True are not drawn
            linewidths=.5,             # separate cells with thin lines
            cbar_kws={"shrink": .5},   # shrink the colour bar to half size
            vmin=-1, vmax=1,           # colour bar range -1 to 1
            ax=ax,                     # draw explicitly on the figure sized above
           )
plt.show()