# *EDA*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")

# Load the raw order lines; the relative path expects orders.csv in the working directory.
od_raw = pd.read_csv(r"orders.csv")

# DataFrame.info() prints its report and returns None, so it is called on its own
# rather than handed to display(), which would otherwise render a stray "None".
od_raw.info()
display(od_raw.head(), od_raw.isnull().sum())

In [None]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run safely.
od = od_raw.copy()

# Replace the source column names with Korean labels (positional: order must match the CSV).
od.columns = ["창고", "고객주문번호", "CJ주문번호", "주문유형", "주문날짜", "주문시간", "고객사코드", "주문금액", "품목순번",
             "품목코드", "브랜드", "품목수량", "품목금액", "수신여부", "주문생성시간", "택배구분", "상품주문번호", "중개업체주문번호",
             "접수여부", "배달예정점소코드", "배달예정사원코드", "터미널코드", "터미널소분류코드", "입력자ID", "입력일자",
             "입력시간", "권역구분", "배송처별주문분할여부", "송화인 광역주소", "송화인 지역주소", "수화인 광역주소", "수화인 지역주소",
             "주문월", "주문일", "주문요일", "주문시간대"]

# pandas 2.x rejects the unit-less "datetime64" dtype, so the unit is spelled out.
# Bracket access is used so the assignment can never create a plain attribute by mistake.
od["주문날짜"] = od["주문날짜"].astype("datetime64[ns]")

# 1. F/C별 탐색

## 1 - 1) F/C별 데이터 크기

In [None]:
# Split the orders by warehouse code and report each subset's date span and row count.
is_gonjiam = od["창고"] == "KX007"   # Gonjiam
is_gunpo = od["창고"] == "GP001"     # Gunpo
od_kx = od[is_gonjiam]
od_gp = od[is_gunpo]

kx_dates = od_kx["주문날짜"]
gp_dates = od_gp["주문날짜"]

print("곤지암fc 데이터 주문날짜 :", kx_dates.min(), "~", kx_dates.max())
print("곤지암fc 데이터 크기 : ", len(od_kx))
print("군포fc 데이터 주문날짜 :", gp_dates.min(), "~", gp_dates.max())
print("군포fc 데이터 크기 : ", len(od_gp))

In [None]:
# Line chart of the number of order rows per order date, one line per warehouse.
plt.figure(figsize=(20, 12))
for fc_orders, line_colour, fc_label in ((od_kx, "r", "곤지암"), (od_gp, "b", "군포")):
    # count() tallies non-null 품목수량 rows for each date.
    rows_per_day = fc_orders.groupby("주문날짜")["품목수량"].count()
    plt.plot(rows_per_day, marker="o", c=line_colour, label=fc_label)

plt.title("F/C별 날짜에 따른 주문수", fontsize=20)
plt.xticks(rotation=45, fontsize=15)
plt.legend()
plt.grid()
plt.show()

## 1 - 2) F/C별 고객사 총 품목수량 비율

In [None]:
# Side-by-side pie charts: share of total item quantity per customer code, one pie per warehouse.
fig, ax = plt.subplots(1, 2, figsize=(16, 12))

pie_specs = (
    (od_kx, ax[0], "곤지암FC 고객사별 총 품목수량"),
    (od_gp, ax[1], "군포FC 고객사별 총 품목수량"),
)
for fc_orders, panel, heading in pie_specs:
    qty_by_customer = fc_orders.groupby("고객사코드")["품목수량"].sum()
    qty_by_customer.plot.pie(
        colors=sns.color_palette('pastel'),
        ax=panel,
        autopct='%.2f%%',
        textprops={"size": 12},
    )
    panel.set_title(heading, fontsize=15)

plt.tight_layout()
plt.show()

# 2. 주문유형 탐색

In [None]:
# Summary statistics of item quantity for each order type (rendered as the cell output).
qty_by_order_type = od.groupby("주문유형")["품목수량"]
qty_by_order_type.describe()

In [None]:
# Box plots of item quantity for order types 7 and 8, titled per the panel labels below.
figure, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].set_title("B2C출고")
ax[1].set_title("정상반출")

# The values are passed as x= explicitly: seaborn >= 0.12 treats the first positional
# argument as `data`, not `x`, so the keyword keeps quantity on the x axis, which is
# what the set_xlim call below relies on. Sorting is dropped because a box plot does
# not depend on the order of its input.
sns.boxplot(x=od.loc[od["주문유형"] == 7, "품목수량"], ax=ax[0])
sns.boxplot(x=od.loc[od["주문유형"] == 8, "품목수량"], ax=ax[1])

# Only the left panel is clipped; the right panel keeps its automatic limits.
ax[0].set_xlim(0, 160000)
plt.show()

# 3. 곤지암 창고 B2C출고 품목수량 탐색

In [None]:
# Daily total item quantity for order type 7 in od_kx, with the largest days highlighted.
data = od_kx[od_kx["주문유형"] == 7].groupby(['주문날짜'])['품목수량'].sum().reset_index()

# Add a zero-quantity row for 2021-06-28. DataFrame.append was removed in pandas 2.0,
# so pd.concat is used. The original appended this same row twice; a single row draws
# identically.
# NOTE(review): if 2021-06-28 already has orders, this extra row shows up as a second,
# zero-valued point for that date — confirm the day is really absent from the data.
zero_day = pd.DataFrame({'주문날짜': [pd.to_datetime('2021-06-28')], '품목수량': [0]})
data = pd.concat([data, zero_day]).sort_values(['주문날짜']).set_index("주문날짜")

# Ten largest days plus the single smallest day are marked in red.
data_top10 = data.sort_values("품목수량", ascending=False).head(10)
data_min = data.sort_values("품목수량", ascending=True).head(1)

plt.figure(figsize=(20, 8))
plt.title("곤지암창고 주문날짜 별 총품목수량", fontsize=17)
plt.plot(data.index, data["품목수량"], color='gray')
plt.scatter(data_top10.index, data_top10["품목수량"], color='red', s=40)
plt.scatter(data_min.index, data_min["품목수량"], color='red', s=40)
plt.grid()
plt.show()

In [None]:
# Total item quantity per (order date, hour slot) for order type 7 in od_kx.
data = od_kx[od_kx["주문유형"] == 7].groupby(['주문날짜', '주문시간대'])['품목수량'].sum().reset_index()

# Add zero-quantity rows for hour slots 4 and 5 on 2021-06-28.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used; ignore_index keeps
# the row labels unique, so the column assignment below aligns unambiguously.
zero_slots = pd.DataFrame({
    '주문날짜': [pd.to_datetime('2021-06-28')] * 2,
    '주문시간대': [4, 5],
    '품목수량': [0, 0],
})
data = pd.concat([data, zero_slots], ignore_index=True).sort_values(['주문날짜', '주문시간대'])

# Build one timestamp per row as (date at midnight) + (hour slot) hours. This is the
# vectorised form of formatting "Y-M-D H:00:00" row by row and parsing it back.
# assumes 주문시간대 holds whole hours that cast cleanly to int — TODO confirm
data['일자&시간대'] = (
    data['주문날짜'].dt.normalize()
    + pd.to_timedelta(data['주문시간대'].astype(int), unit="h")
)
data = (
    data.set_index('일자&시간대')
        .drop(columns=["주문날짜", "주문시간대"])
        .rename(columns={'품목수량': 'TARGET'})
)
data_top10 = data.sort_values('TARGET', ascending=False).head(10)

plt.figure(figsize=(20, 8))
plt.title('곤지암 창고 시간대 별 총 품목수량', fontsize=17)
plt.plot(data.index, data['TARGET'], color='gray')
plt.scatter(data_top10.index, data_top10['TARGET'], color='red', s=40)  # ten largest slots
plt.grid()
plt.show()

# 이상치 탐색

In [None]:
# Pie chart of total item quantity per customer code for warehouse KX007, largest first.
kx007_rows = od_kx[od_kx["창고"] == "KX007"]
dd = (
    kx007_rows
    .groupby("고객사코드")["품목수량"]
    .sum()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 12))
plt.title("고객사코드별 총 품목수량", fontsize=15)
dd.plot.pie(autopct='%.2f%%', colors=sns.color_palette('pastel'))
plt.show()

In [None]:
# Order-type-7 rows of warehouse KX007; later cells reuse this frame as data_f.
# Both conditions go into one mask on od_kx. The original chained two [] filters and
# applied masks built from od_kx to already-filtered frames, which only worked through
# pandas' implicit mask reindexing.
data_f = od_kx.loc[(od_kx["창고"] == "KX007") & (od_kx["주문유형"] == 7)]

# Daily total item quantity overall (thick black line) and for three customer codes.
figure, ax = plt.subplots(figsize=(20, 8))
data_f.groupby("주문날짜")["품목수량"].sum().plot(ax=ax, label="total", c="black", lw=3)
for customer_code in (90001302, 90001542, 90001541):
    # The mask is built from data_f itself, so it matches the frame it filters.
    customer_rows = data_f.loc[data_f["고객사코드"] == customer_code]
    customer_rows.groupby("주문날짜")["품목수량"].sum().plot(ax=ax, label=str(customer_code))
plt.legend(fontsize=20)
plt.show()

In [None]:
# For the ten dates with the largest total item quantity in data_f, draw one pie per
# date showing each customer code's share. Panels fill the left column top to bottom,
# then the right column.
figure, ax = plt.subplots(5, 2, figsize=(10, 25))

qty_per_day = data_f.groupby("주문날짜")["품목수량"].sum()
date_idx = qty_per_day.sort_values(ascending=False).index[:10]

for i, date in enumerate(date_idx):
    col, row = divmod(i, 5)
    panel = ax[row, col]
    panel.set_title(f"{date.year}-{date.month}-{date.day} 고객사코드별 총 품목수량")
    share_that_day = data_f[data_f["주문날짜"] == date].groupby("고객사코드")["품목수량"].sum()
    share_that_day.plot.pie(autopct='%.2f%%', colors=sns.color_palette('pastel'), ax=panel)

plt.tight_layout()
plt.show()

In [None]:
# Same ten-largest-days pie grid as the total view, but with customer code 90001302
# left out so the remaining customers' shares are visible.
figure, ax = plt.subplots(5, 2, figsize=(10, 25))
date_idx = data_f.groupby("주문날짜")["품목수량"].sum().sort_values(ascending=False)[:10].index

for i, date in enumerate(date_idx):
    panel = ax[i % 5, i // 5]
    panel.set_title(f"{date.year}-{date.month}-{date.day} 고객사코드별 총 품목수량")
    # Both conditions are combined into one mask over data_f. The original applied a
    # mask built from the full data_f to the per-day subset, which depended on implicit
    # mask reindexing.
    other_customers = data_f.loc[(data_f["주문날짜"] == date) & (data_f["고객사코드"] != 90001302)]
    other_customers.groupby("고객사코드")["품목수량"].sum().plot.pie(
        autopct='%.2f%%', colors=sns.color_palette('pastel'), ax=panel)
    panel.set_ylabel("")  # hide the automatic series-name label next to the pie

plt.tight_layout()
plt.show()

In [None]:
# List the distinct 입력자ID values seen in data_f for each of three customer codes.
cus_com_id = [90001302, 90001541, 90001542]
for customer_code in cus_com_id:
    entry_ids = data_f.loc[data_f["고객사코드"] == customer_code, "입력자ID"].unique()
    print(customer_code)
    print(entry_ids)
    print("\n")

## 90001302

In [None]:
# Event dates for customer 90001302: 2021-03-22..28, 2021-06-07..15 and two single days.
r1 = list(pd.date_range("2021-03-22", "2021-03-28"))
r2 = list(pd.date_range("2021-06-07", "2021-06-15"))
r3 = [pd.to_datetime("2021-04-20"), pd.to_datetime("2021-05-17")]
lg = r1 + r2 + r3

# Daily total item quantity for this customer, indexed by order date.
dflg = data_f[data_f["고객사코드"] == 90001302].groupby("주문날짜")["품목수량"].sum().to_frame()

# Label each existing date in one vectorised step. Writing with .loc[day, 'lg'] would
# silently append a NaN-quantity row for any event day that has no orders, and would
# put strings into a float column (a deprecated upcast in recent pandas).
dflg["lg"] = np.where(dflg.index.isin(lg), "행사O", "행사X")

In [None]:
# Bar chart of daily item quantity for customer 90001302, coloured by the event flag.
bar_fig, bar_ax = plt.subplots(figsize=(20, 8))
sns.barplot(
    data=dflg.reset_index(),
    x="주문날짜",
    y="품목수량",
    hue="lg",
    dodge=False,   # one bar per date rather than side-by-side hue bars
    ax=bar_ax,
)
bar_ax.set_title("90001302", fontsize=15)
plt.setp(bar_ax.get_xticklabels(), rotation=90)
bar_ax.legend(fontsize=15)
plt.show()

## 90001541

In [None]:
# Event dates for customer 90001541.
ne = [pd.to_datetime("2021-03-22"), pd.to_datetime("2021-04-15"),
      pd.to_datetime("2021-05-17"), pd.to_datetime("2021-06-17")]

# Daily total item quantity for this customer, indexed by order date.
dfne = data_f[data_f["고객사코드"] == 90001541].groupby("주문날짜")["품목수량"].sum().to_frame()

# Label each existing date in one vectorised step. Writing with .loc[day, 'ne'] would
# silently append a NaN-quantity row for any event day that has no orders, and would
# put strings into a float column (a deprecated upcast in recent pandas).
dfne["ne"] = np.where(dfne.index.isin(ne), "행사O", "행사X")

In [None]:
# Bar chart of daily item quantity for customer 90001541, coloured by the event flag.
bar_fig, bar_ax = plt.subplots(figsize=(20, 8))
sns.barplot(
    data=dfne.reset_index(),
    x="주문날짜",
    y="품목수량",
    hue="ne",
    dodge=False,   # one bar per date rather than side-by-side hue bars
    ax=bar_ax,
)
bar_ax.set_title("90001541", fontsize=15)
plt.setp(bar_ax.get_xticklabels(), rotation=90)
bar_ax.legend(fontsize=15)
plt.show()

## 90001542

In [None]:
# Event dates for customer 90001542.
nu = [pd.to_datetime("2021-04-25"), pd.to_datetime("2021-04-30"),
      pd.to_datetime("2021-05-16"), pd.to_datetime("2021-05-31"),
      pd.to_datetime("2021-06-13"), pd.to_datetime("2021-06-27")]

# Daily total item quantity for this customer, indexed by order date.
dfnu = data_f[data_f["고객사코드"] == 90001542].groupby("주문날짜")["품목수량"].sum().to_frame()

# Label each existing date in one vectorised step. Writing with .loc[day, 'nu'] would
# silently append a NaN-quantity row for any event day that has no orders, and would
# put strings into a float column (a deprecated upcast in recent pandas).
dfnu["nu"] = np.where(dfnu.index.isin(nu), "행사O", "행사X")

In [None]:
# Bar chart of daily item quantity for customer 90001542, coloured by the event flag.
bar_fig, bar_ax = plt.subplots(figsize=(20, 8))
sns.barplot(
    data=dfnu.reset_index(),
    x="주문날짜",
    y="품목수량",
    hue="nu",
    dodge=False,   # one bar per date rather than side-by-side hue bars
    ax=bar_ax,
)
bar_ax.set_title("90001542", fontsize=15)
plt.setp(bar_ax.get_xticklabels(), rotation=90)
bar_ax.legend(fontsize=15)
plt.show()

## ANOVA Test

In [None]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Build the modelling frame: total item quantity per (order date, hour slot) from data_f.
data = data_f.groupby(["주문날짜", "주문시간대"])["품목수량"].sum().reset_index()

# Add zero-quantity rows for hour slots 4 and 5 on 2021-06-28.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
zero_slots = pd.DataFrame({
    '주문날짜': [pd.to_datetime('2021-06-28')] * 2,
    '주문시간대': [4, 5],
    '품목수량': [0, 0],
})
data = (
    pd.concat([data, zero_slots])
      .sort_values(by=["주문날짜", "주문시간대"])
      .reset_index(drop=True)
)
# Index by date (several rows per date, one per hour slot); the target is the quantity.
data = data.set_index("주문날짜").drop("주문시간대", axis=1).rename(columns={"품목수량": "TARGET"})

# Event indicators are 1.0 on listed dates and 0.0 otherwise. Index.isin flags every
# hourly row of an event date at once. Unlike a per-day .loc assignment, it never
# appends a NaN-TARGET row for an event date that is missing from the data.

# LG H&H x Naver "Red Week"
r1 = list(pd.date_range("2021-03-22", "2021-03-28"))
r2 = list(pd.date_range("2021-06-07", "2021-06-15"))
r3 = [pd.to_datetime("2021-04-20"), pd.to_datetime("2021-05-17")]
lg = r1 + r2 + r3
data['lg'] = data.index.isin(lg).astype(float)

# Nestle brand day & top-buyer event
nestle_days = [pd.to_datetime("2021-03-22"), pd.to_datetime("2021-04-15"),
               pd.to_datetime("2021-05-17"), pd.to_datetime("2021-06-17")]
data['nestle'] = data.index.isin(nestle_days).astype(float)

# Nutrione shopping-live giveaway event
nut_one_days = [pd.to_datetime("2021-04-25"), pd.to_datetime("2021-04-30"),
                pd.to_datetime("2021-05-16"), pd.to_datetime("2021-05-31"),
                pd.to_datetime("2021-06-13"), pd.to_datetime("2021-06-27")]
data['nut_one'] = data.index.isin(nut_one_days).astype(float)

# Fit an OLS model of the target on the three event indicators and print its ANOVA
# table. The term order in the formula is kept as-is because anova_lm's default
# (sequential) table depends on it.
df = data
model = ols('TARGET ~ nestle+nut_one+lg', df).fit()

print(anova_lm(model))

# Seasonal_Decompose

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Daily total item quantity for order type 7 in od_kx.
data = od_kx[od_kx["주문유형"] == 7].groupby('주문날짜')['품목수량'].sum()

# seasonal_decompose needs either an explicit period or an index with a usable
# frequency. A groupby result carries no frequency, and any missing calendar day makes
# it impossible to infer one. The series is therefore put on a daily grid (missing days
# become 0) and the weekly period is passed explicitly.
data = data.asfreq("D", fill_value=0)

decomposition = seasonal_decompose(data, model='additive', period=7)
fig = decomposition.plot()
fig.set_size_inches(10, 10)
plt.show()