In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

In [None]:
# Read the raw purchases export into a DataFrame.
data = pd.read_csv(filepath_or_buffer='PurchasesFINAL12312016-1.csv')

# Preview the first five rows (head()'s default) as the cell output.
data.head(n=5)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the purchases export.
df = pd.read_csv("PurchasesFINAL12312016-1.csv")

# Parse the four milestone date columns. errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
date_cols = ["PODate", "ReceivingDate", "InvoiceDate", "PayDate"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Elapsed whole days between consecutive milestones, plus the end-to-end span.
df["PO_to_Receiving"] = (df["ReceivingDate"] - df["PODate"]).dt.days
df["Receiving_to_Invoice"] = (df["InvoiceDate"] - df["ReceivingDate"]).dt.days
df["Invoice_to_Pay"] = (df["PayDate"] - df["InvoiceDate"]).dt.days
df["PO_to_Pay"] = (df["PayDate"] - df["PODate"]).dt.days

# Drop abnormal rows: keep only records whose PO-to-pay and PO-to-receiving
# durations are non-negative. Rows where a needed date was coerced to NaT have
# NaN durations, fail the >= 0 comparison, and are therefore dropped as well.
# .copy() makes the filtered result an independent frame, so columns added
# later are written to it unambiguously rather than to a derived subset of the
# original (which can raise SettingWithCopyWarning on pandas without
# copy-on-write).
df = df[(df["PO_to_Pay"] >= 0) & (df["PO_to_Receiving"] >= 0)].copy()

#  1. Distribution of the total processing time (PO to payment)
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["PO_to_Pay"], bins=50, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Total Processing Time (PO to Pay)")
ax.set_xlabel("Days")
ax.set_ylabel("Frequency")
plt.show()

#  2. Box plot of the processing time of each individual step
# Wide-form input: seaborn draws one box per selected column.
step_durations = df.loc[:, ["PO_to_Receiving", "Receiving_to_Invoice", "Invoice_to_Pay"]]
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=step_durations, palette="pastel", ax=ax)
ax.set_title("Processing Time Distribution by Step")
ax.set_ylabel("Days")
plt.show()

#  3. Monthly trend of the average PO-to-pay duration
# Label every row with the calendar month of its PO date, stored as a string.
df["Month"] = df["PODate"].dt.to_period("M").astype(str)
monthly = df.groupby("Month")["PO_to_Pay"].mean()

fig, ax = plt.subplots(figsize=(10, 5))
monthly.plot(ax=ax, marker='o', color='green')
ax.set_title("Average PO-to-Pay Duration by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Average Days")
plt.xticks(rotation=45)
plt.show()

#  4. Processing-time distribution for the five most frequent vendors
# value_counts() sorts by descending frequency, so head(5) is the top five.
top_vendors = df["VendorName"].value_counts().head(5).index
plt.figure(figsize=(10,6))
# seaborn >= 0.13 deprecates `palette` without `hue` (removal planned for
# 0.14), so the x variable is also assigned to hue. dodge=False keeps one
# full-width box per vendor instead of hue-offset boxes. order/hue_order pin
# the boxes and their colours to the vendor ranking rather than to the order
# in which vendors happen to appear in the data.
ax = sns.boxplot(
    data=df[df["VendorName"].isin(top_vendors)],
    x="VendorName",
    y="PO_to_Pay",
    hue="VendorName",
    order=list(top_vendors),
    hue_order=list(top_vendors),
    dodge=False,
    palette="Set2",
)
# The hue legend only repeats the x tick labels; drop it when seaborn adds one.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Processing Time Distribution by Top 5 Vendors")
plt.ylabel("Days")
plt.xlabel("Vendor")
plt.xticks(rotation=30)
plt.show()
