In [1]:
import pandas as pd
# Show floats with two decimal places in every displayed frame.
pd.set_option('display.float_format', '{:.2f}'.format)
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any pandas warnings raised by the cells below. Consider limiting
# the filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

In [2]:
# Load the SKU table; a 10% random sample of this frame is joined to the
# other tables further down.
sku = pd.read_csv("skuinfo2.csv")

In [3]:
# Reuse the frame already loaded into `sku` instead of parsing skuinfo2.csv a
# second time. `skuinfo` is only inspected (shape / head) in this notebook,
# so an alias of the same frame is sufficient and saves a full file read.
skuinfo = sku
skuinfo.shape

(1556033, 10)

In [4]:
# Preview the first rows of the SKU table to check the column layout.
skuinfo.head()

Unnamed: 0,SKU,DEPT,CLASSID,UPC,STYLE,COLOR,SIZE,PACKSIZE,VENDOR,BRAND
0,3,6505,113,400000003000,00 F55KT2,WHISPERWHITE,P8EA,1,5119207,TURNBURY
1,4,8101,002,400000004000,22 615CZ4,SPEARMI,S,1,3311144,C A SPOR
2,5,7307,003,400000005000,7LBS 245-01,34 SILVER,KING,1,5510554,BEAU IDE
3,8,3404,00B,400000008000,622 F05H84,MORNING MI,2T,1,2912827,HARTSTRI
4,15,2301,004,400000015000,126 MDU461,255CAMEL,12,1,23272,JONES/LA


In [5]:
# skstinfo.csv is read without a header row; only the columns at positions
# 0, 2 and 3 are kept (they are named SKU, COST and RETAIL in a later cell).
skst_positions = [0, 2, 3]
skstinfo = pd.read_csv("skstinfo.csv", header=None, usecols=skst_positions)

In [6]:
# Row / column count of the loaded skstinfo subset.
skstinfo.shape

(39230146, 3)

In [7]:
# Replace the positional column labels of skstinfo with readable names.
# Labels that are not in the mapping are left exactly as they are.
skst_labels = {0: "SKU", 2: "COST", 3: "RETAIL"}
skstinfo.columns = [skst_labels.get(label, label) for label in skstinfo.columns]

In [8]:
# trnsact.csv is read without a header row; only the columns at positions
# 0, 3, 5, 6, 7 and 9 are kept (they are given names in the next cell).
trnsact_positions = [0, 3, 5, 6, 7, 9]
trnsact = pd.read_csv("trnsact.csv", header=None, usecols=trnsact_positions)

In [9]:
# Replace the positional column labels of trnsact with readable names.
# Labels that are not in the mapping are left exactly as they are.
trnsact_labels = {
    0: "SKU",
    3: "TRANNUM",
    5: "SALEDATE",
    6: "STYPE",
    7: "QUANTITY",
    9: "AMT",
}
trnsact.columns = [trnsact_labels.get(label, label) for label in trnsact.columns]

In [10]:
# Draw a random 10% subsample from each original table because the full
# tables are too large to join.
# The sampling is seeded so that a Restart & Run All reproduces the same rows
# (and therefore the same shapes and aggregates shown in the outputs below).
SAMPLE_FRAC = 0.10   # share of rows kept from every table
SAMPLE_SEED = 42     # fixed seed for reproducible sampling

f_transact = trnsact.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
f_sku = sku.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
f_skst = skstinfo.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

In [11]:
# First of two joins on SKU: attach the COST / RETAIL sample to the SKU sample.
# NOTE(review): the shape shown below has 393,162 rows although f_sku holds
# only 10% of 1,556,033 SKU rows, so SKU values repeat on at least one side
# of this join. Confirm that this row fan-out is intended.
merged_dfs = f_sku.merge(f_skst, how="inner", on="SKU")

In [12]:
# Row / column count after joining the SKU and skstinfo samples.
merged_dfs.shape

(393162, 12)

In [13]:
# Second join on SKU: attach the transaction sample to the SKU/cost rows.
# NOTE(review): every transaction row is paired with every merged_dfs row of
# the same SKU; if merged_dfs repeats SKUs, transactions are duplicated here.
# Verify that downstream sums and counts are meant to include that duplication.
f_merged = merged_dfs.merge(f_transact, how="inner", on="SKU")

In [14]:
# Row / column count after joining in the transaction sample.
f_merged.shape

(15141554, 17)

In [15]:
# Keep only the columns used by the vendor-level analysis below.
analysis_columns = [
    "SKU", "VENDOR", "COST", "RETAIL", "TRANNUM",
    "SALEDATE", "STYPE", "QUANTITY", "AMT",
]
final_data = f_merged[analysis_columns]

In [16]:
# Row / column count of the reduced analysis frame.
final_data.shape

(15141554, 9)

In [17]:
# Preview the first rows of the reduced analysis frame.
final_data.head()

Unnamed: 0,SKU,VENDOR,COST,RETAIL,TRANNUM,SALEDATE,STYPE,QUANTITY,AMT
0,6628905,212785,14.0,39.5,500,2005-06-22,R,1,39.5
1,6628905,212785,14.0,39.5,1000,2005-06-12,P,1,39.5
2,6628905,212785,14.0,39.5,1400,2005-05-09,R,1,39.5
3,6628905,212785,14.0,39.5,3300,2005-08-12,P,1,39.5
4,6628905,212785,14.0,39.5,900,2005-04-01,P,1,39.5


In [18]:
# Calculate the average row cost per vendor, where a row's cost is
# COST * QUANTITY and the average divides the vendor's summed row cost by the
# vendor's number of rows.
#
# The row cost is held in a standalone Series instead of being written into
# final_data as a temporary column. final_data was produced by a column
# selection on f_merged, so assigning a new column to it is the pattern pandas
# reports as SettingWithCopyWarning; it also avoids adding a column only to
# drop it again a few lines later.
row_cost = final_data["COST"] * final_data["QUANTITY"]
cost_by_vendor = row_cost.groupby(final_data["VENDOR"])

# sum() skips missing row costs while size() counts every row of the vendor,
# which matches a TOTAL_COST / CNT computation on the grouped frame.
sub = (cost_by_vendor.sum() / cost_by_vendor.size()).to_frame("AVG_COST")

# COST and TRANNUM are not needed once the vendor average is attached.
final_data = final_data.drop(columns=["COST", "TRANNUM"])
final_data = final_data.merge(sub, how="left", on="VENDOR")

In [19]:
def categorize(row, row_val):
    """Return 1 when the row's STYPE equals ``row_val``, otherwise None.

    Row-wise helper kept for backward compatibility; the columns below are
    built with the vectorized ``stype_indicator`` instead.
    """
    return 1 if row["STYPE"] == row_val else None


def stype_indicator(stype, row_val):
    """Vectorized indicator for a sale type.

    Parameters
    ----------
    stype : pandas.Series
        Series of sale-type codes.
    row_val : object
        Code to flag.

    Returns
    -------
    pandas.Series
        float64 Series holding 1.0 where ``stype == row_val`` and NaN
        elsewhere (missing codes never match).
    """
    matches = stype.eq(row_val)
    # Cast the boolean mask to 1.0 / 0.0, then blank out the non-matching rows.
    return matches.astype("float64").where(matches)


# Flag purchase ("P") and return ("R") rows. Comparing the whole column at
# once avoids a Python-level function call per row, which DataFrame.apply with
# axis=1 would incur for each of the two flags.
final_data["Purchases"] = stype_indicator(final_data["STYPE"], "P")
final_data["Returns"] = stype_indicator(final_data["STYPE"], "R")

In [20]:
# Per vendor, from purchase rows (STYPE == "P") only:
#   RECENCY   = largest SALEDATE value
#   FREQUENCY = number of purchase rows
# NOTE(review): SALEDATE is loaded without date parsing, so "max" presumably
# compares strings; that matches date order only for ISO-style dates -- confirm.
sub2 = (
    final_data.loc[final_data["STYPE"] == "P"]
    .groupby("VENDOR")
    .agg(RECENCY=("SALEDATE", "max"), FREQUENCY=("SKU", "size"))
)

# Vendors without any purchase rows receive NaN in both new columns.
final_data = final_data.merge(sub2, how="left", on="VENDOR")

In [22]:
# Calculate the profit of each row based on its sale type:
#   purchases ("P"): ( AMT - AVG_COST) * QUANTITY
#   returns   ("R"): (-AMT - AVG_COST) * QUANTITY
# Rows with any other sale type keep a missing PROFIT.
# NOTE(review): returns subtract AVG_COST as well as AMT, and AVG_COST is a
# vendor-level average rather than the row's own cost -- confirm both are intended.
purchase_rows = final_data["STYPE"] == "P"
return_rows = final_data["STYPE"] == "R"
vendor_cost = final_data["AVG_COST"]
units = final_data["QUANTITY"]

final_data.loc[purchase_rows, "PROFIT"] = (final_data["AMT"] - vendor_cost) * units
final_data.loc[return_rows, "PROFIT"] = (-final_data["AMT"] - vendor_cost) * units

In [23]:
# Collapse the row-level frame to one row per VENDOR.
# FREQUENCY, RECENCY and AVG_COST were merged in per vendor, so "max" just
# picks that vendor-level value; the remaining columns are summed over rows.
vendor_aggregations = {
    "FREQUENCY": ("FREQUENCY", "max"),
    "RECENCY": ("RECENCY", "max"),
    "AVG_COST": ("AVG_COST", "max"),
    "RETAIL": ("RETAIL", "sum"),
    "QUANTITY": ("QUANTITY", "sum"),
    "AMT": ("AMT", "sum"),
    "PROFIT": ("PROFIT", "sum"),
    "PURCHASES": ("Purchases", "sum"),
    "RETURNS": ("Returns", "sum"),
}
vendor_totals = final_data.groupby("VENDOR").agg(**vendor_aggregations)

# Vendors with the most purchase rows come first.
f_data = vendor_totals.sort_values("FREQUENCY", ascending=False)

In [24]:
# Drop vendors that have no purchase rows: their FREQUENCY / RECENCY are
# missing because those values were computed from purchase rows only.
purchase_fields = ["FREQUENCY", "RECENCY"]
f_data = f_data.dropna(subset=purchase_fields, how="any")

In [27]:
# Preview the vendors with the highest FREQUENCY (the frame is sorted
# descending on that column).
f_data.head()

Unnamed: 0_level_0,FREQUENCY,RECENCY,AVG_COST,RETAIL,QUANTITY,AMT,PROFIT,PURCHASES,RETURNS
VENDOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5511283,2611136.0,2005-08-27,12.05,54582096.5,2718062,53552128.27,16300246.47,2611136.0,106926.0
113645,715334.0,2005-08-27,21.7,27045785.04,749501,26783015.0,8056891.0,715334.0,34167.0
3626213,652779.0,2005-08-27,1.52,2728934.28,686915,2286423.08,1003410.36,652779.0,34136.0
13031,411434.0,2005-08-27,2.38,1944368.04,423235,2014154.96,889271.28,411434.0,11801.0
5715232,359934.0,2005-08-27,31.9,12943467.69,400013,17426741.6,901940.29,359934.0,40079.0


In [None]:
# Uncomment to save data to your local storage
#f_data.to_csv("group7_data.csv")