In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date

In [2]:
# Load orderproducts.csv; the columns at positions 1 and 11 are parsed as dates.
# NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0 (it has no
# effect there) - drop it once the pandas version in use is confirmed.
orderproducts = pd.read_csv(
    '../data-raw/orderproducts.csv',
    parse_dates=[1, 11],
    infer_datetime_format=True,
)

In [3]:
# Load products.csv (its `sku` and `price_usd` columns are used further down).
products = pd.read_csv(filepath_or_buffer='../data-raw/products.csv')

In [4]:
# Keep only the first row for each SKU so that every SKU appears once in `products`.
repeated_sku = products.duplicated(subset=['sku'], keep='first')
products = products.loc[~repeated_sku]

In [5]:
# Create a date-only field: drop the time-of-day from order_created_at and store the
# result as a datetime column (timestamps at midnight).
# The .dt accessor does this in one vectorised step, replacing a Python-level loop
# over every row followed by a second assignment to the same column.
orderproducts['order_date'] = pd.to_datetime(orderproducts['order_created_at'].dt.date)

In [6]:
# Look up the price of each order line's product by SKU.
#
# A SKU -> price_usd lookup is built once and applied with `map`, instead of
# rescanning `products` for every row. Rows with a missing SKU are dropped from the
# lookup and duplicates are collapsed to their first row, so each key is unique.
price_by_sku = (
    products
    .dropna(subset=['sku'])
    .drop_duplicates(subset=['sku'], keep='first')
    .set_index('sku')['price_usd']
)

# Order lines whose SKU (including a missing SKU) has no entry in `products`.
unmatched = ~orderproducts['product_sku'].isin(price_by_sku.index)

# `emp` lists those SKUs in row order, repeats included; it is printed in the next cell.
emp = orderproducts.loc[unmatched, 'product_sku'].tolist()

# Unmatched lines get a price of 0. The result is index-aligned with `orderproducts`,
# so it stays correct even if the frame does not carry a default 0..n-1 index.
orderproducts['product_price'] = (
    orderproducts['product_sku']
    .map(price_by_sku)
    .where(~unmatched, 0)
    .astype(float)
)

In [7]:
# SKUs from the order lines that have no match in products.csv
# (one entry per order line, so repeats and missing values appear as-is).
print(emp)

['STICKER', 'SRB-L-BLK', nan, 'CPN-BAG-STRAP-DPS-BLK', 'SRB-L-BLK', 'STICKER', 'SRB-L-BLK', 'SRB-L-BLK', 'CPN-BAG-STRAP-DPS-BLK', nan, nan, 'B-M80-AC-BLK', 'CPN-BAG-STRAP-DPS-BLK', '50-K61-FOAM', 'B-M80-SAD-BLK', 'B-M80-EB-BLK', nan, 'B-M80-VEG-GRY', 'B-M80-EB-BLK', nan, nan, nan, nan, nan]


In [8]:
# Restrict the frame to the columns listed here, in this order.
orderproducts = orderproducts.loc[:, [
    'order_id',
    'order_date',
    'order_total_price',
    'product_price',
    'order_discounts',
    'product_discount',
    'order_status',
    'product_title',
    'product_sku',
    'product_quantity',
    'product_category',
]]

In [9]:
# Summed product_quantity per SKU, largest total first; `reset_index` turns the SKU
# back into an ordinary column next to the `sum` column.
product_sku = (
    orderproducts
    .groupby(['product_sku'])['product_quantity']
    .agg(['sum'])
    .sort_values(by='sum', ascending=False)
    .reset_index()
)

In [10]:
# First 20 SKUs of the quantity-sorted table, as a plain Python list.
top_20 = product_sku['product_sku'].head(20).tolist()

In [11]:
# Display the 20 SKUs with the largest summed product_quantity.
top_20

['M80-TICK-V2-BLK',
 'M80-VEG-BLK',
 'EFX-FLY-BLK',
 'M80-EG-BLK',
 'M80-BTY-BLK-L',
 'M80-2G-BLK',
 'M80-VEB-BLK',
 'M80-BTY-BLK-S',
 'M80-EB-BLK',
 'M80-AD-BLK',
 'M80-SEG-BLK',
 'M80-VAD-BLK',
 'M80-SEG-ASH',
 'M80-VHB-BLK',
 'M80-TOUR-V2-BLK',
 'M80-2B-BLK',
 'M80-AC-BLK',
 'M80-VEB-GRY',
 'M80-VEG-GRY',
 'M80-SEB-BLK']

In [12]:
# Keep only the order lines whose product_sku is one of the top-20 SKUs.
in_top_20 = orderproducts['product_sku'].isin(top_20)
df = orderproducts.loc[in_top_20]

In [13]:
# Write the filtered rows to CSV, leaving out the index column.
df.to_csv(path_or_buf='../data-processed/orderproducts_top20.csv', index=False)