# Initial EDA and Cleaning
Explore and clean data in `ecom_data.csv`

## Load

In [25]:
import pandas as pd
from datetime import date

# Load the raw export with an explicit dtype for every typed column.
# SalesOrder and SKU stay strings because some values carry letters
# (e.g. 'C540938', '85049E').
df = pd.read_csv(
    'ecom_data.csv',
    dtype=dict(
        SalesOrder=str,
        SKU=str,
        Description=str,
        UnitPrice=float,
        CustomerID=int,
        Channel=str,
        State=str,
        Sales=float,
        Quantity=int,
    ),
    # InvoiceDay is parsed from ISO-format text into datetime.date objects
    converters=dict(InvoiceDay=date.fromisoformat),
)

In [44]:
# peek at the first 10 rows of the loaded frame
df.head(n=10)

Unnamed: 0,SalesOrder,SKU,Description,UnitPrice,CustomerID,Channel,State,InvoiceDay,Sales,Quantity
0,580636,22474,SPACEBOY TV DINNER TRAY,1.95,16746,Mailing,IL,2011-12-05,31.2,16
1,581426,70006,LOVE HEART POCKET WARMER,0.79,17757,Organic Social,WA,2011-12-08,2.37,3
2,575063,22697,GREEN REGENCY TEACUP AND SAUCER,2.95,16764,Display,TX,2011-11-08,8.85,3
3,544065,20726,LUNCH BAG WOODLAND,1.65,14346,Organic Social,TX,2011-02-15,13.2,8
4,568896,85049E,SCANDINAVIAN REDS RIBBONS,1.25,16361,Store,NY,2011-09-29,52.5,42
5,559542,23209,LUNCH BAG DOILEY PATTERN,1.65,17126,Email,CA,2011-07-10,9.9,6
6,569868,23493,VINTAGE DOILY TRAVEL SEWING KIT,1.95,13018,Organic Social,MO,2011-10-06,15.6,8
7,575303,23321,SMALL WHITE HEART OF WICKER,1.65,12893,Store,IA,2011-11-09,13.2,8
8,567145,21154,RED RETROSPOT OVEN GLOVE,1.25,12921,Organic Social,AK,2011-09-16,10.0,8
9,574444,21967,PACK OF 12 SKULL TISSUES,0.39,18122,Store,CA,2011-11-04,39.78,102


## Clean

In [27]:
# compare the total row count with the number of distinct SalesOrder IDs
n_rows = len(df)
n_orders = len(df.SalesOrder.unique())
print(f"{n_rows} rows, {n_orders} unique SalesOrder values.")

406829 rows, 20665 unique SalesOrder values.


In [33]:
# number of rows (non-null SKU entries) per SalesOrder, largest orders first
(
    df.groupby('SalesOrder')
      .agg(Count=('SKU', 'count'))
      .sort_values('Count', ascending=False)
)

Unnamed: 0_level_0,Count
SalesOrder,Unnamed: 1_level_1
580727,571
579196,524
576339,510
573576,459
578270,445
...,...
C540938,1
C540942,1
C540944,1
C541092,1


In [29]:
# reproducible random sample of 20 rows from SalesOrder 580727 (the order with the most rows)
df.loc[df.SalesOrder == '580727'].sample(n=20, random_state=42)

Unnamed: 0,SalesOrder,SKU,Description,UnitPrice,CustomerID,Channel,State,InvoiceDay,Sales,Quantity
368952,580727,23292,SPACEBOY CHILDRENS CUP,2.46,14096,Store,TX,2011-12-05,17.22,7
45690,580727,22755,SMALL PURPLE BABUSHKA NOTEBOOK,1.63,14096,SEO,TX,2011-12-05,11.41,7
90257,580727,23119,PACK OF 6 LARGE FRUIT STRAWS,1.25,14096,Organic Social,TX,2011-12-05,5.0,4
369275,580727,71477,COLOURED GLASS STAR T-LIGHT HOLDER,6.63,14096,Display,TX,2011-12-05,119.34,18
392125,580727,22557,PLASTERS IN TIN VINTAGE PAISLEY,3.29,14096,SEO,TX,2011-12-05,65.8,20
231983,580727,23080,RED METAL BOX TOP SECRET,16.63,14096,Email,TX,2011-12-05,99.78,6
145206,580727,22716,CARD CIRCUS PARADE,0.83,14096,SEO,TX,2011-12-05,5.81,7
25721,580727,21790,VINTAGE SNAP CARDS,1.63,14096,Organic Social,TX,2011-12-05,9.78,6
52168,580727,21901,"KEY FOB , BACK DOOR",1.63,14096,Organic Social,TX,2011-12-05,3.26,2
230723,580727,47504H,ENGLISH ROSE SPIRIT LEVEL,3.29,14096,Organic Social,TX,2011-12-05,16.45,5


In [30]:
# which invoice dates appear on SalesOrder 580727?
df.loc[df.SalesOrder == '580727', 'InvoiceDay'].unique()

array([datetime.date(2011, 12, 5)], dtype=object)

It seems that `SalesOrder` simply denotes sales for a specific customer on a specific day. But let's confirm that.

In [40]:
# Per SalesOrder: number of rows, number of distinct invoice days and number of
# distinct customers, ordered by row count (largest first).
# The built-in 'nunique' reducer replaces the per-group Python lambdas. CustomerID
# is loaded as int and InvoiceDay is parsed by date.fromisoformat, so neither
# column can hold NaN and the counts equal `unique().shape[0]`.
df_so_counts = (
    df.groupby('SalesOrder')
      .agg(Count=('SKU', 'count'),
           InvoiceDays=('InvoiceDay', 'nunique'),
           CustomerIDs=('CustomerID', 'nunique'))
      .sort_values('Count', ascending=False)
)

df_so_counts.head()

Unnamed: 0_level_0,Count,InvoiceDays,CustomerIDs
SalesOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
580727,571,1,1
579196,524,1,1
576339,510,1,1
573576,459,1,1
578270,445,1,1


In [43]:
# does any SalesOrder span more than one invoice day, or more than one customer?
multi_day = df_so_counts.InvoiceDays.gt(1).any()
multi_customer = df_so_counts.CustomerIDs.gt(1).any()
print(f"> 1 InvoiceDays per SalesOrder: {multi_day}")
print(f"> 1 CustomerIDs per SalesOrder: {multi_customer}")

> 1 InvoiceDays per SalesOrder: False
> 1 CustomerIDs per SalesOrder: False


We can reasonably infer then that `SalesOrder` is not an order ID number, but rather a sort of hash value derived from `InvoiceDay` and `CustomerID`: each `SalesOrder` value maps to exactly one day and one customer, yet there are multiple `Channel` values per `SalesOrder` value, which indicates that customers may have placed multiple orders per day via multiple channels.

## Explore
### Repeat Customers

In [55]:
# Keep customers that have at least 5 rows in the data (the filter is >= 5).
# NOTE(review): 'count' tallies rows (line items) per customer, not distinct
# SalesOrder IDs, so "SalesOrders" here is a row count - confirm this is intended.
df_repeat_cust = (
    df.groupby('CustomerID', as_index=False)
      .agg(SalesOrders=('SalesOrder', 'count'))
      .loc[lambda counts: counts.SalesOrders >= 5]
)

In [57]:
# Mean Sales per row for each retained customer, highest mean first.
# Note: this rebinds df_repeat_cust from the filtered counts to the per-customer means.
df_repeat_cust = (
    df.loc[df.CustomerID.isin(df_repeat_cust.CustomerID)]
      .groupby('CustomerID', as_index=False)
      .agg(Sales=('Sales', 'mean'))
      .sort_values('Sales', ascending=False)
)

In [58]:
# customers with the highest mean Sales
df_repeat_cust.head(n=15)

Unnamed: 0,CustomerID,Sales
2834,16446,51116.834
1899,15098,14773.8
2522,16000,5430.133333
2353,15749,5302.272727
350,12798,4110.322
3850,17949,3279.557013
1755,14887,2111.126
1546,14566,2089.646
2044,15299,2074.853636
3784,17857,1866.871587


In [59]:
# customers with the lowest mean Sales
df_repeat_cust.tail(n=15)

Unnamed: 0,CustomerID,Sales
3573,17548,-6.482222
3820,17900,-13.577778
3927,18072,-20.49
2705,16252,-32.511176
2745,16321,-32.77
1125,13958,-44.1375
927,13672,-48.248889
2406,15823,-80.038
615,13217,-85.973333
3612,17603,-132.952


So we can have negative `Sales` values.

### Negative Sales

In [60]:
# the 20 rows with the lowest Sales values
df.sort_values(by='Sales').head(n=20)

Unnamed: 0,SalesOrder,SKU,Description,UnitPrice,CustomerID,Channel,State,InvoiceDay,Sales,Quantity
91934,C581484,23843,"PAPER CRAFT , LITTLE BIRDIE",2.08,16446,Organic Social,NC,2011-12-09,-127788.96,-61437
339077,C573079,M,Manual,4161.06,12536,Store,WI,2011-10-27,-70738.02,-17
96845,C573079,M,Manual,4161.06,12536,Store,WI,2011-10-27,-70738.02,-17
98912,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,1.04,12346,Organic Social,UT,2011-01-18,-60425.04,-58101
35569,C556445,M,Manual,38970.0,15098,Organic Social,TX,2011-06-10,-38970.0,-1
393764,C551685,POST,POSTAGE,8142.75,16029,Store,NM,2011-05-03,-16285.5,-2
168701,C551685,POST,POSTAGE,8142.75,16029,Store,NM,2011-05-03,-16285.5,-2
274735,C570556,22273,FELTCRAFT DOLL MOLLY,2.55,16029,Store,NM,2011-10-11,-15797.25,-6195
183370,C560647,M,Manual,3060.6,18102,Store,FL,2011-07-20,-15303.0,-5
78300,C566925,M,Manual,1829.84,12748,Store,CA,2011-09-15,-14638.72,-8


In [68]:
# are SalesOrder values that start with a C all negative?
# .str.startswith is vectorized and, unlike indexing x[0] inside apply, does not
# raise on an empty string; na=False treats a missing SalesOrder as "not C".
df_sc = df[df.SalesOrder.str.startswith('C', na=False)]
(df_sc.Sales < 0).all()

False

In [69]:
# are all C-prefixed rows non-positive (negative or 0)?
df_sc.Sales.le(0).all()

True

In [71]:
# are SalesOrder values that don't start with a C all positive?
# .str.startswith is vectorized and, unlike indexing x[0] inside apply, does not
# raise on an empty string; na=False keeps rows with a missing SalesOrder on the "not C" side.
df_nc = df[~df.SalesOrder.str.startswith('C', na=False)]
(df_nc.Sales > 0).all()

True

In [72]:
# are SalesOrder values that don't start with a C all positive or 0?
# df_nc (rows whose SalesOrder does not start with 'C') was built in the previous
# cell; reuse it rather than recomputing the identical filter.
(df_nc.Sales >= 0).all()

True

I suppose we can infer that `SalesOrder` values that start with a C are acquisitions? And that those without a C are customer sales?

In [80]:
# all rows with Sales >= 0 (positive or zero) for SKU 23843, PAPER CRAFT , LITTLE BIRDIE
df.loc[df.SKU.eq('23843') & df.Sales.ge(0)]

Unnamed: 0,SalesOrder,SKU,Description,UnitPrice,CustomerID,Channel,State,InvoiceDay,Sales,Quantity
216014,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",2.08,16446,Organic Social,NC,2011-12-09,127788.96,61437
228465,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",2.08,16446,Organic Social,NC,2011-12-09,127788.96,61437
375304,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",2.08,16446,Organic Social,NC,2011-12-09,127788.96,61437


In [83]:
# all rows with Sales <= 0 (negative or zero) for SKU 23843, PAPER CRAFT , LITTLE BIRDIE
df.loc[df.SKU.eq('23843') & df.Sales.le(0)]

Unnamed: 0,SalesOrder,SKU,Description,UnitPrice,CustomerID,Channel,State,InvoiceDay,Sales,Quantity
91934,C581484,23843,"PAPER CRAFT , LITTLE BIRDIE",2.08,16446,Organic Social,NC,2011-12-09,-127788.96,-61437


Not sure what's going on. Looks like there could be duplicate orders, or mistakenly entered orders? For now, I think we can focus on positive value sales. I think we can also dedupe the data for identical rows. It's possible that a customer could've ordered over 180,000 little birdies in one day. But I think it must be a mistake.

In [85]:
# Drop rows that are identical in every column, keeping the first occurrence.
# Rebinding `df` (instead of inplace=True) follows the usual pandas idiom; the
# original index labels are kept, exactly as with the in-place call.
df = df.drop_duplicates()

# post-condition: no fully duplicated rows remain
assert not df.duplicated().any()

I think that's good for an initial pass at exploring and cleaning. Let's kick this over to a new notebook (`eda.ipynb`).

**Note**: John M. clarified that sales with negative amounts are refunds, and that `SalesOrder` is a Sale ID. The number of different channels for a sale can be exaggerated as it's a manufactured dataset.