# Importing libraries

In [20]:
import pandas as pd
import zipfile

# Importing data

In [21]:
# Unziping file
with zipfile.ZipFile('../data/ecommerce-data.zip') as zip_file:
    zip_file.extractall('../data/raw/')

In [None]:
df = pd.read_csv('../data/data.csv', encoding='latin1')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


# Dataset overview

In [23]:
print(f'The dataset consists in {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset consists in 541909 rows and 8 columns.


Checking how many unique values each column has:

In [24]:
dict_unique_values = {}
for col in df.columns:
    dict_unique_values[col] = df[col].nunique()

summary_df = pd.DataFrame(dict_unique_values, index=['Unique values']).T
summary_df = summary_df.reset_index().rename(columns={'index': 'Columns'})

summary_df

Unnamed: 0,Columns,Unique values
0,InvoiceNo,25900
1,StockCode,4070
2,Description,4223
3,Quantity,722
4,InvoiceDate,23260
5,UnitPrice,1630
6,CustomerID,4372
7,Country,38


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


- Despite `InvoiceNo` and `StockCode` seem numerical, both have some values with interspersed characters, so the data type as string is correct. 
- `CustomerID` is indeed a numerical column, but it represents an ID column. It is better to use it as integer values, not float.
- For `InvoiceDate`, we will change its type to a properly date type later, in data cleaning step.
- We can notice some null values in `Description` and `CustomerID` columns.

## Checking missing values

In [26]:
df[df['Description'].isnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.0,,United Kingdom
1970,536545,21134,,1,12/1/2010 14:32,0.0,,United Kingdom
1971,536546,22145,,1,12/1/2010 14:33,0.0,,United Kingdom
1972,536547,37509,,1,12/1/2010 14:33,0.0,,United Kingdom
1987,536549,85226A,,1,12/1/2010 14:34,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
535322,581199,84581,,-2,12/7/2011 18:26,0.0,,United Kingdom
535326,581203,23406,,15,12/7/2011 18:31,0.0,,United Kingdom
535332,581209,21620,,6,12/7/2011 18:35,0.0,,United Kingdom
536981,581234,72817,,27,12/8/2011 10:33,0.0,,United Kingdom


We will keep rows with missing `CustomerID` values as they can represent purchases/transactions with no registered customer.
For `Description`, we will later fill based in the `StockCode` (if found a present descrption in another row from that ID) and, for that not found, fill with "UNKNOWN PRODUCT".

Let's check if there's any duplicated values.

In [27]:
print(f'The dataset contains {df.duplicated().sum()} duplicated values.')

The dataset contains 5268 duplicated values.


In [28]:
df[df.duplicated(keep=False)].sort_values(by=['InvoiceNo', 'StockCode'])

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
494,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,12/1/2010 11:45,1.25,17908.0,United Kingdom
517,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,12/1/2010 11:45,1.25,17908.0,United Kingdom
485,536409,22111,SCOTTIE DOG HOT WATER BOTTLE,1,12/1/2010 11:45,4.95,17908.0,United Kingdom
539,536409,22111,SCOTTIE DOG HOT WATER BOTTLE,1,12/1/2010 11:45,4.95,17908.0,United Kingdom
489,536409,22866,HAND WARMER SCOTTY DOG DESIGN,1,12/1/2010 11:45,2.10,17908.0,United Kingdom
...,...,...,...,...,...,...,...,...
440149,C574510,22360,GLASS JAR ENGLISH CONFECTIONERY,-1,11/4/2011 13:25,2.95,15110.0,United Kingdom
461407,C575940,23309,SET OF 60 I LOVE LONDON CAKE CASES,-24,11/13/2011 11:38,0.55,17838.0,United Kingdom
461408,C575940,23309,SET OF 60 I LOVE LONDON CAKE CASES,-24,11/13/2011 11:38,0.55,17838.0,United Kingdom
529980,C580764,22667,RECIPE BOX RETROSPOT,-12,12/6/2011 10:38,2.95,14562.0,United Kingdom


Since the duplicated rows consists in exact same values in all columns, we will later remove them.

## Numerical values

In [29]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [30]:
df[df['Quantity'] < -5000]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
4287,C536757,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,-9360,12/2/2010 14:23,0.03,15838.0,United Kingdom
61624,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215,1/18/2011 10:17,1.04,12346.0,United Kingdom
115818,546152,72140F,throw away,-5368,3/9/2011 17:25,0.0,,United Kingdom
225528,556687,23003,Printing smudges/thrown away,-9058,6/14/2011 10:36,0.0,,United Kingdom
225529,556690,23005,printing smudges/thrown away,-9600,6/14/2011 10:37,0.0,,United Kingdom
225530,556691,23005,printing smudges/thrown away,-9600,6/14/2011 10:37,0.0,,United Kingdom
540422,C581484,23843,"PAPER CRAFT , LITTLE BIRDIE",-80995,12/9/2011 9:27,2.08,16446.0,United Kingdom


In [31]:
df[df['UnitPrice'] < 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
299983,A563186,B,Adjust bad debt,1,8/12/2011 14:51,-11062.06,,United Kingdom
299984,A563187,B,Adjust bad debt,1,8/12/2011 14:52,-11062.06,,United Kingdom


It may be weird finding negative `Quantity` values, but they represent returned purchases or discarded products, and does not need to be dropped or handled at all.
In the case of `UnitPrice`, as the description tells, they represent "adjusts", probably for accounting purposes

## Cathegorical values

Checking if there's any misspelled country name:

In [32]:
countries = df['Country'].unique()
countries.sort()
countries

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Cyprus', 'Czech Republic', 'Denmark', 'EIRE',
       'European Community', 'Finland', 'France', 'Germany', 'Greece',
       'Hong Kong', 'Iceland', 'Israel', 'Italy', 'Japan', 'Lebanon',
       'Lithuania', 'Malta', 'Netherlands', 'Norway', 'Poland',
       'Portugal', 'RSA', 'Saudi Arabia', 'Singapore', 'Spain', 'Sweden',
       'Switzerland', 'USA', 'United Arab Emirates', 'United Kingdom',
       'Unspecified'], dtype=object)

In [33]:
df['Description'].value_counts()

Description
WHITE HANGING HEART T-LIGHT HOLDER    2369
REGENCY CAKESTAND 3 TIER              2200
JUMBO BAG RED RETROSPOT               2159
PARTY BUNTING                         1727
LUNCH BAG RED RETROSPOT               1638
                                      ... 
ANT SILVER TURQUOISE BOUDICCA RING       1
Damages                                  1
Found by jackie                          1
water damaged                            1
lost in space                            1
Name: count, Length: 4223, dtype: int64

Let's check if there's `StockCode` with more than one different `Description` value

In [34]:
unique_descriptions = df.groupby('StockCode')['Description'].nunique()
problematic_products = unique_descriptions[unique_descriptions > 1]
problematic_products.sort_values(ascending=False)

StockCode
20713       8
23084       7
21830       6
85175       6
23131       5
           ..
90195A      2
90210D      2
DCGS0003    2
DCGS0069    2
85215       2
Name: Description, Length: 650, dtype: int64

We will choose to standardize the descriptions and keep the most frequent ones on the data cleaning step.

In [35]:
final_desc = (
    df.groupby(['StockCode', 'Description'])
    .size()
    .reset_index(name='count')
    .sort_values(['StockCode', 'count'], ascending=[True, False])
    .drop_duplicates('StockCode')
)
final_desc.head()

Unnamed: 0,StockCode,Description,count
0,10002,INFLATABLE POLITICAL GLOBE,71
1,10080,GROOVY CACTUS INFLATABLE,22
3,10120,DOGGY RUBBER,30
4,10123C,HEARTS WRAPPING TAPE,3
5,10124A,SPOTS ON RED BOOKCOVER TAPE,5


## Date values

In [36]:
df.sort_values(by='InvoiceDate')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
50806,540558,21258,?,-29,1/10/2011 10:04,0.00,,United Kingdom
50807,C540559,21888,BINGO SET,-4,1/10/2011 10:07,3.75,,EIRE
50808,540560,21589,,-14,1/10/2011 10:08,0.00,,United Kingdom
50828,540561,21385,IVORY HANGING DECORATION HEART,24,1/10/2011 10:32,0.85,13004.0,United Kingdom
50815,540561,21232,STRAWBERRY CERAMIC TRINKET BOX,12,1/10/2011 10:32,1.25,13004.0,United Kingdom
...,...,...,...,...,...,...,...,...
332549,566079,20838,FRENCH LATTICE CUSHION COVER,12,9/9/2011 9:52,0.85,17593.0,United Kingdom
332550,566079,22400,MAGNETS PACK OF 4 HOME SWEET HOME,24,9/9/2011 9:52,0.39,17593.0,United Kingdom
332551,566079,22396,MAGNETS PACK OF 4 RETRO PHOTO,24,9/9/2011 9:52,0.39,17593.0,United Kingdom
332571,566079,22923,FRIDGE MAGNETS LES ENFANTS ASSORTED,24,9/9/2011 9:52,0.85,17593.0,United Kingdom


In [19]:
df.sample(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
43568,540098,21621,VINTAGE UNION JACK BUNTING,1,1/4/2011 15:50,8.5,16241.0,United Kingdom
271321,560689,23209,LUNCH BAG DOILEY PATTERN,2,7/20/2011 11:55,1.65,15039.0,United Kingdom
123570,546899,22386,JUMBO BAG PINK POLKADOT,30,3/17/2011 18:27,1.65,14298.0,United Kingdom
184175,552677,22668,PINK BABY BUNTING,2,5/10/2011 15:07,5.79,,United Kingdom
68411,541866,21985,PACK OF 12 HEARTS DESIGN TISSUES,24,1/24/2011 9:25,0.29,16477.0,United Kingdom
47254,540402,82581,TOILET METAL SIGN,48,1/7/2011 9:31,0.55,16567.0,United Kingdom
262092,559898,22789,T-LIGHT HOLDER SWEETHEART HANGING,4,7/13/2011 12:18,1.95,16225.0,United Kingdom
40297,539735,37370,RETRO COFFEE MUGS ASSORTED,1,12/21/2010 15:17,16.13,,United Kingdom
101630,544928,23000,TRAVEL CARD WALLET TRANSPORT,2,2/24/2011 18:07,0.83,,United Kingdom
26758,538523,21843,RED RETROSPOT CAKE STAND,1,12/13/2010 9:29,10.95,13198.0,United Kingdom
