## Dataset source: https://archive.ics.uci.edu/dataset/352/online+retail

In [21]:
# import dependencies
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [22]:
# Load the Online Retail workbook (an Excel file, not a CSV) into a DataFrame
# and preview the first five rows.
df = pd.read_excel(io="Resources/online_retail.xlsx")
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [23]:
# Display the last five rows of the dataframe
df.tail(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [24]:
# Summarise the dataframe: column names, non-null counts, dtypes and memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


## Missing Values

In [25]:
# Count the missing values in each column
df.isna().sum()


InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [26]:
# Keep only the rows whose "Description" is populated, then re-count the
# missing values per column to confirm the column is now complete.
df = df[df["Description"].notna()]
df.isna().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133626
Country             0
dtype: int64

In [27]:
# Select the rows that have no CustomerID and preview the first five
cust_missing = df.loc[df["CustomerID"].isna()]
cust_missing.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
1447,536544,21790,VINTAGE SNAP CARDS,9,2010-12-01 14:32:00,1.66,,United Kingdom


In [28]:
# Smallest CustomerID present in the data (missing values are skipped by min)
df.loc[:, "CustomerID"].min()

12346.0

In [29]:
# Work on an independent copy so later edits to df1 do not touch df
df1 = df.copy(deep=True)

In [30]:
# Rows without a CustomerID are kept and tagged with a sentinel ID that
# represents GUEST CHECKOUT, instead of being dropped.
GUEST_CUSTOMER_ID = 1001

# The sentinel must not collide with a real customer. The check reads `df`,
# whose CustomerID column has not been filled, so re-running this cell after
# df1 already contains the sentinel does not trip the assertion.
assert not df["CustomerID"].eq(GUEST_CUSTOMER_ID).any(), (
    "GUEST_CUSTOMER_ID collides with a real CustomerID"
)

# NOTE(review): the column stays float64 after the fill, so IDs display as
# e.g. 17850.0 -- consider casting to an integer dtype if consumers of the
# exported tables expect integer IDs.
df1["CustomerID"] = df1["CustomerID"].fillna(GUEST_CUSTOMER_ID)
df1.isnull().sum()


InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

## Duplicated Rows

In [31]:
# Show rows that exactly repeat an earlier row (the first occurrence of each
# repeated row is not flagged)
df1.loc[df1.duplicated(keep="first")]


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
517,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,2010-12-01 11:45:00,1.25,17908.0,United Kingdom
527,536409,22866,HAND WARMER SCOTTY DOG DESIGN,1,2010-12-01 11:45:00,2.10,17908.0,United Kingdom
537,536409,22900,SET 2 TEA TOWELS I LOVE LONDON,1,2010-12-01 11:45:00,2.95,17908.0,United Kingdom
539,536409,22111,SCOTTIE DOG HOT WATER BOTTLE,1,2010-12-01 11:45:00,4.95,17908.0,United Kingdom
555,536412,22327,ROUND SNACK BOXES SET OF 4 SKULLS,1,2010-12-01 11:49:00,2.95,17920.0,United Kingdom
...,...,...,...,...,...,...,...,...
541675,581538,22068,BLACK PIRATE TREASURE CHEST,1,2011-12-09 11:34:00,0.39,14446.0,United Kingdom
541689,581538,23318,BOX OF 6 MINI VINTAGE CRACKERS,1,2011-12-09 11:34:00,2.49,14446.0,United Kingdom
541692,581538,22992,REVOLVER WOODEN RULER,1,2011-12-09 11:34:00,1.95,14446.0,United Kingdom
541699,581538,22694,WICKER STAR,1,2011-12-09 11:34:00,2.10,14446.0,United Kingdom


In [32]:
# Count the distinct values in each column
df1.nunique()


InvoiceNo      24446
StockCode       3958
Description     4223
Quantity         671
InvoiceDate    22309
UnitPrice       1630
CustomerID      4373
Country           38
dtype: int64

In [33]:
# List the column names available for building the tables below
df1.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

## Create Customer, Product, Transaction Tables 

In [34]:
# Show every row for stock code 21866. StockCode is an object column and the
# code is matched here as an integer, not as the string "21866".
df1.loc[df1["StockCode"].eq(21866)]


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
494,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,2010-12-01 11:45:00,1.25,17908.0,United Kingdom
517,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,2010-12-01 11:45:00,1.25,17908.0,United Kingdom
6308,536876,21866,UNION JACK FLAG LUGGAGE TAG,1,2010-12-03 11:36:00,2.51,1001.0,United Kingdom
6864,536982,21866,UNION JACK FLAG LUGGAGE TAG,8,2010-12-03 14:27:00,3.36,1001.0,United Kingdom
12236,537371,21866,UNION JACK FLAG LUGGAGE TAG,8,2010-12-06 12:47:00,1.25,15028.0,United Kingdom
...,...,...,...,...,...,...,...,...
528914,580730,21866,UNION JACK FLAG LUGGAGE TAG,2,2011-12-05 17:28:00,2.46,1001.0,United Kingdom
532212,580983,21866,UNION JACK FLAG LUGGAGE TAG,3,2011-12-06 16:26:00,2.46,1001.0,United Kingdom
534838,581173,21866,UNION JACK FLAG LUGGAGE TAG,1,2011-12-07 15:07:00,1.25,17870.0,United Kingdom
536518,581219,21866,UNION JACK FLAG LUGGAGE TAG,1,2011-12-08 09:28:00,2.46,1001.0,United Kingdom


In [None]:
# Build the product table from the product-related columns of df1
product_df = df1.loc[:, ["StockCode", "Description", "UnitPrice"]]
product_df.head(5)

Unnamed: 0,StockCode,Description,UnitPrice
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2.55
1,71053,WHITE METAL LANTERN,3.39
2,84406B,CREAM CUPID HEARTS COAT HANGER,2.75
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39
4,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39


In [35]:
# Keep one row per StockCode: the first row seen for each code survives, so
# its Description and UnitPrice are the ones stored in the product table.
# NOTE(review): UnitPrice is not constant per StockCode (code 21866 appears
# with 1.25, 2.51 and 3.36 in the data), so the stored price is simply the
# first one encountered -- confirm that is acceptable.
product_df = product_df.loc[~product_df.duplicated(subset=["StockCode"], keep="first")]
product_df.head(5)

Unnamed: 0,StockCode,Description,UnitPrice
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2.55
1,71053,WHITE METAL LANTERN,3.39
2,84406B,CREAM CUPID HEARTS COAT HANGER,2.75
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39
4,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39


In [36]:
# Check the product table: row count, non-null counts and dtypes
product_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3958 entries, 0 to 540421
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   StockCode    3958 non-null   object 
 1   Description  3958 non-null   object 
 2   UnitPrice    3958 non-null   float64
dtypes: float64(1), object(2)
memory usage: 123.7+ KB


In [37]:
# Build the customer table from the customer-related columns of df1
customer_df = df1.loc[:, ["CustomerID", "Country"]]
customer_df.head(5)

Unnamed: 0,CustomerID,Country
0,17850.0,United Kingdom
1,17850.0,United Kingdom
2,17850.0,United Kingdom
3,17850.0,United Kingdom
4,17850.0,United Kingdom


In [38]:
# Keep one row per CustomerID; the Country stored is the one on the first row
# seen for that customer.
# NOTE(review): every guest checkout shares the placeholder ID 1001, so all
# guests collapse into a single customer row with a single Country.
customer_df = customer_df.drop_duplicates(subset="CustomerID", keep="first")
customer_df.head(5)


Unnamed: 0,CustomerID,Country
0,17850.0,United Kingdom
9,13047.0,United Kingdom
26,12583.0,France
46,13748.0,United Kingdom
65,15100.0,United Kingdom


In [39]:
# Check the customer table: row count, non-null counts and dtypes
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4373 entries, 0 to 541768
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CustomerID  4373 non-null   float64
 1   Country     4373 non-null   object 
dtypes: float64(1), object(1)
memory usage: 102.5+ KB


In [40]:
# Build the transaction table: one row per invoice line, linked to the product
# table through StockCode and to the customer table through CustomerID.
transaction_df = df1.loc[
    :, ["InvoiceNo", "StockCode", "Quantity", "InvoiceDate", "CustomerID"]
]
transaction_df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,CustomerID
0,536365,85123A,6,2010-12-01 08:26:00,17850.0
1,536365,71053,6,2010-12-01 08:26:00,17850.0
2,536365,84406B,8,2010-12-01 08:26:00,17850.0
3,536365,84029G,6,2010-12-01 08:26:00,17850.0
4,536365,84029E,6,2010-12-01 08:26:00,17850.0


In [41]:
# Save the product, customer and transaction tables to CSV files.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing; otherwise this cell fails on a
# fresh checkout that has no "output" directory yet.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame index out of the written files.
product_df.to_csv(output_dir / "product.csv", index=False)
customer_df.to_csv(output_dir / "customer.csv", index=False)
transaction_df.to_csv(output_dir / "transaction.csv", index=False)