In [1]:
import psycopg2
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# NEVER share passwords
# this only works with a .env file, you can hardcode the values for now


import os

from dotenv import load_dotenv
load_dotenv()  # only for local testing


dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")

In [3]:
# connect to redshift

connect = psycopg2.connect(
        dbname=dbname, host=host, port=port, user=user, password=password
    )

cursor = connect.cursor()

In [4]:
# query all rows and columns from online_transactions table

query = """select *
           from bootcamp.stock_description
           limit 10
           """

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [5]:
# query all rows and columns from online_transactions table

query = """select *
           from bootcamp.online_transactions
           """

online_transactions = pd.read_sql(query, connect)
online_transactions

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
1,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom
2,536368,22912,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom
3,536367,22748,6,2010-12-01 08:34:00,2.10,u13047,United Kingdom
4,536367,22623,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom
...,...,...,...,...,...,...,...
541905,581586,23275,24,2011-12-09 12:49:00,1.25,u13113,United Kingdom
541906,581587,22556,12,2011-12-09 12:50:00,1.65,u1268,France
541907,581587,22726,4,2011-12-09 12:50:00,3.75,u1268,France
541908,581587,23256,4,2011-12-09 12:50:00,4.15,u1268,France


In [6]:
# query all rows and columns from stock_description table

query = """select *
           from bootcamp.stock_description
           """

stock_description = pd.read_sql(query, connect)
stock_description


Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
...,...,...
3947,DCGSSGIRL,GIRLS PARTY BAG
3948,DOT,DOTCOM POSTAGE
3949,PADS,PADS TO MATCH ALL CUSHIONS
3950,POST,POSTAGE


In [7]:
query = """select count(*)
           from bootcamp.online_transactions
           where customer_id = ''
           """

pd.read_sql(query, connect)


Unnamed: 0,count
0,135080


In [8]:
type(online_transactions)

pandas.core.frame.DataFrame

In [9]:
type(stock_description)

pandas.core.frame.DataFrame

In [10]:
online_transactions.shape

# 541k rows and 7 columns

(541910, 7)

In [11]:
stock_description.shape

# 4k rows and 2 columns

(3952, 2)

In [12]:
# the number of unique stock codes
# using python to calculate this

len(stock_description.stock_code.unique())

3905

In [13]:
# option 2 - using sql

query = """select count(distinct stock_code)
           from bootcamp.online_transactions
           """

pd.read_sql(query, connect)

Unnamed: 0,count
0,4070


In [14]:
# how many stocks do we have in the online transactions table
# option 1

len(online_transactions.stock_code.unique())

# question: which stock codes are missing from the stock description table?


4070

In [15]:
# we have 165 stock codes potentially where we don't have the description
len(online_transactions.stock_code.unique()) - len(stock_description.stock_code.unique())

165

In [16]:
# check for duplicated data

online_transactions.duplicated().sum()

# 5270 rows of duplicated data

5270

In [17]:
# % of rows with duplicated data
online_transactions.duplicated().sum()/online_transactions.shape[0]*100

0.972486206196601

In [18]:
# preview the duplicated data

online_transactions[online_transactions.duplicated()]

# we can either drop all the duplicated data or keep only one

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
294,536464,22866,1,2010-12-01 12:23:00,2.10,u17968,United Kingdom
375,536412,22327,1,2010-12-01 11:49:00,2.95,u1792,United Kingdom
377,536412,22273,1,2010-12-01 11:49:00,2.95,u1792,United Kingdom
392,536412,21708,1,2010-12-01 11:49:00,4.95,u1792,United Kingdom
393,536412,85184C,1,2010-12-01 11:49:00,2.95,u1792,United Kingdom
...,...,...,...,...,...,...,...
541479,581471,21411,2,2011-12-08 19:29:00,1.95,u14702,United Kingdom
541835,581514,22075,24,2011-12-09 11:20:00,0.39,u17754,United Kingdom
541846,581538,23275,1,2011-12-09 11:34:00,1.25,u14446,United Kingdom
541851,581538,22068,1,2011-12-09 11:34:00,0.39,u14446,United Kingdom


In [19]:
# an example of duplicated data
# both rows are identical

online_transactions[(online_transactions.invoice == "536464") &
                   (online_transactions.stock_code == "21992")]

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
520,536464,21992,1,2010-12-01 12:23:00,2.95,u17968,United Kingdom
531,536464,21992,1,2010-12-01 12:23:00,2.95,u17968,United Kingdom


In [20]:
# using describe to look at the distribution of data

online_transactions.describe()

Unnamed: 0,quantity,price
count,541910.0,541910.0
mean,9.552234,4.611138
std,218.080957,96.759765
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


In [21]:
# using describe to look at the distribution of data

stock_description.describe()

# hmm.. AiWah spotted there were 47 cases of ? in description, let's investigate

Unnamed: 0,stock_code,description
count,3952,3952
unique,3905,3785
top,22600,?
freq,2,47


In [22]:
# oh no... we have ?, and 6 stock codes with the same description and other duplicates appearing. This is such a mess!
# we may need to recalculate the stock code

stock_description.description.value_counts()

description
?                                     47
METAL SIGN,CUPCAKE SINGLE HOOK         6
CINAMMON SET OF 9 T-LIGHTS             2
COLUMBIAN CANDLE RECTANGLE             2
3 WHITE CHOC MORRIS BOXED CANDLES      2
                                      ..
MAGNETS PACK OF 4 SWALLOWS             1
MAGNETS PACK OF 4 CHILDHOOD MEMORY     1
MAGNETS PACK OF 4 HOME SWEET HOME      1
MAGNETS PACK OF 4 VINTAGE COLLAGE      1
SAMPLES                                1
Name: count, Length: 3785, dtype: int64

In [23]:
# stock codes where description is ?

stock_description[stock_description.description == "?"]

Unnamed: 0,stock_code,description
36,16020C,?
64,16207B,?
390,21145,?
451,21232,?
535,21368,?
584,21427,?
594,21446,?
675,21591,?
865,21877,?
997,22077,?


In [24]:
# invoices where price is less than 0.. oh! we see we have missing customer ids in this case

online_transactions[online_transactions.price < 0]

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
302331,A563187,B,1,2011-08-12 14:52:00,-11062.06,,United Kingdom
307176,A563186,B,1,2011-08-12 14:51:00,-11062.06,,United Kingdom


In [25]:
# using sql to identify rows where price less than 0

query = """select *
           from bootcamp.online_transactions
          where price < 0
           """

pd.read_sql(query, connect)

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,A563186,B,1,2011-08-12 14:51:00,-11062.06,,United Kingdom
1,A563187,B,1,2011-08-12 14:52:00,-11062.06,,United Kingdom


In [26]:
# invoices where quantity is less than 0

online_transactions[online_transactions.quantity < 0]

# we may want to remove rows of data where quantity < 0 in future iterations

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
58,C536391,22556,-12,2010-12-01 10:24:00,1.65,u17548,United Kingdom
59,C536391,21484,-12,2010-12-01 10:24:00,3.45,u17548,United Kingdom
139,C536391,21984,-24,2010-12-01 10:24:00,0.29,u17548,United Kingdom
140,C536391,22557,-12,2010-12-01 10:24:00,1.65,u17548,United Kingdom
894,C536548,22242,-5,2010-12-01 14:33:00,1.65,u12472,Germany
...,...,...,...,...,...,...,...
541453,C581465,23660,-2,2011-12-08 18:59:00,1.65,u15755,United Kingdom
541454,C581466,22838,-1,2011-12-08 19:20:00,14.95,u13883,United Kingdom
541455,C581466,21232,-1,2011-12-08 19:20:00,1.25,u13883,United Kingdom
541468,C581470,23084,-4,2011-12-08 19:28:00,2.08,u17924,United Kingdom


In [27]:
# where one invoice has a negative quantity less than 0

online_transactions.quantity.min()

-80995

In [28]:
# look for missing data in both tables

online_transactions.isnull().sum()

# whoa, what is going on here. we just saw some missing customer ids. Looks like
# when migrating the data to redshift, the missing data was not handled correctly

invoice         0
stock_code      0
quantity        0
invoice_date    0
price           0
customer_id     0
country         0
dtype: int64

In [29]:
# but I can try to fix this by replacing the blanks with python recognised nan

online_transactions = online_transactions.replace('', np.nan)                   # to get rid of empty values

# we will fix this is in our transformation tasks

In [30]:
np.nan

nan

In [31]:
# check for missing values again

online_transactions.isnull().sum()

# 135k missing customer ids!!!! We will drop invoices with missing customer ids in our transformation tasks

invoice              0
stock_code           0
quantity             0
invoice_date         0
price                0
customer_id     135080
country              0
dtype: int64

In [32]:
# percentage of rows without a customer id

round(online_transactions.customer_id.isnull().sum()/online_transactions.shape[0])

0

In [33]:
# look for missing data in stock description table

stock_description.isnull().sum()

# hmm...

stock_code     0
description    0
dtype: int64

In [34]:
# fix missing data

stock_description = stock_description.replace('', np.nan)   
stock_description.isnull().sum()

# ok, cant see any missing data right now, but looks like someone replaced the missing values with ?

stock_code     0
description    0
dtype: int64

In [35]:
# check for the summary of both tables

online_transactions.info()

# invoice_date is appearing as data type object and not date time, everything else looks good

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice       541910 non-null  object 
 1   stock_code    541910 non-null  object 
 2   quantity      541910 non-null  int64  
 3   invoice_date  541910 non-null  object 
 4   price         541910 non-null  float64
 5   customer_id   406830 non-null  object 
 6   country       541910 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 28.9+ MB


In [36]:
stock_description.info()

# loos good, but will double check how much stock codes are in the online_transactions table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3952 entries, 0 to 3951
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   stock_code   3952 non-null   object
 1   description  3952 non-null   object
dtypes: object(2)
memory usage: 61.9+ KB


In [37]:
len(online_transactions.stock_code.unique())

# hmm.. why does online_transactions have more stock codes?

4070

In [38]:
# writing a query to identify how many stock codes 
# have been purchased but are not in the stock description table

query = """select count(distinct t1.stock_code)
           from bootcamp.online_transactions t1
           left join (select *
                      from bootcamp.stock_description
                      where description <> '?') t2 on t1.stock_code = t2.stock_code
           where t2.stock_code is null
           """

pd.read_sql(query, connect)



Unnamed: 0,count
0,165


In [39]:
# viewing the data where stock codes are missing
# in the stock description table

query = """select t1.*,
                  t2.Description
           from bootcamp.online_transactions t1
           left join (select *
                      from bootcamp.stock_description
                      where description <> '?') t2 on t1.stock_code = t2.stock_code
           where t2.stock_code is null
           """

pd.read_sql(query, connect)

# one observation, there is a missing customer id so will filter out the missing customer_id

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536544,21704,1,2010-12-01 14:32:00,1.66,,United Kingdom,
1,536549,85226A,1,2010-12-01 14:34:00,0.00,,United Kingdom,
2,536557,46000S,1,2010-12-01 14:41:00,1.45,u17841,United Kingdom,
3,536569,M,1,2010-12-01 15:35:00,1.25,u16274,United Kingdom,
4,536592,21594,1,2010-12-01 17:06:00,4.21,,United Kingdom,
...,...,...,...,...,...,...,...,...
2374,581405,M,3,2011-12-08 13:50:00,0.42,u13521,United Kingdom,
2375,581406,46000S,300,2011-12-08 13:58:00,0.00,,United Kingdom,
2376,581475,21705,24,2011-12-09 08:39:00,0.39,u13069,United Kingdom,
2377,581492,21703,1,2011-12-09 10:03:00,0.83,,United Kingdom,


In [40]:
# looking at the data where stock codes are missing 
# in the stock description table and customer id is null

query = """select distinct t1.stock_code
           from bootcamp.online_transactions t1
           left join bootcamp.stock_description
                       t2 on t1.stock_code = t2.stock_code
           where t2.stock_code is null
            and customer_id <> ''
           """

pd.read_sql(query, connect)

# one observation, there is a missing customer id so will filter out the missing customer_id
# only 22 stock codes 

Unnamed: 0,stock_code
0,18007
1,46000M
2,BANK CHARGES
3,23444
4,16162M
5,M
6,46000U
7,46000R
8,20964
9,21705


In [41]:
# query that gives us the number of invoices with stock codes
# bank charges, post, d, m, cruk

query = """select t1.stock_code,
                  count(distinct t1.invoice) as number_invoices
           from bootcamp.online_transactions t1
           left join (select *
                     from bootcamp.stock_description
                     where description <> '?') t2 on t1.stock_code = t2.stock_code
           where t1.stock_code in ('BANK CHARGES', 
                                'POST', 'D', 'M', 'CRUK')
           group by t1.stock_code
           order by number_invoices desc
           """

pd.read_sql(query, connect)

Unnamed: 0,stock_code,number_invoices
0,POST,1255
1,M,517
2,D,65
3,BANK CHARGES,36
4,CRUK,16


In [42]:
# number of invoices where price < 0

query = """select count(distinct invoice)
           from bootcamp.online_transactions
           where price = 0 and customer_id <> ''
           """

price_check = pd.read_sql(query, connect)
price_check

Unnamed: 0,count
0,34


In [43]:
# number of invoices where quantity < 0

query = """select count(distinct invoice)
           from bootcamp.online_transactions
           where quantity < 0
           """

pd.read_sql(query, connect)

Unnamed: 0,count
0,5172


In [44]:
connect.close()

Data Quality Audit
- We have 5270 rows of duplicated data and we want to keep only one instance
- 24% of rows do not have a customer id
- We need to fix the invoice_date field
- We need to do further investigation on stock code, but 22 stock codes without customer_id do not have a descriptions.
- We have 5k invoices with a negative quantity, with one invoice having a negative quantity of 81k. 
- There are two invoices with negative price, but they also do not have a customer id.
- We spotted 2155 invoices where price is 0, and only 34 invoices without a customer id that have price 0.
- We have spotted 47 stock codes with the description ?, and also stock codes with the description post (1255 invoices), bank charges..
