In [52]:
import psycopg2
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the Redshift connection settings from a local .env file so that
# credentials never appear in the notebook itself.
# To run this yourself, create your own .env file that defines
# dbname, host, port, user and password rather than hard-coding them here.


import os
from dotenv import load_dotenv
load_dotenv()  

# os.getenv returns None for any variable that is not set
dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")


In [6]:
# connect to redshift using the settings read from the environment
connect = psycopg2.connect(dbname=dbname, host=host, port=port, user=user, password=password)
# NOTE(review): displaying the connection echoes its DSN (user, host, port) into the
# saved output - consider clearing this output before sharing the notebook
connect

<connection object at 0x12f1fa650; dsn: 'user=admin password=xxx dbname=dev host=redshift-cluster-1.c9gt5btzchps.eu-central-1.redshift.amazonaws.com port=5439', closed: 0>

In [7]:
# smoke test: read the first 10 rows of stock_description to confirm the connection works

query = """select *
           from bootcamp.stock_description
           limit 10"""

pd.read_sql(query, connect)

  pd.read_sql(query, connect)


Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [8]:
# read every row of the online_transactions table into a DataFrame

query = """select *
           from bootcamp.online_transactions
           """

online_transactions = pd.read_sql(query, connect)

  online_transactions = pd.read_sql(query, connect)


In [9]:
# (rows, columns) of the transactions frame
online_transactions.shape

(541910, 7)

In [10]:
# preview the first rows of the transactions frame
online_transactions.head()

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom
1,536368,22914,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom
2,536367,48187,4,2010-12-01 08:34:00,7.95,u13047,United Kingdom
3,536370,22726,12,2010-12-01 08:45:00,3.75,u12583,France
4,536375,82482,6,2010-12-01 09:32:00,2.1,u1785,United Kingdom


In [11]:
# write a query that reads all the rows from the stock description table and stores as a variable

query = """select *
           from bootcamp.stock_description
           """

stock_description = pd.read_sql(query, connect)

  stock_description = pd.read_sql(query, connect)


In [12]:
# (rows, columns) of the stock description frame
stock_description.shape

(3952, 2)

In [13]:
# preview the first rows of the stock description frame
stock_description.head()

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE


In [47]:
# data quality activity - list the rows that exactly repeat an earlier row
# (the first occurrence of each repeated row is not part of this listing)

online_transactions.loc[online_transactions.duplicated(keep="first")]

# options: drop every duplicated row, or keep a single copy of each

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
59,536464,21992,1,2010-12-01 12:23:00,2.95,u17968,United Kingdom
164,536528,84985A,1,2010-12-01 13:17:00,1.45,u15525,United Kingdom
326,536412,21448,1,2010-12-01 11:49:00,1.65,u1792,United Kingdom
813,536412,22141,1,2010-12-01 11:49:00,2.10,u1792,United Kingdom
1199,536412,21706,1,2010-12-01 11:49:00,4.95,u1792,United Kingdom
...,...,...,...,...,...,...,...
541604,581253,23427,1,2011-12-08 11:15:00,12.50,u16891,United Kingdom
541687,581414,23291,1,2011-12-08 14:39:00,1.25,u1473,United Kingdom
541770,581450,22118,1,2011-12-08 17:54:00,1.25,u16794,United Kingdom
541790,581471,21411,2,2011-12-08 19:29:00,1.95,u14702,United Kingdom


In [46]:
# number of duplicated rows - this count does not include the first occurrence of each row

online_transactions.duplicated().sum()

5270

In [48]:
# an example of duplicated data: every row returned for this invoice / stock code
# pair is identical

online_transactions.loc[
    online_transactions["invoice"].eq("536464")
    & online_transactions["stock_code"].eq("21992")
]

# the earlier duplicated() listing shows only the repeated copy of a row;
# this filter shows the first entry as well

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
54,536464,21992,1,2010-12-01 12:23:00,2.95,u17968,United Kingdom
59,536464,21992,1,2010-12-01 12:23:00,2.95,u17968,United Kingdom


In [49]:
online_transactions.loc[
    online_transactions["invoice"].eq("536412")
    & online_transactions["stock_code"].eq("21448")
]

# aah, but in this example we have exact duplicate rows AND the quantity for the
# same invoice / stock code is split across several rows
# as a group, we agreed to drop all the duplicated rows and sum the quantity

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
37,536412,21448,2,2010-12-01 11:49:00,1.65,u1792,United Kingdom
325,536412,21448,1,2010-12-01 11:49:00,1.65,u1792,United Kingdom
326,536412,21448,1,2010-12-01 11:49:00,1.65,u1792,United Kingdom
2061,536412,21448,2,2010-12-01 11:49:00,1.65,u1792,United Kingdom
2063,536412,21448,2,2010-12-01 11:49:00,1.65,u1792,United Kingdom


In [50]:
# data quality activity - identify missing data

online_transactions.isnull().sum()

# whoa, zero missing values - yet we have seen rows with a blank customer id.
# Presumably the missing values were not loaded as NULLs when the data was
# migrated to redshift (the next step treats them as empty strings)

invoice         0
stock_code      0
quantity        0
invoice_date    0
price           0
customer_id     0
country         0
dtype: int64

In [53]:
# isnull() only recognises real NaN/None values, so convert empty strings to NaN first
# (this rebinds online_transactions: the frame differs before and after this cell)

online_transactions = online_transactions.replace('', np.nan)  # empty string -> NaN


In [54]:
# how numpy's missing-value marker is displayed
np.nan

nan

In [55]:

# re-count missing values now that blanks have been converted to NaN
online_transactions.isnull().sum()

# 135k missing customer ids!!!! We will drop invoices with missing customer ids in our transformation tasks

invoice              0
stock_code           0
quantity             0
invoice_date         0
price                0
customer_id     135080
country              0
dtype: int64

In [59]:
# share of rows (as a percentage) that have no customer id - roughly a quarter
online_transactions["customer_id"].isna().sum() / len(online_transactions) * 100

24.92664833643963

In [60]:
# look for missing data in stock description table

stock_description.isnull().sum()

# hmm... zero again - blanks may be hiding as empty strings here too

stock_code     0
description    0
dtype: int64

In [61]:
# convert empty strings to NaN (rebinds stock_description), then re-count missing values

stock_description = stock_description.replace('', np.nan)   
stock_description.isnull().sum()

# ok, cant see any missing data right now, but..

stock_code     0
description    0
dtype: int64

In [62]:
# using describe to look at the distribution of data

stock_description.describe()

# the most frequent description is "?" - it looks like someone replaced the
# missing descriptions with ?

Unnamed: 0,stock_code,description
count,3952,3952
unique,3905,3785
top,22600,?
freq,2,47


In [63]:
# oh no... we have ?, and 6 stock codes with the same description and other duplicates appearing. This is such a mess!
# we may need to recalculate the stock code

# how many stock codes share each description
stock_description.description.value_counts()

description
?                                     47
METAL SIGN,CUPCAKE SINGLE HOOK         6
CINAMMON SET OF 9 T-LIGHTS             2
COLUMBIAN CANDLE RECTANGLE             2
3 WHITE CHOC MORRIS BOXED CANDLES      2
                                      ..
MAGNETS PACK OF 4 SWALLOWS             1
MAGNETS PACK OF 4 CHILDHOOD MEMORY     1
MAGNETS PACK OF 4 HOME SWEET HOME      1
MAGNETS PACK OF 4 VINTAGE COLLAGE      1
SAMPLES                                1
Name: count, Length: 3785, dtype: int64

In [64]:
# stock codes whose description is the placeholder "?"

stock_description[stock_description.description == "?"]

Unnamed: 0,stock_code,description
36,16020C,?
64,16207B,?
390,21145,?
451,21232,?
535,21368,?
584,21427,?
594,21446,?
675,21591,?
865,21877,?
997,22077,?


In [14]:
# describe() on the numeric columns shows that both price and quantity have negative minimums

online_transactions.describe()

Unnamed: 0,quantity,price
count,541910.0,541910.0
mean,9.552234,4.611138
std,218.080957,96.759765
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


In [65]:
# rows where price is less than 0.. oh! these rows also have no customer id

online_transactions[online_transactions.price < 0]

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
203327,A563187,B,1,2011-08-12 14:52:00,-11062.06,,United Kingdom
205118,A563186,B,1,2011-08-12 14:51:00,-11062.06,,United Kingdom


In [15]:
# the same negative-price filter, done in sql on the database side

query = """select *
           from bootcamp.online_transactions
           where price < 0
           """

pd.read_sql(query, connect)

  pd.read_sql(query, connect)


Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,A563187,B,1,2011-08-12 14:52:00,-11062.06,,United Kingdom
1,A563186,B,1,2011-08-12 14:51:00,-11062.06,,United Kingdom


In [19]:
# number of rows (not distinct invoices) with a negative quantity
query = """select count(*)
           from bootcamp.online_transactions
           where quantity < 0
           """

pd.read_sql(query, connect)


  pd.read_sql(query, connect)


Unnamed: 0,count
0,10624


In [66]:
# view the rows with a negative quantity
query = """select *
           from bootcamp.online_transactions
           where quantity < 0
           """

pd.read_sql(query, connect)

# as a group we agreed not to drop these invoices as they could be valuable data to analyse

  pd.read_sql(query, connect)


Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,C536391,21984,-24,2010-12-01 10:24:00,0.29,u17548,United Kingdom
1,C536543,22355,-2,2010-12-01 14:30:00,0.85,u17841,United Kingdom
2,C536757,84347,-9360,2010-12-02 14:23:00,0.03,u15838,United Kingdom
3,C536760,22175,-1,2010-12-02 14:29:00,2.95,u17547,United Kingdom
4,C536812,22574,-192,2010-12-02 16:58:00,0.72,u16546,United Kingdom
...,...,...,...,...,...,...,...
10619,C581235,22840,-2,2011-12-08 10:34:00,7.95,u1595,United Kingdom
10620,C581316,21531,-1,2011-12-08 11:46:00,2.55,u12523,France
10621,C581384,51008,-2,2011-12-08 13:06:00,3.45,u17673,United Kingdom
10622,C581390,23374,-10,2011-12-08 13:13:00,0.82,u13081,United Kingdom


In [67]:
# check the summary (dtypes and non-null counts) of the transactions table

online_transactions.info()

# invoice_date is appearing as data type object and not date time, everything else looks good

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice       541910 non-null  object 
 1   stock_code    541910 non-null  object 
 2   quantity      541910 non-null  int64  
 3   invoice_date  541910 non-null  object 
 4   price         541910 non-null  float64
 5   customer_id   406830 non-null  object 
 6   country       541910 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 28.9+ MB


In [68]:
# summary (dtypes and non-null counts) of the stock description table
stock_description.info()

# looks good, but will double check how many stock codes are in the online_transactions table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3952 entries, 0 to 3951
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   stock_code   3952 non-null   object
 1   description  3952 non-null   object
dtypes: object(2)
memory usage: 61.9+ KB


In [70]:
# number of distinct stock codes in the stock description table
stock_description["stock_code"].nunique(dropna=False)


3905

In [71]:
# number of distinct stock codes that appear in the transactions
online_transactions["stock_code"].nunique(dropna=False)

# hmm.. why does online_transactions have more stock codes?

4070

In [72]:
# count the distinct stock codes that have been purchased but have no usable
# entry in the stock description table
# (descriptions equal to '?' are excluded before the join, so those codes count as missing too)

query = """select count(distinct t1.stock_code)
           from bootcamp.online_transactions t1
           left join (select *
                      from bootcamp.stock_description
                      where description <> '?') t2 on t1.stock_code = t2.stock_code
           where t2.stock_code is null
           """

pd.read_sql(query, connect)

  pd.read_sql(query, connect)


Unnamed: 0,count
0,165


In [73]:
# viewing the transactions whose stock code has no usable entry
# in the stock description table ('?' descriptions are excluded before the join)

query = """select t1.*,
                  t2.Description
           from bootcamp.online_transactions t1
           left join (select *
                      from bootcamp.stock_description
                      where description <> '?') t2 on t1.stock_code = t2.stock_code
           where t2.stock_code is null
           """

pd.read_sql(query, connect)

# one observation: some of these rows have no customer id, so the follow-up
# query keeps only rows where customer_id is present

# one observation, there is a missing customer id so will filter out the missing customer_id

  pd.read_sql(query, connect)


Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536592,21705,2,2010-12-01 17:06:00,3.36,,United Kingdom,
1,536408,21705,12,2010-12-01 11:41:00,1.65,u14307,United Kingdom,
2,536595,21705,5,2010-12-01 17:24:00,1.65,u13576,United Kingdom,
3,536945,84247K,1,2010-12-03 12:24:00,2.95,u14083,United Kingdom,
4,C536379,D,-1,2010-12-01 09:41:00,27.50,u14527,United Kingdom,
...,...,...,...,...,...,...,...,...
2374,581439,21705,1,2011-12-08 16:30:00,3.29,,United Kingdom,
2375,581514,21705,84,2011-12-09 11:20:00,0.39,u17754,United Kingdom,
2376,581439,21704,3,2011-12-08 16:30:00,1.63,,United Kingdom,
2377,581469,21704,1,2011-12-08 19:28:00,0.85,u14606,United Kingdom,


In [74]:
# distinct stock codes that are missing from the stock description table,
# restricted to transactions that DO have a customer id (customer_id <> '')
# NOTE(review): unlike the previous two queries, this join does not exclude
# '?' descriptions - confirm whether that difference is intended

query = """select distinct t1.stock_code
           from bootcamp.online_transactions t1
           left join bootcamp.stock_description
                       t2 on t1.stock_code = t2.stock_code
           where t2.stock_code is null
            and customer_id <> ''
           """

pd.read_sql(query, connect)

# after filtering out rows with a missing customer id,
# only 22 stock codes are left without a description

  pd.read_sql(query, connect)


Unnamed: 0,stock_code
0,21705
1,84247K
2,D
3,21704
4,46000S
5,22889
6,16151A
7,CRUK
8,23702
9,46000R


In [75]:
# hmm, we can see we have stock codes that do not look like stock codes

# number of distinct invoices for each of the non-product stock codes
# bank charges, post, d, m, cruk
# No join to stock_description is needed here: the result only uses columns of
# online_transactions, and a left join can neither add nor remove invoices
# from a count(distinct invoice).

query = """select stock_code,
                  count(distinct invoice) as number_invoices
           from bootcamp.online_transactions
           where stock_code in ('BANK CHARGES', 
                                'POST', 'D', 'M', 'CRUK')
           group by stock_code
           order by number_invoices desc
           """

pd.read_sql(query, connect)

  pd.read_sql(query, connect)


Unnamed: 0,stock_code,number_invoices
0,POST,1255
1,M,517
2,D,65
3,BANK CHARGES,36
4,CRUK,16


In [76]:
# number of distinct invoices where price is 0 and the customer id is present

query = """select count(distinct invoice)
           from bootcamp.online_transactions
           where price = 0 and customer_id <> ''
           """

price_check = pd.read_sql(query, connect)
price_check

  price_check = pd.read_sql(query, connect)


Unnamed: 0,count
0,34


In [77]:
# the rows where price is 0 and the customer id is present
# (rebinds price_check, replacing the count stored by the previous query)
query = """select *
           from bootcamp.online_transactions
           where price = 0 and customer_id <> ''
           """

price_check = pd.read_sql(query, connect)
price_check

  price_check = pd.read_sql(query, connect)


Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,540372,22553,24,2011-01-06 16:41:00,0.0,u13081,United Kingdom
1,539722,22423,10,2010-12-21 13:45:00,0.0,u14911,EIRE
2,550188,22636,1,2011-04-14 18:57:00,0.0,u12457,Switzerland
3,543599,84535B,16,2011-02-10 13:08:00,0.0,u1756,United Kingdom
4,574175,22065,12,2011-11-03 11:47:00,0.0,u1411,United Kingdom
5,548318,22055,5,2011-03-30 12:45:00,0.0,u13113,United Kingdom
6,553000,47566,4,2011-05-12 15:21:00,0.0,u17667,United Kingdom
7,539263,22580,4,2010-12-16 14:36:00,0.0,u1656,United Kingdom
8,564651,23268,192,2011-08-26 14:19:00,0.0,u14646,Netherlands
9,547417,22062,36,2011-03-23 10:25:00,0.0,u13239,United Kingdom


Data Quality Audit

- We have 5270 rows of duplicated data which we will drop. We also observed cases where the same invoice, stock code and date appear with different quantities. We will fix this by summing the quantity per invoice and stock code.
- About 25% of rows (135,080 of 541,910) do not have a customer id
- We need to fix the invoice_date field: it is loaded as data type object and should be converted to a datetime.
- We need to do further investigation on stock codes: 165 purchased stock codes have no usable description, and 22 of them appear on transactions that do have a customer_id.
- We have 5k invoices (10,624 rows) with a negative quantity, with one invoice having a negative quantity of 81k.
- There are two invoices with negative price, but they also do not have a customer id.
- We spotted 2155 invoices where price is 0, of which only 34 invoices have a customer id. What should we do about this?
- We have spotted 47 stock codes with the description ?, and also stock codes with the description post (1255 invoices), bank charges..