We are going to extract data from Redshift and carry out the following transformation tasks:
- remove all rows where customer id is missing - SQL 
- remove certain stock codes - SQL 
- add description to the online transactions table - SQL 
- remove all duplicate data - Python
- replace missing stock description with Unknown  - SQL or Python
- fix data type - Python


In [1]:
# import the libraries we need

import psycopg2
import pandas as pd

import boto3
from io import StringIO

import warnings
warnings.filterwarnings("ignore")


## Extracting Data

In [2]:
# NEVER share or commit passwords
# credentials are read from environment variables; for local runs keep them in a git-ignored .env file rather than hardcoding them

import os

from dotenv import load_dotenv
load_dotenv()  # only for local testing


# pull each Redshift connection setting from the environment; a setting that
# is not defined there comes back as None
dbname, host, port, user, password = (
    os.getenv(setting_name)
    for setting_name in ("dbname", "host", "port", "user", "password")
)

In [3]:
# connect to redshift

def connect_to_redshift(dbname, host, port, user, password):
    """Open a psycopg2 connection and create a cursor on it.

    Each argument is forwarded to ``psycopg2.connect`` as the keyword
    argument of the same name.

    Args:
        dbname: value passed as ``dbname``.
        host: value passed as ``host``.
        port: value passed as ``port``.
        user: value passed as ``user``.
        password: value passed as ``password``.

    Returns:
        tuple: ``(connection, cursor)`` -- the connection object and a cursor
        created from it. Neither is closed here; that is left to the caller.
    """
    connection_settings = {
        "dbname": dbname,
        "host": host,
        "port": port,
        "user": user,
        "password": password,
    }

    connection = psycopg2.connect(**connection_settings)
    db_cursor = connection.cursor()

    # only reached when both the connection and the cursor were created
    print("connection to redshift made")

    return connection, db_cursor

In [4]:
connect, cursor = connect_to_redshift(dbname, host, port, user, password)

connection to redshift made


In [5]:
# check everything is working: read the first 10 rows of the
# bootcamp.stock_description table through the open connection

query = """select *
from bootcamp.stock_description
limit 10
"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [6]:
# extracts data from redshift into a DataFrame
# - left joins the description field onto the online transactions table, so
#   transactions without a matching description are kept with a null description
# - descriptions equal to '?' are dropped from the lookup before the join
# - removes rows where customer id is blank (a NULL customer id also fails
#   the <> '' comparison, so those rows are removed as well)
# - removes rows whose stock code is BANK CHARGES, POST, D, M or CRUK
# NOTE(review): if a stock_code appears more than once in stock_description the
# join repeats the transaction row -- confirm stock_code is unique in that table

query = """
select o.*,
       s.description
from bootcamp.online_transactions o
/*this is a sub query that removes question marks from the stock desc table*/
left join (select *
          from bootcamp.stock_description
          where description <> '?') s on o.stock_code = s.stock_code
where o.customer_id <> ''
    and o.stock_code not in ('BANK CHARGES', 'POST', 'D', 'M', 'CRUK')
"""

online_trans_w_desc = pd.read_sql(query, connect)
online_trans_w_desc.head()

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom,KNITTED UNION FLAG HOT WATER BOTTLE
1,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom,HAND WARMER UNION JACK
2,536368,22912,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,YELLOW COAT RACK PARIS FASHION
3,536367,22748,6,2010-12-01 08:34:00,2.1,u13047,United Kingdom,POPPY'S PLAYHOUSE KITCHEN
4,536367,22623,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,BOX OF VINTAGE JIGSAW BLOCKS


In [7]:
online_trans_w_desc[online_trans_w_desc.customer_id == '']

# we have no cases of missing customer ids

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description


In [8]:
type(online_trans_w_desc)

pandas.core.frame.DataFrame

In [9]:
online_trans_w_desc.shape

# dropped from 541k rows to 405k rows, and increased the number of columns by one

(405063, 8)

## Transforming Data

### Removing duplicated data

In [10]:
# task 1 - Remove duplicated data

online_trans_w_desc.duplicated().sum()

5222

In [11]:
online_trans_w_desc[online_trans_w_desc.duplicated()]

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
334,536464,21992,1,2010-12-01 12:23:00,2.95,u17968,United Kingdom,VINTAGE PAISLEY STATIONERY SET
379,536412,22327,1,2010-12-01 11:49:00,2.95,u1792,United Kingdom,ROUND SNACK BOXES SET OF 4 SKULLS
387,536412,22273,1,2010-12-01 11:49:00,2.95,u1792,United Kingdom,FELTCRAFT DOLL MOLLY
392,536412,21708,1,2010-12-01 11:49:00,4.95,u1792,United Kingdom,FOLDING UMBRELLA CREAM POLKADOT
394,536412,85184C,1,2010-12-01 11:49:00,2.95,u1792,United Kingdom,SET 4 VALENTINE DECOUPAGE HEART BOX
...,...,...,...,...,...,...,...,...
404885,581471,21411,2,2011-12-08 19:29:00,1.95,u14702,United Kingdom,GINGHAM HEART DOORSTOP RED
404989,581514,22075,24,2011-12-09 11:20:00,0.39,u17754,United Kingdom,6 RIBBONS ELEGANT CHRISTMAS
405000,581538,23275,1,2011-12-09 11:34:00,1.25,u14446,United Kingdom,SET OF 3 HANGING OWLS OLLIE BEAK
405005,581538,22068,1,2011-12-09 11:34:00,0.39,u14446,United Kingdom,BLACK PIRATE TREASURE CHEST


In [12]:
# code to only keep the first appearance of a duplicated row

online_trans_cleaned = online_trans_w_desc.drop_duplicates(keep = 'first')
online_trans_cleaned.head()

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom,KNITTED UNION FLAG HOT WATER BOTTLE
1,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom,HAND WARMER UNION JACK
2,536368,22912,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,YELLOW COAT RACK PARIS FASHION
3,536367,22748,6,2010-12-01 08:34:00,2.1,u13047,United Kingdom,POPPY'S PLAYHOUSE KITCHEN
4,536367,22623,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,BOX OF VINTAGE JIGSAW BLOCKS


In [13]:
# no duplicated rows of data anymore

online_trans_cleaned.duplicated().sum()

0

In [14]:
print(online_trans_w_desc.shape)
print(online_trans_cleaned.shape)

(405063, 8)
(399841, 8)


### Fixing the invoice date data type

In [15]:
online_trans_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399841 entries, 0 to 405062
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice       399841 non-null  object 
 1   stock_code    399841 non-null  object 
 2   quantity      399841 non-null  int64  
 3   invoice_date  399841 non-null  object 
 4   price         399841 non-null  float64
 5   customer_id   399841 non-null  object 
 6   country       399841 non-null  object 
 7   description   398669 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 27.5+ MB


In [17]:
# transformation task # 2 - fix the invoice date data type
# The parsed column is attached with assign(), which returns a new frame,
# instead of attribute-style assignment (df.invoice_date = ...). Setting a
# column on the frame returned by drop_duplicates() may raise
# SettingWithCopyWarning, and attribute assignment does not create a column
# when the name is mistyped. pd.to_datetime leaves an already-converted column
# unchanged, so this cell can be re-run safely.

online_trans_cleaned = online_trans_cleaned.assign(
    invoice_date=pd.to_datetime(online_trans_cleaned["invoice_date"])
)
online_trans_cleaned["invoice_date"]

0        2010-12-01 08:26:00
1        2010-12-01 08:28:00
2        2010-12-01 08:34:00
3        2010-12-01 08:34:00
4        2010-12-01 08:34:00
                 ...        
405058   2011-12-09 12:49:00
405059   2011-12-09 12:50:00
405060   2011-12-09 12:50:00
405061   2011-12-09 12:50:00
405062   2011-12-09 12:50:00
Name: invoice_date, Length: 399841, dtype: datetime64[ns]

In [18]:
online_trans_cleaned.info()

# invoice_date now has the datetime64[ns] data type

<class 'pandas.core.frame.DataFrame'>
Index: 399841 entries, 0 to 405062
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice       399841 non-null  object        
 1   stock_code    399841 non-null  object        
 2   quantity      399841 non-null  int64         
 3   invoice_date  399841 non-null  datetime64[ns]
 4   price         399841 non-null  float64       
 5   customer_id   399841 non-null  object        
 6   country       399841 non-null  object        
 7   description   398669 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 27.5+ MB


In [19]:
online_trans_cleaned.head()

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom,KNITTED UNION FLAG HOT WATER BOTTLE
1,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom,HAND WARMER UNION JACK
2,536368,22912,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,YELLOW COAT RACK PARIS FASHION
3,536367,22748,6,2010-12-01 08:34:00,2.1,u13047,United Kingdom,POPPY'S PLAYHOUSE KITCHEN
4,536367,22623,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,BOX OF VINTAGE JIGSAW BLOCKS


### Replacing missing description values with Unknown 

In [20]:
online_trans_cleaned.isna().sum()

invoice            0
stock_code         0
quantity           0
invoice_date       0
price              0
customer_id        0
country            0
description     1172
dtype: int64

In [21]:
# replace missing descriptions with Unknown
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on the column: that form is chained assignment, it
# only updates the Series returned by the column access, and with pandas
# Copy-on-Write (the default from pandas 3.0) the DataFrame is left unchanged.

online_trans_cleaned = online_trans_cleaned.assign(
    description=online_trans_cleaned["description"].fillna("Unknown")
)

In [22]:
# check there are no missing descriptions

online_trans_cleaned.isnull().sum()

invoice         0
stock_code      0
quantity        0
invoice_date    0
price           0
customer_id     0
country         0
description     0
dtype: int64

In [29]:
online_trans_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399841 entries, 0 to 405062
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice       399841 non-null  object        
 1   stock_code    399841 non-null  object        
 2   quantity      399841 non-null  int64         
 3   invoice_date  399841 non-null  datetime64[ns]
 4   price         399841 non-null  float64       
 5   customer_id   399841 non-null  object        
 6   country       399841 non-null  object        
 7   description   399841 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 27.5+ MB


In [30]:
online_trans_cleaned.shape

(399841, 8)

## Loading Data to local data folder 

In [28]:
# make sure all transformations are in place
# anything else?!

# to_pickle does not create missing parent folders, so make sure the target
# folder exists before writing
os.makedirs("../data", exist_ok=True)
online_trans_cleaned.to_pickle("../data/online_transactions_cleaned.pkl")

# the data has been extracted and saved, so release the Redshift cursor and
# connection opened earlier in the notebook instead of leaving them open
cursor.close()
connect.close()