We are going to extract data from Redshift and carry out the following transformation tasks:
- remove all rows where customer id is missing - SQL 
- remove certain stock codes - SQL 
- add description to the online transactions table - SQL 
- remove all duplicate data - Python
- replace missing stock description with Unknown  - SQL 
- fix data type - SQL


In [1]:
# import the libraries we need

import psycopg2
import pandas as pd

#import boto3
#from io import StringIO

import warnings
warnings.filterwarnings("ignore")


## Extracting Data

In [4]:
# NEVER share passwords
# please don't copy this code
import os

from dotenv import load_dotenv
load_dotenv()  # only for local testing

dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")

In [5]:
# connect to redshift

def connect_to_redshift(dbname, host, port, user, password):
    """Open a psycopg2 connection to the Redshift database.

    Args:
        dbname: Name of the database to connect to.
        host: Host to connect to.
        port: Port to connect to.
        user: Database user name.
        password: Password for ``user``.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Note:
        The original author recorded that this approach gives a warning and
        that another solution is wanted (presumably the pandas warning about
        raw DBAPI connections passed to ``read_sql`` - TODO confirm).
    """
    credentials = {
        "dbname": dbname,
        "host": host,
        "port": port,
        "user": user,
        "password": password,
    }
    connection = psycopg2.connect(**credentials)

    # Only reached when psycopg2.connect returned without raising.
    print("Wahoo! connection to redshift made")
    return connection

In [6]:
connect = connect_to_redshift(dbname, host, port, user, password)

Wahoo! connection to redshift made


In [7]:
# check everything is working

query = """select *
from bootcamp.stock_description
limit 10
"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [10]:
query = """
select o.*,
       s.description
from bootcamp.online_transactions o
left join (select *
           from bootcamp.stock_description
           where description <> '?') s on o.stock_code = s.stock_code
"""

online_trans_w_desc = pd.read_sql(query, connect)
online_trans_w_desc.shape

(541910, 8)

In [9]:
query = """
select count(*) as number_rows
from bootcamp.online_transactions o
"""

pd.read_sql(query, connect)


Unnamed: 0,number_rows
0,541910


In [88]:
# extracts data from redshift
# joins the description field to the online transactions table
# removes invoices where customer id is blank
# removes invoices where stock code is in bank charges, post etc...
# converts the invoice date to a datetime field
# creates a variable called total order value
# replaces missing descriptions with unknown

query = """
SELECT ot.invoice, 
       ot.stock_code,
       CASE WHEN s.description IS NULL THEN 'Unknown'
            ELSE s.description END AS description,
       ot.price,
       ot.quantity,
       /*add a variable that gives the total order value*/
       ot.price*ot.quantity as total_order_value,
       CAST(invoice_date As DateTime) AS invoice_date,
       ot.customer_id,
       ot.country
FROM bootcamp.online_transactions ot
/* this is a subquery that removes '?' from the stock_description table */
LEFT JOIN (SELECT *
           FROM bootcamp.stock_description
           WHERE description <> '?') AS s ON ot.stock_code = s.stock_code
WHERE ot.customer_id <> ''
  AND ot.stock_code NOT IN ('BANK CHARGES', 'POST', 'D', 'M', 'CRUK')
"""

online_trans_w_desc = pd.read_sql(query, connect)
online_trans_w_desc.shape

(405063, 9)

In [89]:
online_trans_w_desc.head()

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,536365,85123A,CREAM HANGING HEART T-LIGHT HOLDER,2.55,6,15.3,2010-12-01 08:26:00,u1785,United Kingdom
1,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,6,20.34,2010-12-01 08:26:00,u1785,United Kingdom
2,536366,22632,HAND WARMER RED POLKA DOT,1.85,6,11.1,2010-12-01 08:28:00,u1785,United Kingdom
3,536368,22914,BLUE COAT RACK PARIS FASHION,4.95,3,14.85,2010-12-01 08:34:00,u13047,United Kingdom
4,536367,22749,FELTCRAFT PRINCESS CHARLOTTE DOLL,3.75,8,30.0,2010-12-01 08:34:00,u13047,United Kingdom


In [90]:
online_trans_w_desc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405063 entries, 0 to 405062
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   invoice            405063 non-null  object        
 1   stock_code         405063 non-null  object        
 2   description        405063 non-null  object        
 3   price              405063 non-null  float64       
 4   quantity           405063 non-null  int64         
 5   total_order_value  405063 non-null  float64       
 6   invoice_date       405063 non-null  datetime64[ns]
 7   customer_id        405063 non-null  object        
 8   country            405063 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 27.8+ MB


In [91]:
online_trans_w_desc[online_trans_w_desc.customer_id == '']

# we have no cases of missing customer ids

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country


In [92]:
online_trans_w_desc[online_trans_w_desc.description == "Unknown"]

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
515,536557,22686,Unknown,1.25,1,1.25,2010-12-01 14:41:00,u17841,United Kingdom
910,536408,21705,Unknown,1.65,12,19.80,2010-12-01 11:41:00,u14307,United Kingdom
1011,536595,21705,Unknown,1.65,5,8.25,2010-12-01 17:24:00,u13576,United Kingdom
1804,536597,21704,Unknown,0.85,1,0.85,2010-12-01 17:35:00,u18011,United Kingdom
2337,536500,46000M,Unknown,1.55,10,15.50,2010-12-01 12:35:00,u17377,United Kingdom
...,...,...,...,...,...,...,...,...,...
403956,581133,21705,Unknown,0.39,24,9.36,2011-12-07 12:55:00,u14904,United Kingdom
404283,581221,23444,Unknown,15.00,1,15.00,2011-12-08 09:40:00,u17856,United Kingdom
404361,581266,21703,Unknown,0.42,12,5.04,2011-12-08 11:25:00,u12621,Germany
404878,581469,21704,Unknown,0.85,1,0.85,2011-12-08 19:28:00,u14606,United Kingdom


In [93]:
type(online_trans_w_desc)

pandas.core.frame.DataFrame

In [94]:
online_trans_w_desc.shape

# dropped from 541k rows to 405k rows, and increased the number of columns by one

(405063, 9)

## Transforming Data

### Removing duplicated data

In [95]:
# task 1 - Remove duplicated data

online_trans_w_desc.duplicated().sum()

5222

In [96]:
online_trans_w_desc[online_trans_w_desc.duplicated()]

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
279,536464,21992,VINTAGE PAISLEY STATIONERY SET,2.95,1,2.95,2010-12-01 12:23:00,u17968,United Kingdom
359,536528,84985A,SET OF 72 GREEN PAPER DOILIES,1.45,1,1.45,2010-12-01 13:17:00,u15525,United Kingdom
716,536409,22866,HAND WARMER SCOTTY DOG DESIGN,2.10,1,2.10,2010-12-01 11:45:00,u17908,United Kingdom
794,536409,22111,SCOTTIE DOG HOT WATER BOTTLE,4.95,1,4.95,2010-12-01 11:45:00,u17908,United Kingdom
798,536412,22327,ROUND SNACK BOXES SET OF 4 SKULLS,2.95,1,2.95,2010-12-01 11:49:00,u1792,United Kingdom
...,...,...,...,...,...,...,...,...,...
404885,581471,21411,GINGHAM HEART DOORSTOP RED,1.95,2,3.90,2011-12-08 19:29:00,u14702,United Kingdom
404989,581514,22075,6 RIBBONS ELEGANT CHRISTMAS,0.39,24,9.36,2011-12-09 11:20:00,u17754,United Kingdom
405000,581538,23275,SET OF 3 HANGING OWLS OLLIE BEAK,1.25,1,1.25,2011-12-09 11:34:00,u14446,United Kingdom
405005,581538,22068,BLACK PIRATE TREASURE CHEST,0.39,1,0.39,2011-12-09 11:34:00,u14446,United Kingdom


In [102]:
online_trans_w_desc[(online_trans_w_desc.stock_code == "84985A") & 
                    (online_trans_w_desc.invoice == "536528")]

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
348,536528,84985A,SET OF 72 GREEN PAPER DOILIES,1.45,1,1.45,2010-12-01 13:17:00,u15525,United Kingdom
359,536528,84985A,SET OF 72 GREEN PAPER DOILIES,1.45,1,1.45,2010-12-01 13:17:00,u15525,United Kingdom


In [98]:
# code to only keep the first appearance of a duplicated row

online_trans_cleaned = online_trans_w_desc.drop_duplicates(keep = 'first')
online_trans_cleaned.head()

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,536365,85123A,CREAM HANGING HEART T-LIGHT HOLDER,2.55,6,15.3,2010-12-01 08:26:00,u1785,United Kingdom
1,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,6,20.34,2010-12-01 08:26:00,u1785,United Kingdom
2,536366,22632,HAND WARMER RED POLKA DOT,1.85,6,11.1,2010-12-01 08:28:00,u1785,United Kingdom
3,536368,22914,BLUE COAT RACK PARIS FASHION,4.95,3,14.85,2010-12-01 08:34:00,u13047,United Kingdom
4,536367,22749,FELTCRAFT PRINCESS CHARLOTTE DOLL,3.75,8,30.0,2010-12-01 08:34:00,u13047,United Kingdom


In [103]:
online_trans_cleaned[(online_trans_cleaned.stock_code == "84985A")&
                     (online_trans_cleaned.invoice == "536528")]

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
348,536528,84985A,SET OF 72 GREEN PAPER DOILIES,1.45,1,1.45,2010-12-01 13:17:00,u15525,United Kingdom


In [104]:
# no duplicated rows of data anymore

online_trans_cleaned.duplicated().sum()

0

In [105]:
print(online_trans_w_desc.shape)
print(online_trans_cleaned.shape)

(405063, 9)
(399841, 9)


In [74]:
online_trans_w_desc.shape[0] - online_trans_cleaned.shape[0]

5222

In [125]:
def identify_and_remove_duplicated_data(df):
    """Report and remove fully duplicated rows from a data frame.

    A row counts as a duplicate when every column matches an earlier row;
    the first appearance is kept and later repeats are dropped.

    Args:
        df: pandas DataFrame to check. It is not modified.

    Returns:
        A new DataFrame without the repeated rows (the original index labels
        are kept, so the index may contain gaps), or ``df`` itself when no
        duplicates were found.
    """
    # Scan for duplicates once and reuse the mask for both the count and the
    # filtering, instead of re-running the scan for every step.
    duplicate_mask = df.duplicated(keep="first")
    n_duplicates = int(duplicate_mask.sum())

    if n_duplicates == 0:
        print("No duplicated data found")
        return df

    print("# of duplicated rows:", n_duplicates)

    # Keep only the first appearance of each row (same result as
    # drop_duplicates(keep="first")).
    df_cleaned = df[~duplicate_mask]

    print("-" * 60)
    print("shape of data before removing duplicated data", df.shape)
    print("shape of data after removing duplicated data", df_cleaned.shape)
    print("-" * 60)

    return df_cleaned

In [126]:
online_trans_cleaned = identify_and_remove_duplicated_data(online_trans_w_desc)

# of duplicated rows: 5222
------------------------------------------------------------
shape of data before removing duplicated data (405063, 9)
shape of data after removing duplicated data (399841, 9)
------------------------------------------------------------


In [127]:
online_trans_cleaned.head()

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,536365,85123A,CREAM HANGING HEART T-LIGHT HOLDER,2.55,6,15.3,2010-12-01 08:26:00,u1785,United Kingdom
1,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,6,20.34,2010-12-01 08:26:00,u1785,United Kingdom
2,536366,22632,HAND WARMER RED POLKA DOT,1.85,6,11.1,2010-12-01 08:28:00,u1785,United Kingdom
3,536368,22914,BLUE COAT RACK PARIS FASHION,4.95,3,14.85,2010-12-01 08:34:00,u13047,United Kingdom
4,536367,22749,FELTCRAFT PRINCESS CHARLOTTE DOLL,3.75,8,30.0,2010-12-01 08:34:00,u13047,United Kingdom


In [129]:
online_trans_cleaned = identify_and_remove_duplicated_data(online_trans_cleaned)

No duplicated data found


## Loading Data to local data folder 

In [135]:
# make sure all transformations are in place
# anything else?!

online_trans_cleaned.to_pickle("../data/online_transactions_cleaned.pkl")

In [136]:
test = pd.read_pickle("../data/online_transactions_cleaned.pkl")
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399841 entries, 0 to 405062
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   invoice            399841 non-null  object        
 1   stock_code         399841 non-null  object        
 2   description        399841 non-null  object        
 3   price              399841 non-null  float64       
 4   quantity           399841 non-null  int64         
 5   total_order_value  399841 non-null  float64       
 6   invoice_date       399841 non-null  datetime64[ns]
 7   customer_id        399841 non-null  object        
 8   country            399841 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 30.5+ MB


In [140]:
test.head()

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,536365,85123A,CREAM HANGING HEART T-LIGHT HOLDER,2.55,6,15.3,2010-12-01 08:26:00,u1785,United Kingdom
1,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,6,20.34,2010-12-01 08:26:00,u1785,United Kingdom
2,536366,22632,HAND WARMER RED POLKA DOT,1.85,6,11.1,2010-12-01 08:28:00,u1785,United Kingdom
3,536368,22914,BLUE COAT RACK PARIS FASHION,4.95,3,14.85,2010-12-01 08:34:00,u13047,United Kingdom
4,536367,22749,FELTCRAFT PRINCESS CHARLOTTE DOLL,3.75,8,30.0,2010-12-01 08:34:00,u13047,United Kingdom


In [137]:
test.shape

(399841, 9)

In [138]:
# DataFrame.to_csv returns None when it is given a file path, so `demo` is
# always None here; the useful effect of this line is the file it writes.
demo = online_trans_cleaned.to_csv("../data/online_transactions_cleaned.csv", index = False)
# Reading the CSV back without parse_dates yields invoice_date as a plain object
# column instead of datetime64[ns] (the pickle round trip keeps the datetime dtype).
test = pd.read_csv("../data/online_transactions_cleaned.csv")

In [139]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399841 entries, 0 to 399840
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   invoice            399841 non-null  object 
 1   stock_code         399841 non-null  object 
 2   description        399841 non-null  object 
 3   price              399841 non-null  float64
 4   quantity           399841 non-null  int64  
 5   total_order_value  399841 non-null  float64
 6   invoice_date       399841 non-null  object 
 7   customer_id        399841 non-null  object 
 8   country            399841 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 27.5+ MB


## Loading Data to s3 - we'll pick up on Friday

In [143]:
# load the online_trans_cleaned data frame to s3

# connect to s3

# Read the AWS credentials from the environment (load_dotenv() was called
# earlier for local runs) so that no secret is written into the notebook.
# os.getenv returns None for a variable that is not set.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_S3_BUCKET = "waia-data-dump"


<botocore.client.S3 at 0x185b11390>

In [147]:
import boto3
from io import StringIO

def connect_to_s3(aws_access_key_id, aws_secret_access_key):
    """Create a boto3 S3 client from an explicit access key pair.

    Args:
        aws_access_key_id: Access key id handed to ``boto3.client``.
        aws_secret_access_key: Secret access key handed to ``boto3.client``.

    Returns:
        The client object returned by ``boto3.client("s3", ...)``.
    """
    credentials = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
    }
    client = boto3.client("s3", **credentials)

    # Printed as soon as the client object has been built.
    print("Connection to s3 made")
    return client


def df_to_s3(df, key, s3_bucket, aws_access_key_id, aws_secret_access_key):
    """Upload a data frame to an S3 bucket as a CSV object.

    Args:
        df: Data frame to serialise; written without its index.
        key: Object key to write inside the bucket.
        s3_bucket: Name of the destination bucket.
        aws_access_key_id: Access key id passed on to ``connect_to_s3``.
        aws_secret_access_key: Secret access key passed on to ``connect_to_s3``.

    Returns:
        None. The CSV text is sent with ``put_object`` and the destination
        is printed.
    """
    # Serialise into an in-memory text buffer so nothing touches local disk.
    in_memory_csv = StringIO()
    df.to_csv(in_memory_csv, index=False)

    # The client is created only after serialisation has succeeded.
    client = connect_to_s3(aws_access_key_id, aws_secret_access_key)

    csv_text = in_memory_csv.getvalue()
    client.put_object(Bucket=s3_bucket, Key=key, Body=csv_text)

    print(f"The transformed data is saved as CSV in the following location s3://{s3_bucket}/{key}")

In [148]:
connect_to_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

Connection to s3 made


<botocore.client.S3 at 0x190cbc6d0>

In [149]:
key = "bootcamp2/transformations/sh_online_trans_cleaned.csv"

df_to_s3(online_trans_cleaned, key, AWS_S3_BUCKET, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

Connection to s3 made
The transformed data is saved as CSV in the following location s3://waia-data-dump/bootcamp2/transformations/sh_online_trans_cleaned.csv
