- (Y) remove all rows where customer id is missing - **SQL** or Python
- (Y) remove certain stock codes: BANK CHARGES, POST, D, M, CRUK - **SQL** or Python
- (Y) add description to the online transactions table - **SQL** or Python
- remove all duplicate data - Python
- (Y) replace missing stock description with Unknown - **SQL** or Python
- (Y) fix data type for the invoice date column - Python or **SQL**

In [78]:
# import all the libraries we need

import psycopg2
import pandas as pd

import boto3
from io import StringIO, BytesIO

# add if you want to remove warning messages
import warnings
warnings.filterwarnings("ignore")

In [84]:
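# optional: these libraries only load the credentials from environment variables / a .env file -
# you can assign the variables below directly instead (do not commit real credentials)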

import os

from dotenv import load_dotenv
load_dotenv()  # only for local testing


# read the connection settings from environment variables
# (a setting is None when its variable is not defined)
dbname, host, port, user, password = (
    os.getenv(setting)
    for setting in ("dbname", "host", "port", "user", "password")
)

In [85]:
# connect to redshift
# reference for the connect function: https://www.psycopg.org/docs/module.html

def connect_to_redshift(dbname, host, port, user, password):
    """Open a database connection with psycopg2 and return it.

    All five arguments are forwarded unchanged to ``psycopg2.connect`` as
    keyword arguments of the same name. A confirmation message is printed
    once ``psycopg2.connect`` has returned.
    """
    settings = {
        "dbname": dbname,
        "host": host,
        "port": port,
        "user": user,
        "password": password,
    }
    connection = psycopg2.connect(**settings)

    print("Connection was successful")

    return connection
    

In [86]:
# open the connection used by the pd.read_sql calls
connect = connect_to_redshift(
    dbname=dbname,
    host=host,
    port=port,
    user=user,
    password=password,
)

Connection was successful


In [87]:
# check everything is working: read 10 rows from the stock description table
# through the open connection and display them

query = """
select *
from bootcamp.stock_description
limit 10
"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [88]:
# sql query that carries out the first cleaning steps:
#   - adds the stock description with a left join
#     (descriptions equal to '?' are left out of the lookup, so those rows get no description)
#   - keeps only rows where customer_id is not an empty string
#   - removes the stock codes D, M, CRUK, POST and BANK CHARGES

query = """
select ot.*,
       sd.description
from bootcamp.online_transactions ot
left join (select *
           from bootcamp.stock_description
           where description <> '?') sd on ot.stock_code = sd.stock_code
where ot.customer_id <> ''
    and ot.stock_code not in ('D', 'M', 'CRUK', 'POST', 'BANK CHARGES')
"""

online_trans = pd.read_sql(query, connect)

In [89]:
# (rows, columns) of the query result
online_trans.shape

(405063, 8)

In [90]:
# rows that have no description
online_trans[online_trans["description"].isnull()]

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
147,536557,22686,1,2010-12-01 14:41:00,1.25,u17841,United Kingdom,
350,536597,21703,4,2010-12-01 17:35:00,0.42,u18011,United Kingdom,
400,536736,21703,192,2010-12-02 12:59:00,0.42,u17381,United Kingdom,
452,C536820,22878,-1,2010-12-02 17:14:00,2.10,u18168,United Kingdom,
649,537023,22686,1,2010-12-03 16:02:00,1.25,u16725,United Kingdom,
...,...,...,...,...,...,...,...,...
404663,581123,21705,24,2011-12-07 12:34:00,0.39,u1575,United Kingdom,
404678,581133,21705,24,2011-12-07 12:55:00,0.39,u14904,United Kingdom,
404787,581514,21705,84,2011-12-09 11:20:00,0.39,u17754,United Kingdom,
405011,581469,21704,1,2011-12-08 19:28:00,0.85,u14606,United Kingdom,


In [91]:
# 20 most frequent descriptions, counting missing values as their own group
online_trans["description"].value_counts(dropna=False).nlargest(20)

description
CREAM HANGING HEART T-LIGHT HOLDER    2077
REGENCY CAKESTAND 3 TIER              1905
JUMBO BAG RED RETROSPOT               1662
ASSORTED COLOUR BIRD ORNAMENT         1418
PARTY BUNTING                         1416
LUNCH BAG RED RETROSPOT               1359
SET OF 3 CAKE TINS PANTRY DESIGN      1232
None                                  1175
LUNCH BAG  BLACK SKULL.               1126
POPCORN HOLDER                        1118
JUMBO BAG VINTAGE DOILEY              1115
LUNCH BAG SUKI DESIGN                 1103
PACK OF 72 RETROSPOT CAKE CASES       1080
LUNCH BAG VINTAGE DOILEY              1040
BUNTING , SPOTTY                      1036
PAPER CHAIN KIT 50'S CHRISTMAS        1029
LUNCH BAG SPACEBOY DESIGN             1021
LUNCH BAG CARS BLUE                   1012
NATURAL SLATE HEART CHALKBOARD         997
HEART OF WICKER SMALL                  996
Name: count, dtype: int64

In [92]:
# column dtypes and non-null counts of the query result
online_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405063 entries, 0 to 405062
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice       405063 non-null  object 
 1   stock_code    405063 non-null  object 
 2   quantity      405063 non-null  int64  
 3   invoice_date  405063 non-null  object 
 4   price         405063 non-null  float64
 5   customer_id   405063 non-null  object 
 6   country       405063 non-null  object 
 7   description   403888 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 24.7+ MB


In [109]:
# final query: lists the columns explicitly and, on top of the join and filters,
#   - replaces a missing description with 'UNKNOWN'
#   - casts invoice_date to datetime in sql
# the result overwrites the online_trans data frame
query = """
select ot.invoice,
       ot.stock_code,
       case when sd.description is null then 'UNKNOWN' else sd.description end as description,
       ot.quantity,
       cast(ot.invoice_date as datetime) as invoice_date,
       ot.price,
       ot.customer_id,
       ot.country
from bootcamp.online_transactions ot
left join (select *
           from bootcamp.stock_description
           where description <> '?') sd on ot.stock_code = sd.stock_code
where ot.customer_id <> ''
    and ot.stock_code not in ('D', 'M', 'CRUK', 'POST', 'BANK CHARGES')
"""

online_trans = pd.read_sql(query, connect)

In [110]:
# preview the first rows of the result
online_trans.head()

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,price,customer_id,country
0,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
1,536370,22900,SET 2 TEA TOWELS I LOVE LONDON,24,2010-12-01 08:45:00,2.95,u12583,France
2,536373,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 09:02:00,4.25,u1785,United Kingdom
3,536375,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 09:32:00,2.75,u1785,United Kingdom
4,536378,21212,PACK OF 72 RETROSPOT CAKE CASES,120,2010-12-01 09:37:00,0.42,u14688,United Kingdom


In [111]:
# (rows, columns) of the result
online_trans.shape

(405063, 8)

In [112]:
# column dtypes and non-null counts - used to check the invoice_date dtype
online_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405063 entries, 0 to 405062
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice       405063 non-null  object        
 1   stock_code    405063 non-null  object        
 2   description   405063 non-null  object        
 3   quantity      405063 non-null  int64         
 4   invoice_date  405063 non-null  datetime64[ns]
 5   price         405063 non-null  float64       
 6   customer_id   405063 non-null  object        
 7   country       405063 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 24.7+ MB


In [113]:
# rows that still have no description
online_trans[online_trans["description"].isnull()]

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,price,customer_id,country


In [114]:
# 20 most frequent descriptions
online_trans["description"].value_counts().nlargest(20)

description
CREAM HANGING HEART T-LIGHT HOLDER    2077
REGENCY CAKESTAND 3 TIER              1905
JUMBO BAG RED RETROSPOT               1662
ASSORTED COLOUR BIRD ORNAMENT         1418
PARTY BUNTING                         1416
LUNCH BAG RED RETROSPOT               1359
SET OF 3 CAKE TINS PANTRY DESIGN      1232
UNKNOWN                               1175
LUNCH BAG  BLACK SKULL.               1126
POPCORN HOLDER                        1118
JUMBO BAG VINTAGE DOILEY              1115
LUNCH BAG SUKI DESIGN                 1103
PACK OF 72 RETROSPOT CAKE CASES       1080
LUNCH BAG VINTAGE DOILEY              1040
BUNTING , SPOTTY                      1036
PAPER CHAIN KIT 50'S CHRISTMAS        1029
LUNCH BAG SPACEBOY DESIGN             1021
LUNCH BAG CARS BLUE                   1012
NATURAL SLATE HEART CHALKBOARD         997
HEART OF WICKER SMALL                  996
Name: count, dtype: int64

In [116]:
# drop rows that are exact duplicates, keeping the first occurrence of each

online_trans = online_trans.drop_duplicates(keep="first")

In [117]:
# (rows, columns) after dropping duplicates
online_trans.shape

(399841, 8)

In [74]:
# load this data frame to s3

def connect_to_s3(aws_access_key_id, aws_secret_access_key):
    """Create a boto3 s3 client from the given access key pair and return it.

    A confirmation message is printed once the client has been created.
    """
    credentials = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
    }
    client = boto3.client("s3", **credentials)

    print("Connection to s3 made")

    return client

In [75]:
def df_to_s3(df, key, s3_bucket, aws_access_key_id, aws_secret_access_key):
    """Write a data frame to an s3 bucket as a .pkl or .csv file.

    The format is chosen from the last four characters of ``key``:
    ``.pkl`` pickles the data frame, ``.csv`` writes it as csv without the
    index column.

    Args:
        df: data frame to upload.
        key: object key inside the bucket; must end in ``.pkl`` or ``.csv``.
        s3_bucket: name of the destination bucket.
        aws_access_key_id: access key id passed on to ``connect_to_s3``.
        aws_secret_access_key: secret access key passed on to ``connect_to_s3``.

    Raises:
        ValueError: if ``key`` does not end in ``.pkl`` or ``.csv``.
    """

    file_type = key[-4:]

    # serialise into an in-memory buffer so nothing is written to local disk
    if file_type == '.pkl':
        buffer = BytesIO()
        df.to_pickle(buffer)  # write the data frame as .pkl file
    elif file_type == '.csv':
        buffer = StringIO()
        df.to_csv(buffer, index=False)  # write the data frame as .csv file
    else:
        # fail before opening a connection; otherwise `buffer` would be
        # undefined when the upload is attempted
        raise ValueError(
            f"Unsupported file type {file_type!r} for key {key!r}; "
            "expected a key ending in '.pkl' or '.csv'"
        )

    s3_client = connect_to_s3(aws_access_key_id, aws_secret_access_key)

    # upload the buffered content to the bucket under the given key
    s3_client.put_object(
        Bucket=s3_bucket, Key=key, Body=buffer.getvalue()
    )

    print(f"The transformed data is saved as {file_type} in the following location s3://{s3_bucket}/{key}")

In [79]:
# destination of the upload
s3_bucket = "sep-bootcamp"
key = "transformations_thurs/sh_online_transactions_fixed.pkl"

# aws credentials are read from environment variables
aws_access_key_id = os.getenv("aws_access_key_id")
# NOTE(review): this variable name ends in "_id" - confirm it matches the entry in the environment / .env file
aws_secret_access_key = os.getenv("aws_secret_access_key_id")


df_to_s3(
    df=online_trans,
    key=key,
    s3_bucket=s3_bucket,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

Connection to s3 made
The transformed data is saved as .pkl in the following location s3://sep-bootcamp/transformations_thurs/sh_online_transactions_fixed.pkl
