We are going to extract data from Redshift and carry out the following transformation tasks:
- (Y) remove all rows where customer id is missing - **SQL** or Python
- (Y) remove certain stock codes - **SQL** or Python
- (Y) add description to the online transactions table - **SQL** or Python
- remove all duplicate data - **Python**
- (Y) replace missing stock descriptions with `Unknown` - **SQL** or Python
- fix data type - Python or **SQL**


In [13]:
import pandas as pd
import psycopg2

# add if you want to remove warning messages
import warnings
warnings.filterwarnings("ignore")


In [14]:
# Load the Redshift connection settings from environment variables.
# load_dotenv() runs first so that the os.getenv() lookups below can also pick
# up values defined in a local .env file.
# Keep credentials out of the notebook: do NOT hardcode passwords here.

import os

from dotenv import load_dotenv
load_dotenv()  

# os.getenv returns None for any variable that is not set
dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")


In [15]:
# connecting to redshift with the definition

def connect_to_redshift(dbname, host, port, user, password):
    """Open a psycopg2 connection to Redshift and return it.

    All five arguments are forwarded unchanged to ``psycopg2.connect`` as
    keyword arguments. A confirmation line is printed once the connection
    object has been created.

    NOTE(review): the original author reported that this approach produces a
    warning and planned to look for an alternative - confirm whether that is
    still relevant.
    """
    settings = {
        "dbname": dbname,
        "host": host,
        "port": port,
        "user": user,
        "password": password,
    }
    redshift_connection = psycopg2.connect(**settings)

    print("connection to redshift made")
    return redshift_connection

In [16]:
connect = connect_to_redshift(dbname, host, port, user, password)

connection to redshift made


In [17]:
# Smoke test: fetch ten rows from the stock description table to confirm that
# the connection works before running the main extraction query.

query = """select *
           from bootcamp.stock_description
           limit 10"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [44]:
# Extract the transactions with most of the cleaning done in SQL:
#   - rows with an empty customer_id are dropped (NULLs also fail the <> '' test)
#   - the listed stock codes are excluded
#   - the stock description is joined on; '?' placeholder descriptions are
#     ignored and anything left without a match falls back to 'Unknown'
#   - invoice_date is cast to a datetime
query = """
        select ot.customer_id,
               ot.invoice,
               ot.stock_code,
               /*replace missing values in description with 'Unknown'*/
               coalesce(sd.description, 'Unknown') as description,
               ot.price,
               ot.quantity,
               /*fix the data type of invoice_date*/
               cast(ot.invoice_date as datetime) as invoice_date,
               ot.country
        from bootcamp.online_transactions ot
        left join (select *
                   from bootcamp.stock_description
                   where description <> '?') sd on ot.stock_code = sd.stock_code
        where ot.customer_id <> ''
            and ot.stock_code not in ('POST', 'D', 'M', 'CRUK', 'BANK CHARGES')
"""

online_trans_w_desc = pd.read_sql(query, connect)
online_trans_w_desc

Unnamed: 0,customer_id,invoice,stock_code,description,price,quantity,invoice_date,country
0,u1785,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,6,2010-12-01 08:26:00,United Kingdom
1,u12583,536370,22900,SET 2 TEA TOWELS I LOVE LONDON,2.95,24,2010-12-01 08:45:00,France
2,u1785,536373,21730,GLASS STAR FROSTED T-LIGHT HOLDER,4.25,6,2010-12-01 09:02:00,United Kingdom
3,u1785,536375,84406B,CREAM CUPID HEARTS COAT HANGER,2.75,8,2010-12-01 09:32:00,United Kingdom
4,u14688,536378,21212,PACK OF 72 RETROSPOT CAKE CASES,0.42,120,2010-12-01 09:37:00,United Kingdom
...,...,...,...,...,...,...,...,...
405058,u12748,581580,22906,12 MESSAGE CARDS WITH ENVELOPES,1.65,1,2011-12-09 12:20:00,United Kingdom
405059,u12748,581580,23338,EGG FRYING PAN RED,2.08,1,2011-12-09 12:20:00,United Kingdom
405060,u12748,581580,22721,SET OF 3 CAKE TINS SKETCHBOOK,1.95,1,2011-12-09 12:20:00,United Kingdom
405061,u15804,581585,23356,LOVE HOT WATER BOTTLE,5.95,3,2011-12-09 12:31:00,United Kingdom


In [33]:
# check if we still have missing values

online_trans_w_desc.isnull().sum()

invoice         0
stock_code      0
quantity        0
invoice_date    0
price           0
customer_id     0
country         0
description     0
dtype: int64

In [37]:
online_trans_w_desc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405063 entries, 0 to 405062
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice       405063 non-null  object        
 1   stock_code    405063 non-null  object        
 2   quantity      405063 non-null  int64         
 3   invoice_date  405063 non-null  datetime64[ns]
 4   price         405063 non-null  float64       
 5   customer_id   405063 non-null  object        
 6   country       405063 non-null  object        
 7   description   405063 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 24.7+ MB


In [45]:
online_trans_w_desc.shape

(405063, 8)

In [47]:
## drop duplicates - using python
# Inline version (no helper function): rows are compared across all columns
# and only the first occurrence of each duplicated row is kept.

online_trans_final = online_trans_w_desc.drop_duplicates(keep='first')


In [57]:
# please do at the end 

def identify_and_drop_duplicates(df):
    """Report how many fully duplicated rows ``df`` has and return it without them.

    Rows are compared across all columns and the first occurrence of each
    duplicated row is kept. A one-line summary is printed in either case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the non-duplicated rows with their original index
        labels. A new object is returned even when nothing was dropped, so the
        result never aliases the input.
    """
    # Scan for duplicates once and reuse the mask for both the count and the
    # filter, instead of re-running the scan for each of them.
    duplicate_mask = df.duplicated(keep="first")
    duplicate_count = int(duplicate_mask.sum())

    if duplicate_count > 0:
        print("# of duplicated row:", duplicate_count)
    else:
        print("No duplicated found")

    return df[~duplicate_mask]
    

In [58]:
# using the definition

online_trans_final = identify_and_drop_duplicates(online_trans_w_desc)

# of duplicated row: 5222


Unnamed: 0,customer_id,invoice,stock_code,description,price,quantity,invoice_date,country
0,u1785,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,6,2010-12-01 08:26:00,United Kingdom
1,u12583,536370,22900,SET 2 TEA TOWELS I LOVE LONDON,2.95,24,2010-12-01 08:45:00,France
2,u1785,536373,21730,GLASS STAR FROSTED T-LIGHT HOLDER,4.25,6,2010-12-01 09:02:00,United Kingdom
3,u1785,536375,84406B,CREAM CUPID HEARTS COAT HANGER,2.75,8,2010-12-01 09:32:00,United Kingdom
4,u14688,536378,21212,PACK OF 72 RETROSPOT CAKE CASES,0.42,120,2010-12-01 09:37:00,United Kingdom
...,...,...,...,...,...,...,...,...
405058,u12748,581580,22906,12 MESSAGE CARDS WITH ENVELOPES,1.65,1,2011-12-09 12:20:00,United Kingdom
405059,u12748,581580,23338,EGG FRYING PAN RED,2.08,1,2011-12-09 12:20:00,United Kingdom
405060,u12748,581580,22721,SET OF 3 CAKE TINS SKETCHBOOK,1.95,1,2011-12-09 12:20:00,United Kingdom
405061,u15804,581585,23356,LOVE HOT WATER BOTTLE,5.95,3,2011-12-09 12:31:00,United Kingdom


In [48]:
print(online_trans_w_desc.shape)
print(online_trans_final.shape)

(405063, 8)
(399841, 8)


In [49]:
## load to s3 - using python

import boto3
from io import StringIO, BytesIO

def connect_to_s3(aws_access_key_id, aws_secret_access_key):
    """Create a boto3 S3 client from the given access key pair and return it.

    A confirmation line is printed once the client object has been created.
    """
    credentials = dict(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    client = boto3.client("s3", **credentials)

    print("Connection to s3 made")
    return client


def df_to_s3(df, key, s3_bucket, aws_access_key_id, aws_secret_access_key):
    """Serialize a data frame and upload it to an S3 bucket.

    The output format is chosen from the last four characters of ``key``:
    ``.pkl`` writes a pickle, ``.csv`` writes a CSV without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to upload.
    key : str
        Object key inside the bucket; must end in ``.pkl`` or ``.csv``.
    s3_bucket : str
        Name of the destination bucket.
    aws_access_key_id, aws_secret_access_key : str
        Credentials passed on to ``connect_to_s3``.

    Raises
    ------
    ValueError
        If ``key`` ends in neither ``.pkl`` nor ``.csv``. This is checked before
        any connection to S3 is attempted.
    """
    file_type = key[-4:]

    if file_type == '.pkl':
        buffer = BytesIO()  # in-memory binary buffer for the pickled frame
        df.to_pickle(buffer)
    elif file_type == '.csv':
        buffer = StringIO()  # in-memory text buffer for the CSV
        df.to_csv(buffer, index=False)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `buffer` after the S3 client has already been created.
        raise ValueError(
            f"Unsupported file type {file_type!r} for key {key!r}: expected '.pkl' or '.csv'"
        )

    s3_client = connect_to_s3(aws_access_key_id, aws_secret_access_key)

    # upload the buffered content as a single object
    s3_client.put_object(
        Bucket=s3_bucket, Key=key, Body=buffer.getvalue()
    )

    print(f"The transformed data is saved as {file_type} in the following location s3://{s3_bucket}/{key}")

In [52]:
s3_bucket = 'july-bootcamp'
# AWS credentials are read from environment variables; do not hardcode them in
# the notebook.
# NOTE(review): the secret is looked up under "aws_secret_access_key_id" -
# confirm this matches the variable name defined in your environment.
aws_access_key_id = os.getenv("aws_access_key_id")
aws_secret_access_key = os.getenv("aws_secret_access_key_id")
# the last four characters of the key select the upload format in df_to_s3
key = "monday_transformations/sh_online_transactions_v2.pkl"

df_to_s3(online_trans_final, key, s3_bucket, aws_access_key_id, aws_secret_access_key)

Connection to s3 made
The transformed data is saved as .pkl in the following location s3://july-bootcamp/monday_transformations/sh_online_transactions_v2.pkl


In [60]:
# Also keep a local pickled copy of the final data frame. The path is relative
# to the notebook's working directory, so the ../data folder must already exist.

online_trans_final.to_pickle("../data/online_transactions_v2.pkl")