They have requested that you create a new table called online_transactions_fixed:
- (SQL) They realised the stock code field can’t be trusted, so they would like you to join the description onto this table, leaving out descriptions recorded as a question mark (?). They also want to keep track of cases where the description is missing for future analysis, so use a left join.
- (SQL) They would like you to remove all rows of data where customer_id is null
- (SQL) They would like you to replace all missing values of Description with Unknown
- (Python) They would like you to remove any duplicated rows of data, and only keep the first appearance
- (SQL) They want you to remove the following stock codes:
BANK CHARGES, POST, D, M, CRUK


In [22]:
import os
import sqlite3
from io import StringIO

import boto3
import pandas as pd

In [4]:
# Open the bootcamp SQLite database that holds the source tables.
conn = sqlite3.connect(database="../../week16/data/bootcamp_db")

In [5]:
# Small sample of the raw transactions table, used only to eyeball its columns.
query = """
select *
from online_transactions
limit 10
"""

# Run the preview query over the open SQLite connection.
test = pd.read_sql(sql=query, con=conn)

In [6]:
# Display the 10-row sample to confirm the available columns.
test

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,536370,21791,24,2010-12-01 08:45:00,1.25,u12583,France
1,536373,82494L,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom
2,536378,21929,10,2010-12-01 09:37:00,1.95,u14688,United Kingdom
3,536381,37444A,1,2010-12-01 09:41:00,2.95,u15311,United Kingdom
4,536381,15056BL,2,2010-12-01 09:41:00,5.95,u15311,United Kingdom
5,536381,22438,1,2010-12-01 09:41:00,1.95,u15311,United Kingdom
6,536384,22469,40,2010-12-01 09:53:00,1.45,u18074,United Kingdom
7,536384,22189,4,2010-12-01 09:53:00,3.95,u18074,United Kingdom
8,536385,22783,1,2010-12-01 09:56:00,19.95,u1742,United Kingdom
9,536389,35004C,6,2010-12-01 10:03:00,5.45,u12431,Australia


In [7]:
# Build the cleaned transactions set in a single SQL statement:
#   * left join keeps every transaction, even when no usable description exists;
#   * descriptions equal to '?' are dropped before the join, so they surface as
#     missing and are filled like any other missing description;
#   * missing descriptions are filled with 'Unknown', as the brief requests;
#   * rows with a null or empty customer_id are removed;
#   * the non-product stock codes listed in the brief are removed.
# Every column is qualified with its table alias so the statement stays
# unambiguous if either table gains a same-named column.
query = """
select ot.*,
       coalesce(sd.description, 'Unknown') as description
from online_transactions ot
left join (select stock_code, description
           from stock_description
           where description <> '?') sd on ot.stock_code = sd.stock_code
where ot.customer_id is not null
    and ot.customer_id <> ''
    and ot.stock_code not in ('BANK CHARGES', 'POST', 'D', 'M', 'CRUK')
"""

In [8]:
# Execute the join/filter query and load the result into a data frame.
ot_w_desc = pd.read_sql(sql=query, con=conn)

In [9]:
# Inspect the joined and filtered transactions (pandas truncates the display).
ot_w_desc

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536370,21791,24,2010-12-01 08:45:00,1.25,u12583,France,VINTAGE HEADS AND TAILS CARD GAME
1,536373,82494L,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,WOODEN FRAME ANTIQUE WHITE
2,536378,21929,10,2010-12-01 09:37:00,1.95,u14688,United Kingdom,JUMBO BAG PINK VINTAGE PAISLEY
3,536381,37444A,1,2010-12-01 09:41:00,2.95,u15311,United Kingdom,YELLOW BREAKFAST CUP AND SAUCER
4,536381,15056BL,2,2010-12-01 09:41:00,5.95,u15311,United Kingdom,EDWARDIAN PARASOL BLACK
...,...,...,...,...,...,...,...,...
405058,581580,22698,1,2011-12-09 12:20:00,2.95,u12748,United Kingdom,PINK REGENCY TEACUP AND SAUCER
405059,581584,20832,72,2011-12-09 12:25:00,0.72,u13777,United Kingdom,RED FLOCK LOVE HEART PHOTO FRAME
405060,581585,22178,12,2011-12-09 12:31:00,1.95,u15804,United Kingdom,VICTORIAN GLASS HANGING T-LIGHT
405061,581585,84692,25,2011-12-09 12:31:00,0.42,u15804,United Kingdom,BOX OF 24 COCKTAIL PARASOLS


In [10]:
# Row and column counts after the SQL filters, before de-duplication.
ot_w_desc.shape

(405063, 8)

In [11]:
# Count nulls per column to confirm the description fill and customer_id filter worked.
ot_w_desc.isnull().sum()

invoice         0
stock_code      0
quantity        0
invoice_date    0
price           0
customer_id     0
country         0
description     0
dtype: int64

In [12]:
# De-duplicate on all columns; for each set of identical rows only the
# earliest occurrence survives.
ot_cleaned = ot_w_desc.drop_duplicates(subset=None, keep="first")

In [13]:
# Shape after de-duplication; compare with ot_w_desc.shape to see how many rows were dropped.
ot_cleaned.shape

(399841, 8)

In [14]:
# Persist the cleaned transactions to the local data folder, leaving out the
# data frame index so the file holds only the data columns.
ot_cleaned.to_csv(path_or_buf="../data/ot_final.csv", index=False)

In [20]:
# Sanity check: read the exported file back and look at its first five rows.
check = pd.read_csv(filepath_or_buffer="../data/ot_final.csv")
check.head(5)

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536370,21791,24,2010-12-01 08:45:00,1.25,u12583,France,VINTAGE HEADS AND TAILS CARD GAME
1,536373,82494L,6,2010-12-01 09:02:00,2.55,u1785,United Kingdom,WOODEN FRAME ANTIQUE WHITE
2,536378,21929,10,2010-12-01 09:37:00,1.95,u14688,United Kingdom,JUMBO BAG PINK VINTAGE PAISLEY
3,536381,37444A,1,2010-12-01 09:41:00,2.95,u15311,United Kingdom,YELLOW BREAKFAST CUP AND SAUCER
4,536381,15056BL,2,2010-12-01 09:41:00,5.95,u15311,United Kingdom,EDWARDIAN PARASOL BLACK


In [21]:
# Shape of the re-loaded file; it should match ot_cleaned.shape.
check.shape

(399841, 8)

In [24]:
# Connection settings for S3.
#
# The key pair is read from the standard AWS environment variables so that no
# secret is ever stored in (or committed with) the notebook. If a variable is
# not set the value is None, in which case boto3 falls back to its default
# credential provider chain (shared credentials file, instance role, ...).
#
# NOTE(review): a key pair was previously committed in this cell; treat it as
# compromised and rotate it in AWS IAM.
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_s3_bucket = "waia-march-bootcamp"

In [25]:
# create a definition to connect to s3

def connect_to_s3(aws_access_key_id, aws_secret_access_key):
    """Create a boto3 S3 client from an explicit access key pair.

    Args:
        aws_access_key_id: Access key id handed to ``boto3.client``.
        aws_secret_access_key: Secret access key handed to ``boto3.client``.

    Returns:
        The client object returned by ``boto3.client("s3", ...)``.
    """
    credentials = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
    }
    client = boto3.client("s3", **credentials)

    # Printed as soon as the client object has been built.
    print("Connection to s3 made")
    return client

In [26]:
def load_df_to_s3(df, key, s3_bucket, aws_access_key_id, aws_secret_access_key):
    """Write a data frame as a .csv object to an S3 bucket.

    The frame is rendered to CSV in memory (no temporary file) and uploaded
    with a single ``put_object`` call.

    Args:
        df: Object exposing ``to_csv(buffer, index=False)``; a pandas
            DataFrame in this notebook.
        key: Object key (path inside the bucket) to write to.
        s3_bucket: Name of the destination bucket.
        aws_access_key_id: Access key id forwarded to ``connect_to_s3``.
        aws_secret_access_key: Secret access key forwarded to ``connect_to_s3``.

    Returns:
        None. The destination URI is printed once the upload call returns.
    """
    s3_client = connect_to_s3(aws_access_key_id, aws_secret_access_key)

    # Render the CSV into an in-memory text buffer, without the index column.
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)

    # put_object documents Body as bytes or a file-like object, so encode the
    # text explicitly instead of relying on an implicit str conversion.
    s3_client.put_object(
        Bucket=s3_bucket,
        Key=key,
        Body=csv_buffer.getvalue().encode("utf-8"),
    )

    print(f"The transformed data in the following location s3://{s3_bucket}/{key}")

In [27]:
# Destination object key inside the bucket.
key = "etl/sh_online_trans_final.csv"

# Upload the de-duplicated transactions as a CSV object.
load_df_to_s3(
    df=ot_cleaned,
    key=key,
    s3_bucket=aws_s3_bucket,
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_access_key,
)

Connection to s3 made
The transformed data in the following location s3://waia-march-bootcamp/etl/sh_online_trans_final.csv
