In [10]:
import yaml
from sqlalchemy import create_engine
from sqlalchemy import inspect
import pandas as pd
import numpy as np
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
import tabula

if __name__ == '__main__':
    database_extractor = DataExtractor()
    creds_dict = database_extractor.read_db_creds("db_creds.yaml")
    db_engine = database_extractor.init_db_engine(creds_dict)
    user_table = database_extractor.read_rds_table("legacy_users")

In [13]:
data_cleaner = DataCleaning()
# clean user table
user_table = data_cleaner.clean_user_data(user_table)

In [15]:
extractor = DataExtractor()
# extract product data from AWS S3 bucket
product_data = extractor.extract_from_s3("s3://data-handling-public/products.csv")

data_cleaner = DataCleaning()
# clean product data
product_data = data_cleaner.clean_products_data(product_data)
# convert weights to kg
product_data = data_cleaner.convert_product_weights(product_data)

In [20]:
product_data = product_data.sort_values(by='EAN', ascending=True)
product_data.head(19)

Unnamed: 0,index,product_name,product_price_£,weight_kg,category,EAN,date_added,uuid,removed,product_code
1029,1029,Waffle Faux Fur Cushion - Natural,10.0,0.27,homeware,1007256382552,2001-01-29,503d7aa6-f7c2-4c2a-a61a-2c9976abf09f,Removed,e7-2939951Y
730,730,Oakland Traditional Woven Grey Check Curtain 6...,34.99,2.36,homeware,1011161304547,2009-01-06,1104a2f5-185a-44bf-b0f4-54082ea6dfa7,Still_avaliable,E2-8942426p
1822,1822,Betty Crocker Vanilla Buttercream Style Icing ...,2.0,0.4,food-and-drink,1020758646588,2010-01-13,0417bd9f-54ee-41c8-adb5-0dc66ae24ff5,Still_avaliable,T8-8877377s
1073,1073,Aspen Faux Fur Cushion 48 x 48cm - Dark Green,8.0,0.29,homeware,102124068130,2001-04-28,30754a95-14e2-4925-b4f1-3636bce2b049,Still_avaliable,M5-0946971n
574,574,Spaceways 5 Tier Shoe Rack,10.0,1.5,homeware,1022053395369,2015-08-02,c9e81cc2-d99c-4325-9843-00fc6f70c280,Still_avaliable,J2-8271057t
1248,1248,White Vase with Berries,6.0,0.35,homeware,1029706705827,1998-03-09,27b7370c-7b2b-473b-9ff4-9127fa96ec22,Still_avaliable,i7-488835x
517,517,Tribal Mono Tufted Cushion - Black/White,12.0,0.68,homeware,103226619541,2009-12-26,2164fae5-2516-4ced-8763-7c4cfd8793b9,Still_avaliable,N2-9174372P
862,862,Alva Clothes Rail,20.0,2.5,homeware,104966681003,2010-03-12,ce64af0b-aaf0-4566-a20f-801a80c53b68,Still_avaliable,i2-0122662z
223,223,"Fisher-Price Click, Clack & Stack Gift Set",12.99,0.43,toys-and-games,1050712861443,2015-04-07,d798ce9d-2997-4dc4-8906-9ad0fe52f3e8,Still_avaliable,V8-1435047W
948,948,Tromso 4 Tier Shelving Unit,30.0,4.2,homeware,1055904033394,2008-08-16,890aefed-11b9-4c9f-b2ef-c129f6e487e9,Still_avaliable,g4-324960M


In [25]:
max_length = product_data[product_data["EAN"].apply(lambda x: len(x) != 13)]
max_length



Unnamed: 0,index,product_name,product_price_£,weight_kg,category,EAN,date_added,uuid,removed,product_code
1073,1073,Aspen Faux Fur Cushion 48 x 48cm - Dark Green,8.00,0.29,homeware,102124068130,2001-04-28,30754a95-14e2-4925-b4f1-3636bce2b049,Still_avaliable,M5-0946971n
517,517,Tribal Mono Tufted Cushion - Black/White,12.00,0.68,homeware,103226619541,2009-12-26,2164fae5-2516-4ced-8763-7c4cfd8793b9,Still_avaliable,N2-9174372P
862,862,Alva Clothes Rail,20.00,2.50,homeware,104966681003,2010-03-12,ce64af0b-aaf0-4566-a20f-801a80c53b68,Still_avaliable,i2-0122662z
1612,1612,Skittles Giant Sour Pouch 141g,1.00,0.14,food-and-drink,110596474830,2014-02-11,657862b1-0513-40d1-bb28-f586fef29a2f,Still_avaliable,Q9-2687893d
932,932,Lokken 4 Tier Ladder Shelf Unit,40.00,11.60,homeware,112397048753,2017-10-07,fb6c257f-2798-46b5-8d6d-a8b83e64f3c5,Still_avaliable,C9-3091977Q
...,...,...,...,...,...,...,...,...,...,...
1834,1834,UniBond No More Nails Click & Fix 30g,5.99,0.03,diy,972534995193,2005-02-21,caba2f0f-e1c6-41ac-93db-15b172fa1d0a,Still_avaliable,W5-0028619U
1117,1117,Small Donut Candle Holder,4.00,0.23,homeware,973615733888,1999-10-16,0b105bab-fa21-4459-bfb3-d946c2ef3910,Still_avaliable,H6-6666714e
1084,1084,Shell Cushion - Blush,10.00,0.49,homeware,974336664505,1997-04-03,bcfd864d-fae3-4f6d-9742-0b2bc99062d6,Still_avaliable,T9-6275187s
765,765,Maine Ladder Towel Rack,12.00,2.10,homeware,994484911788,2013-05-28,1af53a3a-5874-4afa-bd07-7653619aa9d1,Still_avaliable,Y4-6952691s


In [1]:
from data_extraction import DataExtractor
from data_cleaning import DataCleaning


database_extractor = DataExtractor()
# read card pdf to
card_table = database_extractor.retrieve_pdf_data(
    "https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf"
)

data_cleaner = DataCleaning()
# clean card table
card_table = data_cleaner.clean_card_data(card_table)

In [2]:
card_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15284 entries, 0 to 18
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   card_number             15284 non-null  object
 1   expiry_date             15284 non-null  object
 2   card_provider           15284 non-null  object
 3   date_payment_confirmed  15284 non-null  object
dtypes: object(4)
memory usage: 597.0+ KB


In [4]:
#card_table = card_table.sort_values(by='card_number', ascending=False)
card_table.head(12)

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
0,30060773296197,09/26,Diners Club / Carte Blanche,2015-11-25
1,349624180933183,10/23,American Express,2001-06-18
2,3529023891650490,06/23,JCB 16 digit,2000-12-26
3,213142929492281,09/27,JCB 15 digit,2011-02-12
4,502067329974,10/25,Maestro,1997-03-13
5,3506661913512980,11/23,JCB 16 digit,2003-08-25
6,377549437870679,07/27,American Express,2006-12-11
7,2321541881278150,02/29,Mastercard,1995-08-24
8,6011037917693140,02/24,Discover,1998-10-23
9,502049986008,07/23,Maestro,2011-04-30


In [3]:
x = card_table["card_number"].astype(str)
card_table[x.str.contains("3529023891650490")]

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
2,3529023891650490,06/23,JCB 16 digit,2000-12-26


In [5]:
card_table.expiry_date.unique()

array(['09/26', '10/23', '06/23', '09/27', '10/25', '11/23', '07/27',
       '02/29', '02/24', '07/23', '10/28', '11/27', '11/31', '10/29',
       '01/29', '01/26', '02/32', '10/26', '08/28', '09/32', '05/30',
       '02/23', '06/28', '07/24', '07/31', '10/30', '03/32', '12/30',
       '10/31', '12/25', '06/24', '04/32', '05/28', '08/23', '08/27',
       '08/26', '03/29', '06/25', '07/30', '01/24', '06/31', '03/24',
       '02/25', '01/23', '02/30', '03/27', '07/26', '09/25', '07/28',
       '03/28', '08/24', '06/29', '06/30', '04/23', '12/29', '03/23',
       '05/27', '04/28', '05/26', '07/32', '10/24', '01/32', '11/26',
       '02/28', '08/25', '02/31', '12/22', '07/29', '07/25', '05/31',
       '04/24', '05/25', '06/32', '05/32', '12/27', '09/31', '01/31',
       '11/25', '12/28', '10/27', '03/26', '03/31', '04/31', '08/29',
       '01/28', '09/30', '04/27', '04/29', '08/30', '12/23', '11/29',
       '09/23', '09/29', '09/28', '01/27', '05/24', '08/31', '03/25',
       '11/24', '04/

In [5]:
card_table = card_table[~card_table['card_number'].astype(str).str.contains('[a-zA-Z]')]

In [21]:
values = []
for name in card_table["card_number"]:
    if any(letter.isalpha() for letter in str(name)):
        values.append(name)

indices = card_table[card_table["card_number"].isin(values)].index
card_table.drop(indices, inplace=True)
#print(values)

In [270]:
import pandas as pd
import requests

url = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json'
response = requests.get(url)
data = response.json()

date_events = pd.DataFrame(data)

In [147]:
order_data.drop(labels=["level_0"], axis=1, inplace=True)

In [207]:
date_events.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 120160
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   timestamp    120123 non-null  object
 1   month        120123 non-null  object
 2   year         120123 non-null  object
 3   day          120123 non-null  object
 4   time_period  120123 non-null  object
 5   date_uuid    120123 non-null  object
dtypes: object(6)
memory usage: 6.4+ MB


In [278]:
date_events.sample(13)

Unnamed: 0,timestamp,month,year,day,time_period,date_uuid
45037,21:24:11,9,1995,25,Evening,97e4250d-7ad4-4323-92d3-a87aaa238ee9
2935,21:16:47,4,2022,22,Evening,93f34ac3-4528-4229-a22f-df814915594c
32329,20:34:41,6,2011,15,Evening,23de9580-f5d4-486e-a58d-8d8b80786adc
225,13:28:21,1,2003,19,Midday,4aacc34e-c8bd-429e-99ed-3679992c455b
61620,21:33:05,10,2000,28,Evening,5cd3a202-b9fe-4105-bc90-a34c49a8ba8e
38964,18:39:23,10,2004,30,Evening,86213e67-9af4-4d95-bf2c-bbc0a9ef39b6
3644,18:08:16,2,2003,17,Evening,92d6e442-be0c-4956-9ca7-9201fa3c3fe9
22818,13:47:50,6,1997,20,Midday,d471167c-d2df-488a-b2d4-d54f560b9f66
20308,20:55:54,2,1993,10,Evening,60d1474e-8a4e-4153-9f0d-1da7113ba4c5
53191,19:02:37,6,2018,18,Evening,f17019aa-a813-42c4-b22e-339d12a06f42


In [277]:
date_events.month.unique()

array(['9', '2', '4', '11', '12', '8', '1', '3', '7', '10', '5', '6'],
      dtype=object)

In [276]:
values = []
for name in date_events["month"]:
    for letter in name:
        if letter in "qwertyuiopasdfghjklmnbvcxzQWERTYUIOPLKJHGFDSAZXCVBNM!#$%&'()*+,/:;?@[\]^_`{|}~":
            values.append(name)
            break
indices = date_events[date_events["month"].isin(values)].index
date_events.drop(indices, inplace=True)

In [273]:
null_rows = date_events[date_events.astype(str).apply(lambda x: x.str.contains('NULL')).any(axis=1)]
null_rows

Unnamed: 0,timestamp,month,year,day,time_period,date_uuid
11873,,,,,,
14280,,,,,,
20358,,,,,,
23525,,,,,,
23794,,,,,,
27347,,,,,,
35767,,,,,,
50988,,,,,,
55040,,,,,,
67893,,,,,,


In [159]:
values = []
for name in order_data["card_number"]:
    name = str(name)
    if pd.notnull(name):  # Check if the value is not NaN
        for letter in name:
            if letter in "qwertyuiopasdfghjklmnbvcxzQWERTYUIOPLKJHGFDSAZXCVBNM!#$%&'()*+,:;?@[\]^_`{|}~":
                values.append(name)
                break
print(values)
#indices = store_data[store_data["staff_numbers"].isin(values)].index
#store_data.drop(indices, inplace=True)

[]


In [98]:
# one value is "77g .", removing " ."
product_data["weight"] = product_data["weight"].str.replace(" .", "", regex=True)

# Convert weights to decimal values in kg
for index, row in product_data.iterrows():
    weight = row["weight"]
    # some values are 3 x 20g, splitting them on "x", removing "g" and multiplying
    if "x" in weight:
        if weight.endswith("g"):
            weight = weight[:-1]
            substrings = weight.split("x")
            weight = round((float(substrings[0]) * float(substrings[1]) / 1000), 2)
        elif weight.endswith("ml"):
            weight = weight[:-2]
            substrings = weight.split("x")
            weight = round((float(substrings[0]) * float(substrings[1]) / 1000), 2)
    elif weight.endswith("kg"):
        weight = round((float(weight[:-2])), 2)
    elif weight.endswith("g"):
        weight = round((float(weight[:-1]) / 1000), 2)
    elif weight.endswith("ml"):
        weight = round((float(weight[:-2]) / 1000), 2)
    elif weight.endswith("oz"):
        weight = round((float(weight[:-2]) * 28.413 / 1000), 2)
    product_data.at[index, "weight"] = weight

In [97]:
# Drop rows that have NULL in all columns
product_data.drop(labels=[266, 788, 794, 1660], axis=0, inplace=True)

# some rows have random numbers in all rows
def product_corrupt_row_remover():
    """
    This function removes rows from a card table where the expiry date contains non-numeric
    characters except forward slash.
    """
    values = []
    for name in product_data["category"]:
        if pd.notnull(name):  # Check if the value is not NaN
            for letter in name:
                if letter in "1234567890":
                    values.append(name)
                    break
    indices = product_data[product_data["category"].isin(values)].index
    product_data.drop(indices, inplace=True)

product_corrupt_row_remover()

# rename Unnamed: 0 column to index
product_data.rename(columns={'Index': 'index'}, inplace=True)


product_data.rename(columns={'product_price': 'product_price_£'}, inplace=True)
product_data["product_price_£"] = product_data["product_price_£"].str.replace(
        "£", "")

In [115]:
product_data.sort_values(by='EAN', ascending=False)


Unnamed: 0.1,Unnamed: 0,product_name,product_price_£,weight,category,EAN,date_added,uuid,removed,product_code
652,652,Hanging Wicker Heart - White,2.50,0.08,homeware,9999808764157,2022-06-17,fa8cbcba-7cb9-4418-a9b0-a9f4205c0d04,Still_avaliable,i0-8924271N
925,925,Lush Paradise Gold Stacking Animals,2.00,0.57,homeware,9998964414937,2007-11-28,ce6caf9c-d3e6-4768-aead-0e0689c952f5,Still_avaliable,p7-0037173N
1497,1497,Russell Hobbs Groove Kettle 1.7L - Black & Gold,30.00,1.08,homeware,9997032224492,1995-10-13,8475b79b-60a6-4ec3-8beb-8b8c73291e44,Still_avaliable,W8-5377661G
1719,1719,Dynamite Hot Sauce 5pk,5.00,0.23,food-and-drink,9996110250187,1999-05-31,5646fb77-aa1f-41db-b04b-9625b58c4f53,Still_avaliable,L3-0341533H
992,992,Spaceways Padstow Metal Coat Stand - Black,22.00,3.20,homeware,9989410911650,2013-10-29,243fe9e1-f93b-401f-946b-08cd364a9356,Still_avaliable,Q0-2613353h
...,...,...,...,...,...,...,...,...,...,...
574,574,Spaceways 5 Tier Shoe Rack,10.00,1.50,homeware,1022053395369,2015-08-02,c9e81cc2-d99c-4325-9843-00fc6f70c280,Still_avaliable,J2-8271057t
1073,1073,Aspen Faux Fur Cushion 48 x 48cm - Dark Green,8.00,0.29,homeware,102124068130,2001-04-28,30754a95-14e2-4925-b4f1-3636bce2b049,Still_avaliable,M5-0946971n
1822,1822,Betty Crocker Vanilla Buttercream Style Icing ...,2.00,0.40,food-and-drink,1020758646588,2010-01-13,0417bd9f-54ee-41c8-adb5-0dc66ae24ff5,Still_avaliable,T8-8877377s
730,730,Oakland Traditional Woven Grey Check Curtain 6...,34.99,2.36,homeware,1011161304547,2009-01-06,1104a2f5-185a-44bf-b0f4-54082ea6dfa7,Still_avaliable,E2-8942426p


In [79]:
store_data['staff_numbers'] = store_data['staff_numbers'].str.replace('[a-zA-Z]', '')

  store_data['staff_numbers'] = store_data['staff_numbers'].str.replace('[a-zA-Z]', '')


In [70]:
card_table["expiry_date"].unique()

array(['09/26', '10/23', '09/27', '11/23', '07/27', '10/28', '11/27',
       '11/31', '01/29', '02/32', '08/28', '09/32', '05/30', '06/28',
       '07/24', '07/31', '03/32', '12/30', '12/25', '06/24', '10/30',
       '04/32', '05/28', '08/27', '02/23', '08/26', '02/24', '03/29',
       '06/25', '07/30', '01/24', '07/23', '06/31', '03/24', '02/25',
       '01/23', '02/30', '03/27', '09/25', '07/28', '03/28', '06/29',
       '04/23', '03/23', '05/27', '04/28', '07/32', '10/24', '11/26',
       '02/29', '08/25', '12/22', '07/29', '07/25', '05/31', '04/24',
       '05/25', '06/32', '05/32', '12/27', '09/31', '01/31', '11/25',
       '12/28', '10/27', '03/26', '03/31', '10/25', '08/29', '01/28',
       '06/30', '05/26', '04/29', '08/30', '12/23', '11/29', '02/31',
       '09/29', '09/28', '01/27', '05/24', '08/31', '03/25', '04/31',
       '11/24', '08/23', '04/30', '02/28', '01/30', '12/26', '10/26',
       '04/25', '12/31', '03/30', '11/30', '08/24', '06/23', '06/27',
       '10/31', '09/

In [6]:
user_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [2]:
data_cleaner = DataCleaning()
user_table = data_cleaner.clean_user_data(user_table)

In [5]:
user_table.join_date.unique()

array(['2018-10-10', '2001-12-20', '2016-12-16', ..., '2016-04-15',
       '2021-03-07', '2015-08-28'], dtype=object)

In [16]:
#user_table.set_index("index", inplace=True)
user_table.sample(8)

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3405,Heinz-Willi,Schinke,2004-01-01,Gieß AG & Co. KGaA,karl-augustmatthaei@schueler.de,"Schuchhardtplatz 3/9, 58699 Bogen",Germany,DE,49136585690,2002-04-01,5a57b0c9-7411-4ad6-806e-becbb1cc8318
6445,Edeltraut,Riehl,1979-08-13,Hartung Atzler GmbH,korbinian84@pruschke.de,"Gretel-Hentschel-Platz 3/0, 23554 Wernigerode",Germany,DE,497590950165,2018-02-15,3ad79aa6-8c5f-493c-be47-0595eec2695a
7563,Birte,Höfig,1995-01-15,Schäfer GmbH & Co. KGaA,amielcarek@thies.net,"Gutknechtallee 532, 78970 Osterburg",Germany,DE,493265418030,1996-02-09,8ed35d68-04b7-4b64-bb10-2f4d34aa8469
7888,Cathrin,Stiebitz,1983-04-13,Hiller,christlknappe@heidrich.net,"Laszlo-Etzold-Ring 5/5, 72362 Schongau",Germany,DE,49998302017,2016-09-11,c7cd819a-bbff-49cd-80c9-483feb6041f1
6677,Joyce,Walsh,1973-09-07,Pearce-Wells,woodfiona@lee.org,"Flat 9, Miah summit, West Hayley, S4 3XW",United Kingdom,GB,441914960737,2008-11-10,7bf9b15d-c8fd-470b-86be-2d7487e29c44
3404,Emilie,Stahr,1941-07-14,Holt,jdoehn@schweitzer.de,"Stjepan-Preiß-Allee 0/5, 60361 Gerolzhofen",Germany,DE,499691167228,2005-10-20,5dd255d1-82bc-4628-8e8e-79ad1351dae5
14674,Debra,Townsend,1989-02-03,Wall-James,naomi47@brown.com,"5 Ian spur, Suttonton, HU9W 0RS",United Kingdom,GB,442920180156,1999-02-07,d83a65aa-3e5e-448e-8d62-63068a24a307
10433,Larissa,Kraushaar,1962-12-09,Schlosser KG,gspiess@mueller.de,"Thiesstr. 15, 97488 Bützow",Germany,DE,49366998195,2010-12-11,f7c5e6b6-43c8-40ea-8cbe-4e68c2d83f3f


In [36]:
user_table[user_table["phone_number"].str.contains("[qwertyuioplkjhgfdsazxcvbnm]", regex=True) & (user_table["join_date"] == "GB")]

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid


In [10]:
user_table.iloc[3187]

first_name                                             Chelsea
last_name                                               Murphy
date_of_birth                                       1940-04-28
company                                           Read-Harding
email_address                              jadepowell@bird.com
address          Flat 3, Preston curve, North Lindsey, ML9 6PU
country                                         United Kingdom
country_code                                                GB
phone_number                                    00449098790034
join_date                                           2017-07-17
user_uuid                 ad783032-0a1f-4632-8f33-b808d5918f88
Name: 3200, dtype: object

In [6]:
values = []
char = set()
for number in user_table["phone_number"]:
    for chars in number:
        if chars in "qwertyuioplkjhgfdsazxcvbnm!#$%&'()*.-+,/:;?@[\]^_`{|}~":
            values.append(number)
            char.add(chars)
            break
print(char)
#user_table[user_table['phone_number'].isin(values)]
#indices = user_table[user_table['first_name'].isin(values)].index
#user_table.drop(indices, inplace=True)

{'+', 'x', '('}


In [7]:
# check for nulls
user_table.isnull().sum()

first_name       0
last_name        0
date_of_birth    0
company          0
email_address    0
address          0
country          0
country_code     0
phone_number     0
join_date        0
user_uuid        0
dtype: int64