In [88]:
import yaml
from sqlalchemy import create_engine
from sqlalchemy import inspect
import pandas as pd
import numpy as np
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
import tabula

if __name__ == '__main__':
    database_extractor = DataExtractor()
    creds_dict = database_extractor.read_db_creds("db_creds.yaml")
    db_engine = database_extractor.init_db_engine(creds_dict)
    user_table = database_extractor.read_rds_table("legacy_users")

In [96]:
extractor = DataExtractor()
product_data = extractor.extract_from_s3("s3://data-handling-public/products.csv")

In [138]:
product_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1846 entries, 0 to 1852
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1846 non-null   int64  
 1   product_name     1846 non-null   object 
 2   product_price_£  1846 non-null   object 
 3   weight           1826 non-null   float64
 4   category         1846 non-null   object 
 5   EAN              1846 non-null   object 
 6   date_added       1846 non-null   object 
 7   uuid             1846 non-null   object 
 8   removed          1846 non-null   object 
 9   product_code     1846 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 223.2+ KB


In [134]:
product_data.sample(7)

Unnamed: 0.1,Unnamed: 0,product_name,product_price_£,weight,category,EAN,date_added,uuid,removed,product_code
413,413,Lokken Bookcase - White,32.0,13.0,homeware,4708840280779,2010-08-06,db09feee-2d8e-4f0e-a1ce-69ddd60d5dd4,Still_avaliable,u7-7882575N
1627,1627,Rowntree's Randoms - Squidgy Swirls 92g,1.0,0.09,food-and-drink,3207083668272,1995-10-07,3192cd8a-e0cf-4078-b30e-14475cefda68,Still_avaliable,V6-6003579k
1700,1700,John West Pink Salmon in Brine 3 x 132g,3.99,0.33,food-and-drink,4758183010205,2006-07-19,6f5b78f6-02cd-4ef1-ba53-678109bf9294,Still_avaliable,L5-106382J
447,447,Tromso 1 Tier Shelf,10.0,1.5,homeware,2890946286716,2012-04-27,3296d7a6-3b98-4ed0-b16c-019925490b1b,Still_avaliable,Q6-1616904A
677,677,Candles with Wreath 2pk - Berry,5.0,0.06,homeware,960245689678,1994-07-12,0415bf6a-fd4c-4efe-a167-81d9ef0149fd,Still_avaliable,q9-5051383B
1661,1661,Jaffa Cakes Tube 500g,3.0,0.5,food-and-drink,8807409652869,2019-05-21,ea3f2cf7-dbe5-4401-81b0-e966bb1504c7,Still_avaliable,J6-2271628n
39,39,Paw Patrol Casting Station,12.99,0.88,toys-and-games,4248492315387,2016-08-13,7c1908d7-938e-43d6-985b-af7fd72bdcf2,Still_avaliable,l1-4267653w


In [117]:
product_data["date_added"] = pd.to_datetime(
    product_data["date_added"], errors="coerce"
)
# reformat the date column to YYYY-MM-DD
product_data["date_added"] = product_data["date_added"].dt.strftime(
    "%Y-%m-%d"
)

In [137]:
product_data.product_code.nunique()

1846

In [98]:
# one value is "77g .", removing " ."
product_data["weight"] = product_data["weight"].str.replace(" .", "", regex=True)

# Convert weights to decimal values in kg
for index, row in product_data.iterrows():
    weight = row["weight"]
    # some values are 3 x 20g, splitting them on "x", removing "g" and multiplying
    if "x" in weight:
        if weight.endswith("g"):
            weight = weight[:-1]
            substrings = weight.split("x")
            weight = round((float(substrings[0]) * float(substrings[1]) / 1000), 2)
        elif weight.endswith("ml"):
            weight = weight[:-2]
            substrings = weight.split("x")
            weight = round((float(substrings[0]) * float(substrings[1]) / 1000), 2)
    elif weight.endswith("kg"):
        weight = round((float(weight[:-2])), 2)
    elif weight.endswith("g"):
        weight = round((float(weight[:-1]) / 1000), 2)
    elif weight.endswith("ml"):
        weight = round((float(weight[:-2]) / 1000), 2)
    elif weight.endswith("oz"):
        weight = round((float(weight[:-2]) * 28.413 / 1000), 2)
    product_data.at[index, "weight"] = weight

In [97]:
# Drop rows that have NULL in all columns
product_data.drop(labels=[266, 788, 794, 1660], axis=0, inplace=True)

# some rows have random numbers in all rows
def product_corrupt_row_remover():
    """
    This function removes rows from a card table where the expiry date contains non-numeric
    characters except forward slash.
    """
    values = []
    for name in product_data["category"]:
        if pd.notnull(name):  # Check if the value is not NaN
            for letter in name:
                if letter in "1234567890":
                    values.append(name)
                    break
    indices = product_data[product_data["category"].isin(values)].index
    product_data.drop(indices, inplace=True)

product_corrupt_row_remover()

# rename Unnamed: 0 column to index
product_data.rename(columns={'Index': 'index'}, inplace=True)


product_data.rename(columns={'product_price': 'product_price_£'}, inplace=True)
product_data["product_price_£"] = product_data["product_price_£"].str.replace(
        "£", "")

In [115]:
product_data.sort_values(by='EAN', ascending=False)


Unnamed: 0.1,Unnamed: 0,product_name,product_price_£,weight,category,EAN,date_added,uuid,removed,product_code
652,652,Hanging Wicker Heart - White,2.50,0.08,homeware,9999808764157,2022-06-17,fa8cbcba-7cb9-4418-a9b0-a9f4205c0d04,Still_avaliable,i0-8924271N
925,925,Lush Paradise Gold Stacking Animals,2.00,0.57,homeware,9998964414937,2007-11-28,ce6caf9c-d3e6-4768-aead-0e0689c952f5,Still_avaliable,p7-0037173N
1497,1497,Russell Hobbs Groove Kettle 1.7L - Black & Gold,30.00,1.08,homeware,9997032224492,1995-10-13,8475b79b-60a6-4ec3-8beb-8b8c73291e44,Still_avaliable,W8-5377661G
1719,1719,Dynamite Hot Sauce 5pk,5.00,0.23,food-and-drink,9996110250187,1999-05-31,5646fb77-aa1f-41db-b04b-9625b58c4f53,Still_avaliable,L3-0341533H
992,992,Spaceways Padstow Metal Coat Stand - Black,22.00,3.20,homeware,9989410911650,2013-10-29,243fe9e1-f93b-401f-946b-08cd364a9356,Still_avaliable,Q0-2613353h
...,...,...,...,...,...,...,...,...,...,...
574,574,Spaceways 5 Tier Shoe Rack,10.00,1.50,homeware,1022053395369,2015-08-02,c9e81cc2-d99c-4325-9843-00fc6f70c280,Still_avaliable,J2-8271057t
1073,1073,Aspen Faux Fur Cushion 48 x 48cm - Dark Green,8.00,0.29,homeware,102124068130,2001-04-28,30754a95-14e2-4925-b4f1-3636bce2b049,Still_avaliable,M5-0946971n
1822,1822,Betty Crocker Vanilla Buttercream Style Icing ...,2.00,0.40,food-and-drink,1020758646588,2010-01-13,0417bd9f-54ee-41c8-adb5-0dc66ae24ff5,Still_avaliable,T8-8877377s
730,730,Oakland Traditional Woven Grey Check Curtain 6...,34.99,2.36,homeware,1011161304547,2009-01-06,1104a2f5-185a-44bf-b0f4-54082ea6dfa7,Still_avaliable,E2-8942426p


In [116]:
values = []
for name in product_data["EAN"]:
    if pd.notnull(name):  # Check if the value is not NaN
        for letter in name:
            if letter in "qwertyuiopasdfghjklmnbvcxzQWERTYUIOPLKJHGFDSAZXCVBNM!#$%&'()*+,:;?@[\]^_`{|}~":
                values.append(name)
                break
print(values)
#indices = store_data[store_data["staff_numbers"].isin(values)].index
#store_data.drop(indices, inplace=True)

[]


In [79]:
store_data['staff_numbers'] = store_data['staff_numbers'].str.replace('[a-zA-Z]', '')

  store_data['staff_numbers'] = store_data['staff_numbers'].str.replace('[a-zA-Z]', '')


In [70]:
card_table["expiry_date"].unique()

array(['09/26', '10/23', '09/27', '11/23', '07/27', '10/28', '11/27',
       '11/31', '01/29', '02/32', '08/28', '09/32', '05/30', '06/28',
       '07/24', '07/31', '03/32', '12/30', '12/25', '06/24', '10/30',
       '04/32', '05/28', '08/27', '02/23', '08/26', '02/24', '03/29',
       '06/25', '07/30', '01/24', '07/23', '06/31', '03/24', '02/25',
       '01/23', '02/30', '03/27', '09/25', '07/28', '03/28', '06/29',
       '04/23', '03/23', '05/27', '04/28', '07/32', '10/24', '11/26',
       '02/29', '08/25', '12/22', '07/29', '07/25', '05/31', '04/24',
       '05/25', '06/32', '05/32', '12/27', '09/31', '01/31', '11/25',
       '12/28', '10/27', '03/26', '03/31', '10/25', '08/29', '01/28',
       '06/30', '05/26', '04/29', '08/30', '12/23', '11/29', '02/31',
       '09/29', '09/28', '01/27', '05/24', '08/31', '03/25', '04/31',
       '11/24', '08/23', '04/30', '02/28', '01/30', '12/26', '10/26',
       '04/25', '12/31', '03/30', '11/30', '08/24', '06/23', '06/27',
       '10/31', '09/

In [6]:
user_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [2]:
data_cleaner = DataCleaning()
user_table = data_cleaner.clean_user_data(user_table)

In [5]:
user_table.join_date.unique()

array(['2018-10-10', '2001-12-20', '2016-12-16', ..., '2016-04-15',
       '2021-03-07', '2015-08-28'], dtype=object)

In [16]:
#user_table.set_index("index", inplace=True)
user_table.sample(8)

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3405,Heinz-Willi,Schinke,2004-01-01,Gieß AG & Co. KGaA,karl-augustmatthaei@schueler.de,"Schuchhardtplatz 3/9, 58699 Bogen",Germany,DE,49136585690,2002-04-01,5a57b0c9-7411-4ad6-806e-becbb1cc8318
6445,Edeltraut,Riehl,1979-08-13,Hartung Atzler GmbH,korbinian84@pruschke.de,"Gretel-Hentschel-Platz 3/0, 23554 Wernigerode",Germany,DE,497590950165,2018-02-15,3ad79aa6-8c5f-493c-be47-0595eec2695a
7563,Birte,Höfig,1995-01-15,Schäfer GmbH & Co. KGaA,amielcarek@thies.net,"Gutknechtallee 532, 78970 Osterburg",Germany,DE,493265418030,1996-02-09,8ed35d68-04b7-4b64-bb10-2f4d34aa8469
7888,Cathrin,Stiebitz,1983-04-13,Hiller,christlknappe@heidrich.net,"Laszlo-Etzold-Ring 5/5, 72362 Schongau",Germany,DE,49998302017,2016-09-11,c7cd819a-bbff-49cd-80c9-483feb6041f1
6677,Joyce,Walsh,1973-09-07,Pearce-Wells,woodfiona@lee.org,"Flat 9, Miah summit, West Hayley, S4 3XW",United Kingdom,GB,441914960737,2008-11-10,7bf9b15d-c8fd-470b-86be-2d7487e29c44
3404,Emilie,Stahr,1941-07-14,Holt,jdoehn@schweitzer.de,"Stjepan-Preiß-Allee 0/5, 60361 Gerolzhofen",Germany,DE,499691167228,2005-10-20,5dd255d1-82bc-4628-8e8e-79ad1351dae5
14674,Debra,Townsend,1989-02-03,Wall-James,naomi47@brown.com,"5 Ian spur, Suttonton, HU9W 0RS",United Kingdom,GB,442920180156,1999-02-07,d83a65aa-3e5e-448e-8d62-63068a24a307
10433,Larissa,Kraushaar,1962-12-09,Schlosser KG,gspiess@mueller.de,"Thiesstr. 15, 97488 Bützow",Germany,DE,49366998195,2010-12-11,f7c5e6b6-43c8-40ea-8cbe-4e68c2d83f3f


In [36]:
user_table[user_table["phone_number"].str.contains("[qwertyuioplkjhgfdsazxcvbnm]", regex=True) & (user_table["join_date"] == "GB")]

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid


In [10]:
user_table.iloc[3187]

first_name                                             Chelsea
last_name                                               Murphy
date_of_birth                                       1940-04-28
company                                           Read-Harding
email_address                              jadepowell@bird.com
address          Flat 3, Preston curve, North Lindsey, ML9 6PU
country                                         United Kingdom
country_code                                                GB
phone_number                                    00449098790034
join_date                                           2017-07-17
user_uuid                 ad783032-0a1f-4632-8f33-b808d5918f88
Name: 3200, dtype: object

In [6]:
values = []
char = set()
for number in user_table["phone_number"]:
    for chars in number:
        if chars in "qwertyuioplkjhgfdsazxcvbnm!#$%&'()*.-+,/:;?@[\]^_`{|}~":
            values.append(number)
            char.add(chars)
            break
print(char)
#user_table[user_table['phone_number'].isin(values)]
#indices = user_table[user_table['first_name'].isin(values)].index
#user_table.drop(indices, inplace=True)

{'+', 'x', '('}


In [7]:
# check for nulls
user_table.isnull().sum()

first_name       0
last_name        0
date_of_birth    0
company          0
email_address    0
address          0
country          0
country_code     0
phone_number     0
join_date        0
user_uuid        0
dtype: int64