In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import tabula
import requests
import json
from database_utils import DatabaseConnector
from sqlalchemy import inspect

In [9]:
# Load the source-database credentials and build the engine used by the cells below.
db_connector = DatabaseConnector()
try:
    # Both calls can fail (e.g. unreadable credentials), so they must sit inside the
    # try block; previously only the print statement was guarded.
    source_credentials = db_connector.read_source_db_creds()
    source_engine = db_connector.init_source_db_engine(source_credentials)
    # NOTE(review): if init_source_db_engine only builds an engine lazily, no network
    # connection has actually been opened at this point - confirm before trusting
    # the success message.
    print(f"Connection to the {source_credentials['RDS_HOST']} for user {source_credentials['RDS_USER']} created successfully.")
except Exception as ex:
    print("Connection could not be made due to the following error: \n", ex)
    # Re-raise so that later cells do not run against a missing engine.
    raise

Connection to the data-handling-project-readonly.cq2e8zno855e.eu-west-1.rds.amazonaws.com for user aicore_admin created successfully.


In [10]:
def list_db_tables(source_credentials, engine=None):
    """Return the names of the tables visible through the database engine.

    Args:
        source_credentials: Not used by the lookup; kept so existing calls of
            the form ``list_db_tables(source_credentials)`` keep working.
        engine: Object passed to SQLAlchemy's ``inspect``. When omitted, the
            notebook-level ``source_engine`` is used.

    Returns:
        The value of ``inspector.get_table_names()``, or ``None`` when the
        lookup fails (the error is printed instead of raised).
    """
    try:
        if engine is None:
            # Resolved inside the try block so a missing global is reported
            # through the same error path as any other failure.
            engine = source_engine
        # Use the SQLAlchemy inspector to get table names
        inspector = inspect(engine)
        return inspector.get_table_names()
    except Exception as e:
        print(f"Error listing tables: {e}")
        return None
        
# Display the table names available in the source database.
list_db_tables(source_credentials)

['legacy_store_details', 'legacy_users', 'orders_table']

In [11]:
def read_rds_table(table_name, index_col='index', engine=None):
    """Read a whole database table into a pandas DataFrame.

    Args:
        table_name: Name of the table to read. It is interpolated directly
            into the SQL text, so only pass trusted, hard-coded names.
        index_col: Column to use as the DataFrame index, or ``None`` to keep
            the default integer index. Defaults to ``'index'``.
        engine: Connection or engine handed to ``pd.read_sql_query``. When
            omitted, the notebook-level ``source_engine`` is used.

    Returns:
        The table as a DataFrame, or ``None`` when the read fails (the error
        is printed instead of raised).
    """
    try:
        if engine is None:
            # Resolved inside the try block so a missing global is reported
            # through the same error path as a failed query.
            engine = source_engine
        query = f"SELECT * FROM {table_name}"
        return pd.read_sql_query(query, engine, index_col=index_col)
    except Exception as e:
        print(f"Error reading table {table_name}: {e}")
        return None

# Load the "legacy_users" table; read_rds_table returns None if the read failed.
users_table_df = read_rds_table("legacy_users")

In [12]:
# Preview the first rows of the users table.
users_table_df.head()

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579


In [13]:
# Summarise every column (count / unique / top / freq) of the raw table.
users_table_df.describe(include="all")

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
count,15320,15320,15320.0,15320,15320.0,15320.0,15320,15320,15320.0,15320.0,15320.0
unique,2178,1192,11360.0,12105,15300.0,15300.0,19,20,15092.0,8268.0,15300.0
top,Michael,Smith,,Smith Inc,,,United Kingdom,GB,,,
freq,57,340,21.0,22,21.0,21.0,9371,9365,21.0,21.0,21.0


In [14]:
# Show the column dtypes and non-null counts.
users_table_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [17]:
# List the distinct country codes to spot invalid entries.
users_table_df["country_code"].unique()

array(['DE', 'GB', 'US', 'VSM4IZ4EL3', 'NULL', 'QVUW9JSKY3', 'GGB',
       '0CU6LW3NKB', 'PG8MOC0UZI', 'NTCGYW8LVC', 'FB13AKRI21',
       'OS2P9CMHR6', '5D74J6FPFJ', 'LZGTB0T5Z7', 'IM8MN1L9MJ',
       'RVRFD92E48', 'XKI9UXSCZ1', 'QREF9WLI2A', 'XPVCZE2L8B',
       '44YAIDY048'], dtype=object)

In [18]:
# Show the rows whose country code is not one of GB, DE or US.
users_table_df.loc[~users_table_df["country_code"].isin(["GB", "DE", "US"])]

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
752,PYCLKLLC7I,W350SCUD6R,KBTI7FI7Y3,R7IZUNSQX0,3Q791B3VIY,YW2YXLOQ5J,I7G4DMDZOZ,VSM4IZ4EL3,A4Q4HQBI3I,JJ2PDVNPRO,W43MSCMQ88
867,,,,,,,,,,,
1023,,,,,,,,,,,
1047,GI4C78KWH0,UTB5PPYFG8,OFH8YGZJWN,CA1XGS8GZW,7HSZB429UK,63GXGYR3XL,AJ1ENKS3QL,QVUW9JSKY3,64ZO0ONUQO,AHN6EKASH3,BUE34OU973
1807,,,,,,,,,,,
2103,,,,,,,,,,,
2439,,,,,,,,,,,
2597,Peter,Mitchell,1942-01-11,"Jennings, Taylor and Davies",mlambert@riley.com,849 Collins camp\nSouth Francisland\nAL80 8TD,United Kingdom,GGB,0121 4960043,1995-07-15,bd3e3bc5-3a25-40dd-ac0f-3f2aaf5814df
6526,,,,,,,,,,,
2764,,,,,,,,,,,


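The country code "GGB" is a typo for "GB": the row that carries it (index 2597) is otherwise a valid United Kingdom record, so "GGB" can be replaced with "GB".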

In [19]:
# Correct the mistyped code "GGB" to "GB". Series.replace matches whole cell values,
# whereas .str.replace rewrites any substring (and was regex-based by default in older
# pandas), which could silently alter other codes that merely contain "GGB".
users_table_df["country_code"] = users_table_df["country_code"].replace("GGB", "GB")

In [20]:
# Re-check which rows still have a country code other than GB, DE or US.
users_table_df.loc[~users_table_df["country_code"].isin(["GB", "DE", "US"])]

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
752,PYCLKLLC7I,W350SCUD6R,KBTI7FI7Y3,R7IZUNSQX0,3Q791B3VIY,YW2YXLOQ5J,I7G4DMDZOZ,VSM4IZ4EL3,A4Q4HQBI3I,JJ2PDVNPRO,W43MSCMQ88
867,,,,,,,,,,,
1023,,,,,,,,,,,
1047,GI4C78KWH0,UTB5PPYFG8,OFH8YGZJWN,CA1XGS8GZW,7HSZB429UK,63GXGYR3XL,AJ1ENKS3QL,QVUW9JSKY3,64ZO0ONUQO,AHN6EKASH3,BUE34OU973
1807,,,,,,,,,,,
2103,,,,,,,,,,,
2439,,,,,,,,,,,
6526,,,,,,,,,,,
2764,,,,,,,,,,,
2997,DPAJNJL6PR,B8ZGN8ZJ84,PQPEUO937L,0YJ2FRMDB4,O5Q6D7FDAF,Y1GY1G3EM5,XGI7FM0VBJ,0CU6LW3NKB,DU9UJ42F3E,FYF2FAPZF3,56URKLG01W


In [21]:
# Keep only the rows whose country code is one of the accepted values.
keep_values = ["GB", "DE", "US"]
# .copy() makes the filtered result an independent frame, so later column assignments
# modify it directly instead of triggering SettingWithCopyWarning on a slice.
users_table_df = users_table_df[users_table_df["country_code"].isin(keep_values)].copy()

In [24]:
# Summarise every column again to check the effect of the country-code filter.
users_table_df.describe(include="all")

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
count,15284,15284,15284,15284,15284,15284,15284,15284,15284,15284,15284
unique,2162,1176,11344,12089,15284,15284,3,3,15076,8252,15284
top,Michael,Smith,1958-02-02,Smith Inc,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,United Kingdom,GB,+44(0)808 1570796,2006-05-30,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
freq,57,340,6,22,1,1,9371,9371,3,8,1


In [26]:
# Flag rows that repeat an earlier row in full, then list the distinct flag values;
# a result containing only False means there are no duplicate rows.
users_table_df.duplicated(subset=None, keep="first").unique()

array([False])

There are no duplicated rows in the dataset.

In [27]:
# Select every row that has a null in at least one column.
users_table_df.loc[users_table_df.isna().any(axis="columns")]

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


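No null data remains. Note that the missing values in the raw table were stored as the literal string 'NULL' (see the country codes above), which `isnull()` does not flag; those rows were removed by the country-code filter instead.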