In [1]:
from data_extraction import DataExtractor
from data_cleaning import DataCleaning
import pandas as pd
from database_utils import DatabaseConnector

You have already downloaded the 'stores_df' DataFrame and saved it as a csv file. To avoid making further API calls, you will load the csv file back into a DataFrame and use that to develop the method for cleaning the store data.

In [2]:
# Load the store data from the local csv copy rather than requesting it from the API again.
stores_df = pd.read_csv("stores_data.csv")

# Summarise column names, non-null counts and dtypes before any cleaning is applied.
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          451 non-null    int64 
 1   address        447 non-null    object
 2   longitude      447 non-null    object
 3   lat            7 non-null      object
 4   locality       447 non-null    object
 5   store_code     448 non-null    object
 6   staff_numbers  448 non-null    object
 7   opening_date   448 non-null    object
 8   store_type     448 non-null    object
 9   latitude       447 non-null    object
 10  country_code   448 non-null    object
 11  continent      448 non-null    object
dtypes: int64(1), object(11)
memory usage: 42.4+ KB


In [3]:
# Inspect the row(s) of the raw store data that carry this specific store code.
store_code = "WEB-1388012W"
selected_rows = stores_df.loc[stores_df['store_code'] == store_code]

# End the cell with the frame itself so the notebook renders it as a table;
# print() falls back to plain text and wraps the columns across several lines.
selected_rows

   index address longitude  lat locality    store_code staff_numbers  \
0      0     NaN       NaN  NaN      NaN  WEB-1388012W           325   

  opening_date  store_type latitude country_code continent  
0   2010-06-12  Web Portal      NaN           GB    Europe  


In [4]:
# NOTE(review): this repeats the store-code lookup from the previous cell (same
# code value, same filter on stores_df) under different variable names, yet the
# two recorded outputs differ. Re-run from a fresh kernel to confirm which
# result is current, and consider keeping only one of the two cells.
desired_store_value = 'WEB-1388012W'
result_row = stores_df.loc[stores_df['store_code'] == desired_store_value]

# End the cell with the frame itself so the notebook renders it as a table
# instead of the plain-text dump produced by print().
result_row

Empty DataFrame
Columns: [index, address, longitude, lat, locality, store_code, staff_numbers, opening_date, store_type, latitude, country_code, continent]
Index: []


In [4]:
# Create an instance of the DataCleaning class, passing the raw stores_df DataFrame as its argument
# NOTE(review): whether DataCleaning works on a copy or modifies stores_df in place
# is not visible from this notebook — confirm, since earlier cells display stores_df.
cleaner = DataCleaning(stores_df)

# Run the store-data cleaning method and keep its return value under a separate name,
# so the raw and cleaned frames can be told apart in later cells
clean_stores_df = cleaner.clean_store_data()

# Summarise the cleaned frame (row count, non-null counts, dtypes) for comparison
# with the summary of the raw frame printed earlier
clean_stores_df.info()

DataFrame cleaning operation initiated.
<class 'pandas.core.frame.DataFrame'>
Index: 441 entries, 0 to 450
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   address        440 non-null    object
 1   longitude      440 non-null    object
 2   lat            0 non-null      object
 3   locality       440 non-null    object
 4   store_code     441 non-null    object
 5   staff_numbers  441 non-null    object
 6   opening_date   441 non-null    object
 7   store_type     441 non-null    object
 8   latitude       440 non-null    object
 9   country_code   441 non-null    object
 10  continent      441 non-null    object
dtypes: object(11)
memory usage: 41.3+ KB


In [5]:
# Gather every distinct store code once and order them so they can be scanned by eye.
sorted_store_codes = sorted(clean_stores_df["store_code"].drop_duplicates())
print(sorted_store_codes)

# Preview the top of the cleaned frame.
clean_stores_df.head()

['AB-30E0C17F', 'AB-316DE4BA', 'AB-337AEBA7', 'AB-4ABD6D51', 'AB-917B715E', 'AB-D04AA29C', 'AB-D5ABF437', 'AB-D665986A', 'AB-FCFAB1A9', 'AL-34ABF27F', 'AL-91D3A03C', 'AL-9E1A5529', 'AL-EE8984F0', 'AL-FB5FAE9B', 'AR-0C5116E4', 'AR-2284FD32', 'AR-2D9D5CD3', 'AR-5E72668B', 'AR-ADA61B10', 'AR-CB0D95D8', 'AR-D3FDDD1D', 'AR-E8FDEB2B', 'AR-EE57502B', 'AR-F938FB8E', 'AS-373D10B2', 'AS-D26C759B', 'BA-25629F8E', 'BA-3A37248A', 'BA-41B2EB4B', 'BA-898BDED3', 'BA-8BA66AA0', 'BA-91C512D2', 'BA-99F54BD0', 'BA-B0933185', 'BA-B4AED588', 'BA-C0C7FDBE', 'BE-028A2748', 'BE-09F94471', 'BE-18074576', 'BE-2DE7E6FD', 'BE-6F48991F', 'BE-7C142381', 'BE-7D0EF647', 'BE-8C0CF738', 'BE-9A378A7F', 'BE-B069E157', 'BE-BC3E4F9A', 'BE-C5C7E600', 'BE-DAD46DCD', 'BE-F4596696', 'BE-F9462A80', 'BL-04FD6EF3', 'BL-190ABD7D', 'BL-8387506C', 'BL-9A86B74D', 'BL-D6CF153F', 'BO-17E7B6CE', 'BR-09D8DE69', 'BR-0DCD7EE0', 'BR-662EC74C', 'BR-B7C294A7', 'BR-BC499EDD', 'BR-CACF7508', 'BR-FB62A5BA', 'BU-0FB4CF87', 'BU-19BB50F6', 'BU-251A0

Unnamed: 0_level_0,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,GB,Europe
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe


In [6]:
# Distinct store codes, in order of first appearance in the cleaned frame.
pd.unique(clean_stores_df["store_code"])

array(['WEB-1388012W', 'HI-9B97EE4E', 'LA-0772C7B9', 'WE-1DE82CEE',
       'BE-18074576', 'GA-CAD01AC2', 'RU-C603E990', 'ST-229D997E',
       'KA-FA7ED3B8', 'HA-974352FE', 'RU-9F1136B4', 'SI-ECD52CD9',
       'DE-585399CF', 'CR-792AA8BB', 'HA-39A446E2', 'LA-9B0D9277',
       'NE-1D8B1D0C', 'LY-4C3D5D6C', 'CH-6A561423', 'BA-898BDED3',
       'EA-77ECA680', 'PO-38790FAE', 'BU-251A0E5A', 'EH-91356030',
       'AB-917B715E', 'SU-0B4C9A5F', 'IN-157E1191', 'GA-DA8EEA4A',
       'LA-2B59A825', 'WE-31C8B335', 'LA-F1042C48', 'EA-24B31935',
       'EA-7965E06D', 'PE-040B15C3', 'EH-DB8676C1', 'AL-91D3A03C',
       'HE-E39F4BC6', 'PO-47A01287', 'LA-D78C5F3F', 'ME-31958763',
       'AR-5E72668B', 'CO-CB3D8C89', 'CL-5C7C3198', 'BO-17E7B6CE',
       'CH-619E036C', 'BR-BC499EDD', 'NE-E50207AD', 'BE-8C0CF738',
       'NE-374D3983', 'SU-95D20AE9', 'ME-FB62E459', 'KA-653E783F',
       'MA-F0E23355', 'OS-70B2CD28', 'KI-A53AF10A', 'LE-63F3D33B',
       'VE-93DA8430', 'HI-BAD4DD1C', 'RU-1994A94D', 'LE-84C48

In [7]:
# Distinct staff_numbers values, to check what the column contains after cleaning.
pd.unique(clean_stores_df["staff_numbers"])

array(['325', '34', '92', '69', '35', '36', '31', '20', '32', '25', '138',
       '38', '39', '33', '109', '8', '119', '28', '29', '27', '78', '96',
       '135', '30', '90', '120', '50', '75', '118', '37', '67', '26',
       '40', '6', '22', '24', '83', '82', '117', '7', '21', '132', '131',
       '130', '70', '23', '51', '127', '4', '110', '5', '81', '85', '41',
       '57', '102', '103', '68', '99', '66', '124', '74', '87', '122',
       '112', '100', '71', '128', '76', '77', '137', '111', '58', '56',
       '72', '80', '101', '93', '73', '129', '60', '114', '52', '63',
       '113', '94', '134', '84', '59', '108', '97', '89', '53', '106',
       '48', '133', '107', '62', '86', '139', '98', '61'], dtype=object)

Upload the cleaned DataFrame to the database.

In [8]:
# Initialise the database engine that the cleaned DataFrame will be uploaded to

# Create an instance of the DatabaseConnector class
database = DatabaseConnector()
# Pass 'yaml_file_path' explicitly so the engine is built from the upload database credentials file
# NOTE(review): upload_engine is not passed to upload_to_db below; presumably the
# connector keeps the engine internally — confirm in database_utils.
upload_engine = database.init_db_engine(yaml_file_path='rds_upload_db_creds.yaml')

# Upload 'clean_stores_df' to the 'dim_store_details' table
database.upload_to_db(clean_stores_df, table_name='dim_store_details')

Called 'init_db_engine' method for database engine initialisation.
Read credentials from rds_upload_db_creds.yaml
Database credentials now stored in 'credentials' variable.
Database engine initialised successfully with the credentials in rds_upload_db_creds.yaml.
Data uploaded to the 'dim_store_details' table successfully.
