In [1]:
import yaml
import json
import tabula
import requests
import calendar
import numpy as np
import re as regex
import pandas as pd
from datetime import datetime
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

In [2]:
# Load the raw stores data through the project's DataExtractor helper.
# NOTE(review): presumably list_number_of_stores() returns a store count and
# retrieve_stores_data() fetches that many records — confirm in data_extraction.py.
number_of_stores = DataExtractor.list_number_of_stores()
# The result is used as a pandas DataFrame by every following cell.
stores_df = DataExtractor.retrieve_stores_data(number_of_stores)


In [3]:
# First look at the raw frame: column names, non-null counts and dtypes.
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 451 entries, 0 to 450
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   address        451 non-null    object
 1   longitude      451 non-null    object
 2   lat            11 non-null     object
 3   locality       451 non-null    object
 4   store_code     451 non-null    object
 5   staff_numbers  451 non-null    object
 6   opening_date   451 non-null    object
 7   store_type     451 non-null    object
 8   latitude       450 non-null    object
 9   country_code   450 non-null    object
 10  continent      450 non-null    object
dtypes: object(11)
memory usage: 42.3+ KB


In [4]:
# Count missing values per column to spot sparsely populated columns.
stores_df.isna().sum()

address            0
longitude          0
lat              440
locality           0
store_code         0
staff_numbers      0
opening_date       0
store_type         0
latitude           1
country_code       1
continent          1
dtype: int64

In [5]:
# Preview the first ten raw rows.
stores_df.head(10)

Unnamed: 0_level_0,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,,
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe
5,Flat 92u\nChristian harbors\nPort Charlotte\nN...,53.38333,,Gainsborough,GA-CAD01AC2,36,1995-05-15,Local,-0.76667,GB,Europe
6,"7 Gillian rue\nWest Robertside\nPH4 8NY, Ruthe...",55.82885,,Rutherglen,RU-C603E990,92,2001-01-04,Super Store,-4.21376,GB,Europe
7,"Lilija-Heß-Allee 660\n34566 Regensburg, Stuttgart",48.78232,,Stuttgart,ST-229D997E,34,2000-06-01,Local,9.17702,DE,Europe
8,"510 Jill Mill\nSouth Laura, FL 38723, Kaukauna",44.27804,,Kaukauna,KA-FA7ED3B8,31,2022-09-05,Local,-88.27205,US,America
9,"3 Lee valleys\nWest Janetview\nDY4M 2RL, Hartley",51.38673,,Hartley,HA-974352FE,20,2004-09-11,Local,0.30367,GB,Europe


In [6]:
# Convert opening_date to datetime64; errors='coerce' turns every value that
# cannot be parsed as a date into NaT instead of raising.
parsed_opening_dates = pd.to_datetime(stores_df['opening_date'], errors='coerce')
stores_df['opening_date'] = parsed_opening_dates

In [7]:
# Keep only the rows whose opening_date is present (i.e. not NaT/NaN).
stores_df = stores_df.dropna(subset=['opening_date'])

In [8]:
# Re-check row count and dtypes after removing rows without an opening_date.
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 441 entries, 0 to 450
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        441 non-null    object        
 1   longitude      441 non-null    object        
 2   lat            1 non-null      object        
 3   locality       441 non-null    object        
 4   store_code     441 non-null    object        
 5   staff_numbers  441 non-null    object        
 6   opening_date   441 non-null    datetime64[ns]
 7   store_type     441 non-null    object        
 8   latitude       440 non-null    object        
 9   country_code   440 non-null    object        
 10  continent      440 non-null    object        
dtypes: datetime64[ns](1), object(10)
memory usage: 41.3+ KB


In [9]:
# Preview the first ten rows of the filtered frame.
stores_df.head(10)

Unnamed: 0_level_0,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,,
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe
5,Flat 92u\nChristian harbors\nPort Charlotte\nN...,53.38333,,Gainsborough,GA-CAD01AC2,36,1995-05-15,Local,-0.76667,GB,Europe
6,"7 Gillian rue\nWest Robertside\nPH4 8NY, Ruthe...",55.82885,,Rutherglen,RU-C603E990,92,2001-01-04,Super Store,-4.21376,GB,Europe
7,"Lilija-Heß-Allee 660\n34566 Regensburg, Stuttgart",48.78232,,Stuttgart,ST-229D997E,34,2000-06-01,Local,9.17702,DE,Europe
8,"510 Jill Mill\nSouth Laura, FL 38723, Kaukauna",44.27804,,Kaukauna,KA-FA7ED3B8,31,2022-09-05,Local,-88.27205,US,America
9,"3 Lee valleys\nWest Janetview\nDY4M 2RL, Hartley",51.38673,,Hartley,HA-974352FE,20,2004-09-11,Local,0.30367,GB,Europe


In [10]:
# Rebuild a contiguous 0..n-1 index; the old index labels are discarded
# (drop=True) rather than kept as a column.
stores_df = stores_df.reset_index(drop=True)
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        441 non-null    object        
 1   longitude      441 non-null    object        
 2   lat            1 non-null      object        
 3   locality       441 non-null    object        
 4   store_code     441 non-null    object        
 5   staff_numbers  441 non-null    object        
 6   opening_date   441 non-null    datetime64[ns]
 7   store_type     441 non-null    object        
 8   latitude       440 non-null    object        
 9   country_code   440 non-null    object        
 10  continent      440 non-null    object        
dtypes: datetime64[ns](1), object(10)
memory usage: 38.0+ KB


In [11]:
# Remove the 'lat' column, which is almost entirely null at this point
# (the separate 'latitude' column is kept).
stores_df = stores_df.drop(columns=['lat'])
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        441 non-null    object        
 1   longitude      441 non-null    object        
 2   locality       441 non-null    object        
 3   store_code     441 non-null    object        
 4   staff_numbers  441 non-null    object        
 5   opening_date   441 non-null    datetime64[ns]
 6   store_type     441 non-null    object        
 7   latitude       440 non-null    object        
 8   country_code   440 non-null    object        
 9   continent      440 non-null    object        
dtypes: datetime64[ns](1), object(9)
memory usage: 34.6+ KB


In [79]:
# Turn the textual missing-value placeholders into real missing values.
# np.nan is passed explicitly because `value=None` is ambiguous for
# DataFrame.replace: older pandas releases treat it as "no value given" and
# pad (forward-fill) the matched cells from the previous row instead of
# blanking them. np.nan behaves the same on every pandas version.
stores_df = stores_df.replace(['NULL', 'N/A'], np.nan)


In [110]:
# Snapshot of the frame taken before staff_numbers is cleaned, so the raw
# values remain available for inspection.
copy_stores_df = stores_df.copy()

# Strip letters, underscores and every other non-word character from
# staff_numbers. The pattern is a raw string so that `\W` reaches the regex
# engine unchanged and Python does not warn about an invalid escape sequence.
stores_df['staff_numbers'] = stores_df['staff_numbers'].str.replace(r'[A-Z_a-z\W]', '', regex=True)

# Sanity check: flag any value that still contains an ASCII letter. For a
# "contains" search this is equivalent to the longer \d*[A-Za-z]+\d* form.
# na=False keeps the mask strictly boolean so `~mask` is safe even if a value
# is missing.
mask = stores_df['staff_numbers'].str.contains(r'[A-Za-z]', regex=True, na=False)
# After the replacement above no letters remain, so this shows every row.
stores_df[~mask]

Unnamed: 0,address,longitude,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
0,,,,WEB-1388012W,325,2010-06-12,Web Portal,,,
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26000,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.18750,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.02330,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe
...,...,...,...,...,...,...,...,...,...,...
436,"Flat 7\nStephanie lake\nMorrisside\nHP8 8LH, C...",50.76306,Cowes,CO-473A9FBB,94,2008-06-08,Super Store,-1.29772,GB,Europe
437,"Täschestraße 25\n39039 Nördlingen, Kirchlengern",52.20000,Kirchlengern,KI-78096E8C,61,2005-05-12,Super Store,8.63333,DE,Europe
438,"Studio 8\nMoss mall\nWest Linda\nM0E 6XR, High...",51.62907,High Wycombe,HI-EEA7AE62,33,1998-05-14,Local,-0.74934,GB,Europe
439,"Baumplatz 6\n80114 Kötzting, Bretten",49.03685,Bretten,BR-662EC74C,35,2020-10-17,Local,8.70745,DE,Europe


In [101]:
# Audit: list the rows whose *raw* staff_numbers contained stray letters.
# The check runs against copy_stores_df (the snapshot taken before cleaning):
# when the notebook is executed top to bottom, stores_df['staff_numbers'] has
# already been stripped of letters, so testing it here would always produce an
# empty table. Raw-string pattern avoids an invalid-escape warning; na=False
# keeps the mask boolean when a value is missing.
mask = copy_stores_df['staff_numbers'].str.contains(r'[A-Za-z]', regex=True, na=False)
copy_stores_df[mask]

Unnamed: 0,address,longitude,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
31,"Flat 69\nSuzanne walk\nEast Michelle\nE80 8HS,...",52.68333,East Dereham,EA-24B31935,J78,2012-11-09,Outlet,0.93333,GB,Europe
177,"Girschnerweg 163\n93597 Angermünde, Dahlem",52.4581,Dahlem,DA-ACC520AE,30e,1994-03-07,Local,13.28702,DE,Europe
244,"7430 Howe Extensions Suite 299\nKellyside, WA ...",39.71734,Sicklerville,SI-C489938D,80R,1994-02-28,Outlet,-74.96933,US,America
336,Studio 8\nLydia groves\nNorth Hilarymouth\nIV4...,50.79205,Southsea,SO-B5B9CB3B,A97,2018-05-08,Super Store,-1.08593,GB,Europe
370,"Salzstraße 1/9\n74209 Bad Freienwalde, Charlot...",52.53048,Charlottenburg-Nord,CH-99475026,3n9,1995-03-05,Local,13.29371,DE,Europe


In [111]:
# Cast the cleaned staff_numbers strings to a numeric dtype.
stores_df['staff_numbers'] = stores_df['staff_numbers'].pipe(pd.to_numeric)

In [85]:
# Cast both coordinate columns from object to numeric, longitude first.
for coordinate_column in ('longitude', 'latitude'):
    stores_df[coordinate_column] = pd.to_numeric(stores_df[coordinate_column])


In [114]:
# Store store_code with pandas' dedicated string dtype instead of object.
store_codes = stores_df['store_code']
stores_df['store_code'] = store_codes.astype(pd.StringDtype())
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        440 non-null    object        
 1   longitude      440 non-null    float64       
 2   locality       440 non-null    object        
 3   store_code     441 non-null    string        
 4   staff_numbers  441 non-null    int64         
 5   opening_date   441 non-null    datetime64[ns]
 6   store_type     441 non-null    object        
 7   latitude       440 non-null    float64       
 8   country_code   440 non-null    object        
 9   continent      440 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5), string(1)
memory usage: 34.6+ KB


In [119]:
# Frequency of each locality, to judge whether the column is low-cardinality.
stores_df['locality'].value_counts()

Chapletown       14
Belper           13
Bushey           12
Exeter           11
Arbroath         10
                 ..
Sicklerville      1
Brierley Hill     1
Searcy            1
Port Richmond     1
Westchester       1
Name: locality, Length: 116, dtype: int64

In [120]:
# Convert the repeated-label text columns to the category dtype, one by one.
for categorical_column in ('locality', 'store_type', 'country_code', 'continent'):
    stores_df[categorical_column] = stores_df[categorical_column].astype('category')

In [121]:
# Confirm the new category dtypes and the resulting memory usage.
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        440 non-null    object        
 1   longitude      440 non-null    float64       
 2   locality       440 non-null    category      
 3   store_code     441 non-null    string        
 4   staff_numbers  441 non-null    int64         
 5   opening_date   441 non-null    datetime64[ns]
 6   store_type     441 non-null    category      
 7   latitude       440 non-null    float64       
 8   country_code   440 non-null    category      
 9   continent      440 non-null    category      
dtypes: category(4), datetime64[ns](1), float64(2), int64(1), object(1), string(1)
memory usage: 28.0+ KB


In [122]:
# Check the distinct store_type labels and how often each occurs.
stores_df['store_type'].value_counts()

Local          255
Super Store     89
Mall Kiosk      51
Outlet          45
Web Portal       1
Name: store_type, dtype: int64

In [123]:
# Check the distinct country_code labels and how often each occurs.
stores_df['country_code'].value_counts()

GB    265
DE    141
US     34
Name: country_code, dtype: int64

In [124]:
# Check the distinct continent labels; any unexpected spellings show up here.
stores_df['continent'].value_counts()

Europe       382
America       32
eeEurope      24
eeAmerica      2
Name: continent, dtype: int64

In [128]:
# Repair the continent labels that carry a stray "ee" prefix.
# A single whole-value mapping replaces the previous mix of .str.replace
# (substring match) and Series.replace (whole-value match). The column is
# cast to plain object first and back to category afterwards: string
# operations on a categorical column return a non-categorical result, which
# would otherwise silently undo the earlier category conversion.
continent_fixes = {'eeEurope': 'Europe', 'eeAmerica': 'America'}
stores_df['continent'] = (
    stores_df['continent']
    .astype('object')
    .replace(continent_fixes)
    .astype('category')
)
stores_df['continent'].value_counts()

Europe     406
America     34
Name: continent, dtype: int64