# Data Transformation

## Setup

In [1]:
# Setting up execution path
import os

print(f"Current working directory: {os.path.basename(os.getcwd())}")

# Change to root directory.
# The project root is recognised by its `src` package (imported by later cells).
# Stepping up only when that package is absent keeps this cell idempotent:
# re-running it no longer climbs one directory higher on every execution.
if not os.path.isdir("src"):
    os.chdir("../")
print(f"Current working directory (Changed): {os.path.basename(os.getcwd())}")

Current working directory: notebooks
Current working directory (Changed): Analyzing-Cleanway-Services


In [2]:
from rich import print

In [3]:
from src.constants import CONFIGS
from src.utils.basic_utils import read_yaml

In [4]:
# Parse the YAML file referenced by CONFIGS and keep only its
# `data_preprocessor` section; later cells read paths from `configs`.
configs = read_yaml(CONFIGS).data_preprocessor
# Show the selected section as a plain dict (print is rich's print here)
print(configs.to_dict())

[2024-03-11 08:19:57 PM]:ProjectLogger INFO:basic_utils 40 - yaml file: conf\configs.yaml loaded successfully


In [5]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# module setup
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Display floats with 5 digits after the decimal point
pd.options.display.precision = 5
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [6]:
# Resolve where the scraped CSV lives (taken from the preprocessor configs)
scraped_data_path = configs.scraped_data_path

# Load the raw scrape and keep it untouched as the reference frame
df_main = pd.read_csv(filepath_or_buffer=scraped_data_path, index_col=False)

# Work on an independent copy so df_main stays pristine
# (DataFrame.copy() is deep by default)
df = df_main.copy()

# Preview the first five rows
df.head(n=5)

Unnamed: 0,service_name,address,latitude,longitude,services_offered,details_url,scrape_ts
0,Bromelton Energy & Resource Centre,"Lot 1 Beaudesert-Boonah Road, Bromelton QLD",-27.975059190727382,152.92509,Miscellaneous,https://www.cleanaway.com.au/location/bromelto...,2024-03-10 23:51:34
1,Albany Transfer Station & MRF,"2-16 Cuming Road & 37 Maxwell Street, Albany, ...",-35.01078332713777,117.86332,"Solid waste services, Office",https://www.cleanaway.com.au/location/albany/,2024-03-10 23:51:38
2,Albury-Wodonga Solid Waste Services,"26 Reiff St, Lavington, NSW, 2641",-36.037913,146.96458,Solid waste services,https://www.cleanaway.com.au/location/lavington/,2024-03-10 23:51:44
3,Alexandra Solid Waste Services,"4-8 Station Street, Alexandra, VIC, 3714",-37.183192,145.71203,Solid waste services,https://www.cleanaway.com.au/location/alexandra/,2024-03-10 23:51:52
4,Cleanaway Alice Springs Solid Waste Depot,"6 Wilkinson Street, Ciccone, NT, 0870",-23.700021,133.87134,Solid waste services,https://www.cleanaway.com.au/location/alice-sp...,2024-03-10 23:52:00


In [7]:
# Summarise column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   service_name      215 non-null    object 
 1   address           215 non-null    object 
 2   latitude          215 non-null    object 
 3   longitude         214 non-null    float64
 4   services_offered  214 non-null    object 
 5   details_url       215 non-null    object 
 6   scrape_ts         215 non-null    object 
dtypes: float64(1), object(6)
memory usage: 11.9+ KB


In [8]:
# List the rows whose longitude is missing
df[df["longitude"].isna()]

Unnamed: 0,service_name,address,latitude,longitude,services_offered,details_url,scrape_ts
135,Cleanaway Narangba Vacuum Truck Liquid Waste S...,"109 Potassium Street, Narangba, QLD, 4504",-27.206933789931885,,,https://www.cleanaway.com.au/location/narangba/,2024-03-11 00:04:18


In [9]:
import httpx

In [10]:
def get_location_name(address):
    """Forward-geocode a free-form address with the Nominatim search API.

    Args:
        address: Address string to look up.

    Returns:
        dict: ``{"lat": float, "long": float}`` taken from the first match.
        Both values are ``None`` when the response status is not 200 or the
        service returns no match, so callers can always index the result by
        ``"lat"`` / ``"long"``.
    """
    url = "https://nominatim.openstreetmap.org/search"
    # Let httpx build the query string: characters such as "&" or "#" inside
    # the address are percent-encoded instead of cutting the `q` value short.
    response = httpx.get(url, params={"q": address, "format": "json"})
    if response.status_code != 200:
        return {"lat": None, "long": None}

    data = response.json()
    # An unknown address yields an empty result list rather than an error.
    if not data:
        return {"lat": None, "long": None}

    best_match = data[0]
    return {"lat": float(best_match["lat"]), "long": float(best_match["lon"])}

In [11]:
# Try the geocoder on a single sample address before applying it to the frame
get_location_name("109 Potassium Street, Narangba, QLD, 4504")

[2024-03-11 08:20:01 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/?q=109%20Potassium%20Street,%20Narangba,%20QLD,%204504&format=json "HTTP/1.1 200 OK"


{'lat': -27.2005386, 'long': 152.9885253}

In [12]:
def apply_function(row):
    """Return a ``(latitude, longitude)`` pair for one row, geocoding gaps.

    Rows that already carry both coordinates are passed through unchanged.
    When either coordinate is null, the row's ``address`` is geocoded with
    ``get_location_name`` and both coordinates are taken from the lookup.

    Args:
        row: Mapping/Series with ``"address"``, ``"latitude"`` and
            ``"longitude"`` entries.

    Returns:
        tuple: ``(latitude, longitude)``. If the lookup does not produce a
        usable pair, the row's original values are returned instead of
        raising, so one failed request cannot abort a whole ``DataFrame.apply``.
    """
    original = (row["latitude"], row["longitude"])
    if not (pd.isnull(original[0]) or pd.isnull(original[1])):
        return original

    coordinates = get_location_name(row["address"])
    # The geocoder may signal failure with a non-dict value (an error string)
    # or with empty coordinates; neither may overwrite the existing data.
    if isinstance(coordinates, dict):
        latitude, longitude = coordinates.get("lat"), coordinates.get("long")
        if not (pd.isnull(latitude) or pd.isnull(longitude)):
            return latitude, longitude
    return original

In [13]:
# Run apply_function on every row (axis=1), then use zip(*...) to turn the
# per-row (latitude, longitude) pairs into two column-wise tuples that
# overwrite the existing coordinate columns in place.
df["latitude"], df["longitude"] = zip(*df.apply(apply_function, axis=1))

[2024-03-11 08:20:02 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/?q=109%20Potassium%20Street,%20Narangba,%20QLD,%204504&format=json "HTTP/1.1 200 OK"


In [14]:
# List the rows that have no services_offered value
df[df["services_offered"].isna()]

Unnamed: 0,service_name,address,latitude,longitude,services_offered,details_url,scrape_ts
135,Cleanaway Narangba Vacuum Truck Liquid Waste S...,"109 Potassium Street, Narangba, QLD, 4504",-27.20054,152.98853,,https://www.cleanaway.com.au/location/narangba/,2024-03-11 00:04:18


In [15]:
# Re-check for missing longitudes after the geocoding back-fill
df[df["longitude"].isna()]

Unnamed: 0,service_name,address,latitude,longitude,services_offered,details_url,scrape_ts


In [18]:
# Look at the raw address strings
df["address"]

0            Lot 1 Beaudesert-Boonah Road, Bromelton QLD
1      2-16 Cuming Road & 37 Maxwell Street, Albany, ...
2                      26 Reiff St, Lavington, NSW, 2641
3               4-8 Station Street, Alexandra, VIC, 3714
4                  6 Wilkinson Street, Ciccone, NT, 0870
                             ...                        
210                1 Amsterdam Circuit, Wyong, NSW, 2259
211                      29 Binary St, Yatala, QLD, 4207
212                20 Chillingworks Rd, Young, NSW, 2594
213                510 Summerhill Road, Wollert VIC 3750
214        Old Power Station Site, Wallerawang, NSW 2845
Name: address, Length: 215, dtype: object

In [27]:
def get_location_name(latitude, longitude):
    """Reverse-geocode a coordinate pair into ``(state, postcode)``.

    NOTE: this re-uses the name of the address-based geocoder defined earlier
    in the notebook and shadows it from this cell onwards. Re-running the
    earlier coordinate back-fill after this cell will therefore call this
    two-argument version.

    Args:
        latitude: Latitude of the point to look up.
        longitude: Longitude of the point to look up.

    Returns:
        tuple: ``(state, postcode)`` from the response's ``"address"`` object;
        either element is ``None`` when the service does not report it.
        ``(None, None)`` is returned when the response status is not 200 or
        the payload carries no ``"address"`` object, so the result can always
        be unpacked into exactly two columns.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    query = {"lat": latitude, "lon": longitude, "format": "json"}
    response = httpx.get(url, params=query)
    if response.status_code != 200:
        return (None, None)

    # A lookup that cannot be resolved may come back without an "address" key.
    data_address = response.json().get("address") or {}
    return (data_address.get("state"), data_address.get("postcode"))

In [28]:
# Reverse-geocode every row: one get_location_name call (one HTTP request)
# per row, each returning a (state, postcode) pair; zip(*...) transposes the
# pairs into the two new columns.
# NOTE(review): there is no throttling or caching here — confirm this stays
# within the geocoding service's usage limits before re-running.
df["state"], df["postcode"] = zip(
    *df.apply(lambda row: get_location_name(row["latitude"], row["longitude"]), axis=1)
)

[2024-03-11 08:49:54 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/reverse?lat=-27.975059190727386&lon=152.92509487274305&format=json "HTTP/1.1 200 OK"
[2024-03-11 08:49:55 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/reverse?lat=-35.01078332713777&lon=117.8633163813148&format=json "HTTP/1.1 200 OK"
[2024-03-11 08:49:56 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/reverse?lat=-36.037913&lon=146.964581&format=json "HTTP/1.1 200 OK"
[2024-03-11 08:49:57 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/reverse?lat=-37.183192&lon=145.712031&format=json "HTTP/1.1 200 OK"
[2024-03-11 08:49:57 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.org/reverse?lat=-23.700021&lon=133.871345&format=json "HTTP/1.1 200 OK"
[2024-03-11 08:49:58 PM]:httpx INFO:_client 1026 - HTTP Request: GET https://nominatim.openstreetmap.or

In [30]:
# Re-check dtypes and non-null counts now that state and postcode were added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   service_name      215 non-null    object 
 1   address           215 non-null    object 
 2   latitude          215 non-null    object 
 3   longitude         215 non-null    float64
 4   services_offered  214 non-null    object 
 5   details_url       215 non-null    object 
 6   scrape_ts         215 non-null    object 
 7   state             209 non-null    object 
 8   postcode          194 non-null    object 
dtypes: float64(1), object(8)
memory usage: 15.2+ KB
