In [25]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import pickle

In [12]:
# This code is adapted from https://github.com/rNLKJA/Unimelb-Master-2023-COMP90024-Assignment-1/blob/main/main.py
# Changes from the original: pandas is used because a polars dataframe is not needed here,
# and the logger dependencies have been removed.
def split_location_into_ngrams(location):
    """
    Break a location name into its overlapping two-word phrases.

    location (str): location name, split on single space characters

    Returns a list with one "word word" string per adjacent pair of tokens
    when the name has three or more tokens, otherwise None.
    """
    tokens = location.split(" ")
    # A two-word name's only pair is the name itself and a one-word name has
    # no pairs at all, so both are reported as None.
    if len(tokens) <= 2:
        return None
    return [f"{first} {second}" for first, second in zip(tokens, tokens[1:])]

def sal_processor(path: Path) -> pd.DataFrame:
    """
    Load sal.json and build a table mapping location names to gcc codes.

    The JSON is read with orient="index", so every top-level key becomes one
    row and is exposed as the "location" column. The "ste" and "sal"
    attributes are dropped, then the location names are normalised:
    case 1: remove all brackets, i.e. "(" and ")"
    case 2: replace every " - " with a single space
    case 3: remove every full stop
    case 4: for names with three or more words, append one extra row per
            two-word ngram (see split_location_into_ngrams), carrying the
            gcc of the full name

    Rows are not filtered by gcc value and nothing is written to disk here.

    path (Path): path to the sal.json file

    Returns a DataFrame with the normalised rows followed by the ngram rows,
    re-indexed from 0.
    """
    # load sal.json file, expose the index as "location", drop unused attributes
    sal = (
        pd.read_json(path, orient="index")
        .reset_index()
        .rename(columns={"index": "location"})
        .drop(columns=["ste", "sal"])
    )

    # cases 1-3: vectorised clean-up of the location names.
    # Only the bracket pattern needs a regex; " - " and "." are matched
    # literally, which avoids the invalid "\." escape sequence.
    sal["location"] = (
        sal["location"]
        .str.replace(r"[()]", "", regex=True)
        .str.replace(" - ", " ", regex=False)
        .str.replace(".", "", regex=False)
    )

    # case 4: consider ngram like locations
    # names with fewer than three words map to None and are removed by dropna()
    ngrams = sal.copy()
    ngrams["location"] = ngrams["location"].apply(split_location_into_ngrams)
    ngrams = ngrams.dropna()[["location", "gcc"]].explode("location")

    # concat the final result: full names first, ngram rows after
    return pd.concat([sal, ngrams], ignore_index=True, axis=0)

In [13]:
# Relative path to the raw sal.json input file.
sal_json_path = Path("..") / "data" / "sal.json"

In [20]:
# Display the absolute directory that contains sal.json.
sal_json_path.resolve().parent

PosixPath('/mnt/g/Unimelb-Master-2023-COMP90024-Assignment-2/4_Python_data_processing/data')

In [23]:
# Build the cleaned location/gcc table (including the two-word ngram rows) from sal.json.
sal_df = sal_processor(sal_json_path)

In [24]:
# Persist the processed table in the same directory as sal.json, in two formats.
output_dir = sal_json_path.resolve().parent
sal_df.to_pickle(output_dir / 'sal.pickle')
# The ".json" file must contain JSON, not pickle bytes: write it with to_json
# as a list of row records. Read it back with pd.read_json(..., orient="records").
sal_df.to_json(output_dir / 'sal.processed.json', orient='records')

In [26]:
# Flatten the table into a plain {location: gcc} dictionary.
# NOTE(review): when a location string occurs more than once (e.g. an ngram row
# that equals a full name), the last occurrence wins -- confirm this precedence
# is intended.
sal_dict = {
    location: gcc
    for location, gcc in zip(sal_df["location"].to_list(), sal_df["gcc"].to_list())
}
sal_dict

{'abbotsbury': '1gsyd',
 'abbotsford nsw': '1gsyd',
 'acacia gardens': '1gsyd',
 'agnes banks': '1gsyd',
 'airds': '1gsyd',
 'alexandria': '1gsyd',
 'alfords point': '1gsyd',
 'alison central coast nsw': '1gsyd',
 'allambie heights': '1gsyd',
 'allawah': '1gsyd',
 'ambarvale': '1gsyd',
 'angus': '1gsyd',
 'annandale nsw': '1gsyd',
 'annangrove': '1gsyd',
 'appin nsw': '1gsyd',
 'arcadia nsw': '1gsyd',
 'arncliffe': '1gsyd',
 'arndell park': '1gsyd',
 'artarmon': '1gsyd',
 'ashbury': '1gsyd',
 'ashcroft': '1gsyd',
 'ashfield nsw': '1gsyd',
 'asquith': '1gsyd',
 'auburn nsw': '1gsyd',
 'austral': '1gsyd',
 'avalon beach': '1gsyd',
 'avoca beach': '1gsyd',
 'badgerys creek': '1gsyd',
 'balgowlah': '1gsyd',
 'balgowlah heights': '1gsyd',
 'balmain': '1gsyd',
 'balmain east': '1gsyd',
 'bangor nsw': '1gsyd',
 'banksia': '1gsyd',
 'banksmeadow': '1gsyd',
 'bankstown': '1gsyd',
 'bankstown aerodrome': '1gsyd',
 'bar point': '1gsyd',
 'barangaroo': '1gsyd',
 'barden ridge': '1gsyd',
 'bardia':

In [27]:
# Serialise the location/gcc dictionary to a pickle file in the same directory as sal.json.
with (sal_json_path.resolve().parent / 'sal.processed.dict.pkl').open('wb') as f:
    pickle.dump(sal_dict, f)