In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import pickle

In [27]:
# Input file for this notebook; the path is relative, so it resolves against
# the current working directory.
sal_json_path = Path('..') / 'data' / 'sal.json'

In [50]:
# Adapted from https://github.com/rNLKJA/Unimelb-Master-2023-COMP90024-Assignment-1/blob/main/main.py
# Changes from the original:
#   - uses pandas, since a polars DataFrame is not needed here
#   - removes the logger dependencies
def split_location_into_ngrams(location, n=2):
    """
    Split a space-separated location name into overlapping word n-grams.

    Only names with more than ``n`` words are split. A name with ``n`` words
    or fewer returns None (its only n-gram would be the name itself), which
    lets callers discard such rows with ``dropna()``.

    location (str): location name whose words are separated by single spaces
    n (int): number of words per n-gram, defaults to 2 (bigrams)

    returns (list[str] | None): the n-grams in left-to-right order, or None
    raises ValueError: if n is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    words = location.split(" ")
    if len(words) <= n:
        return None

    # slide a window of n words across the name, one word at a time
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

def sal_processor(path: Path) -> pd.DataFrame:
    """
    Load the sal.json file and build a location lookup table.

    The "ste" attribute is dropped, then every location name is cleaned:
    case 1: remove all brackets
    case 2: replace " - " with a single space
    case 3: remove all literal dots
    The cleaned name is also kept in the "real_loc" column.
    case 4: for every name that split_location_into_ngrams can split, one
            extra row per n-gram is appended; its "location" is the n-gram
            and its "real_loc" is the cleaned full name it came from.

    Nothing is written to disk; the caller decides how to persist the result.

    path (Path): path to the sal.json file

    returns (pd.DataFrame): the cleaned rows followed by the n-gram rows,
        with a fresh 0..n-1 index
    """
    # load sal.json file, turn the index (location names) into a column
    df = (
        pd.read_json(path, orient="index")
        .reset_index()
        .rename(columns={"index": "location"})
        .drop(columns=["ste"])
    )

    # cases 1-3: vectorised string clean-up of the location names
    df["location"] = (
        df["location"]
        .str.replace(r"[()]", "", regex=True)   # case 1: brackets
        .str.replace(" - ", " ", regex=False)   # case 2: " - " -> " "
        .str.replace(".", "", regex=False)      # case 3: literal dots
    )
    df["real_loc"] = df["location"]

    # case 4: consider ngram like locations
    # names that cannot be split come back as None and are removed by dropna()
    ngram_df = df.assign(location=df["location"].apply(split_location_into_ngrams))
    ngram_df = ngram_df.dropna()[["location", "gcc", "sal", "real_loc"]].explode("location")

    # concat the final result: full names first, then their n-gram aliases
    return pd.concat([df, ngram_df], ignore_index=True, axis=0)

In [51]:
# Build the processed table from the raw sal.json file.
sal_df = sal_processor(path=sal_json_path)
# Preview only: the NaN-free view is displayed but not assigned, so sal_df is unchanged.
sal_df.dropna(axis=0, how="any")

Unnamed: 0,location,gcc,sal,real_loc
0,abbotsbury,1gsyd,10002,abbotsbury
1,abbotsford nsw,1gsyd,10003,abbotsford nsw
2,acacia gardens,1gsyd,10014,acacia gardens
3,agnes banks,1gsyd,10021,agnes banks
4,airds,1gsyd,10022,airds
...,...,...,...,...
17647,creek act,8acte,80013,rendezvous creek act
17648,weston creek,8acte,80016,weston creek act
17649,creek act,8acte,80016,weston creek act
17650,red hill,8acte,80112,red hill act


In [52]:
# Persist the processed table next to the source sal.json file.
# NOTE(review): 'sal.processed.json' is written with to_pickle, so despite its
# extension the file holds pickle data, not JSON -- confirm what readers expect.
output_dir = sal_json_path.resolve().parent
for file_name in ('sal.pickle', 'sal.processed.json'):
    sal_df.to_pickle(output_dir / file_name)

In [54]:
# Map every location string (full name or n-gram alias) to its "sal" value.
# Location strings that occur more than once collapse into one key, and the
# value from the last occurrence in sal_df is the one that is kept.
sal_dict = {
    name: code
    for name, code in zip(sal_df["location"].to_list(), sal_df["sal"].to_list())
}
sal_dict

{'abbotsbury': 10002,
 'abbotsford nsw': 10003,
 'acacia gardens': 10014,
 'agnes banks': 10021,
 'airds': 10022,
 'alexandria': 10030,
 'alfords point': 10031,
 'alison central coast nsw': 10034,
 'allambie heights': 10036,
 'allawah': 10038,
 'ambarvale': 10049,
 'angus': 10060,
 'annandale nsw': 10062,
 'annangrove': 10063,
 'appin nsw': 10064,
 'arcadia nsw': 10073,
 'arncliffe': 10087,
 'arndell park': 10088,
 'artarmon': 10091,
 'ashbury': 10094,
 'ashcroft': 10098,
 'ashfield nsw': 10099,
 'asquith': 10104,
 'auburn nsw': 10107,
 'austral': 10110,
 'avalon beach': 10112,
 'avoca beach': 10115,
 'badgerys creek': 10133,
 'balgowlah': 10154,
 'balgowlah heights': 10155,
 'balmain': 10165,
 'balmain east': 10166,
 'bangor nsw': 10178,
 'banksia': 10179,
 'banksmeadow': 10180,
 'bankstown': 10181,
 'bankstown aerodrome': 10182,
 'bar point': 10189,
 'barangaroo': 10192,
 'barden ridge': 10194,
 'bardia': 10195,
 'bardwell park': 10196,
 'bardwell valley': 10197,
 'bargo': 10199,
 'b

In [55]:
# Save the location -> sal dictionary as a pickle next to the source sal.json file.
dict_pickle_path = sal_json_path.resolve().parent / 'sal.processed.dict.pkl'
with dict_pickle_path.open('wb') as f:
    pickle.dump(sal_dict, f)