In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [12]:
# This code is adapted from https://github.com/rNLKJA/Unimelb-Master-2023-COMP90024-Assignment-1/blob/main/main.py
# Changes from the original: a pandas DataFrame is used because polars is not needed here,
# and the logger dependency has been removed.
def split_location_into_ngrams(location):
    """Return the adjacent word pairs (bigrams) of a space-separated location.

    The string is split on single space characters. A location with more
    than two words yields a list of "word_i word_i+1" strings; a location
    with one or two words yields None.
    """
    tokens = location.split(" ")
    if len(tokens) <= 2:
        return None

    bigrams = []
    for idx in range(len(tokens) - 1):
        bigrams.append(f"{tokens[idx]} {tokens[idx + 1]}")
    return bigrams

def sal_processor(path: Path) -> pd.DataFrame:
    """
    Load a sal.json file and turn it into a table of locations.

    The JSON is read with orient="index", so every top-level key becomes a
    value in the "location" column. The "ste" and "sal" columns are dropped
    and the location strings are then normalised:

    case 1: remove all round brackets
    case 2: replace " - " with a single space
    case 3: remove all full stops
    case 4: for every location with more than two words, append one extra
            row per adjacent word pair (bigram) with the same "gcc" value

    Nothing is written to disk here; saving the result is up to the caller.

    path (Path): path to the sal.json file

    Returns:
        pd.DataFrame: the normalised rows followed by the bigram rows,
        re-indexed from 0.
    """
    # load sal.json, expose the index as a "location" column and drop the
    # attributes that are not needed
    sal_df = (
        pd.read_json(path, orient="index")
        .reset_index()
        .rename(columns={"index": "location"})
        .drop(columns=["ste", "sal"])
    )

    # cases 1-3: vectorised string clean-up (no per-row Python callback).
    # Only the bracket removal needs a regex; the other two are literal
    # replacements, which also avoids escaping the full stop.
    sal_df["location"] = (
        sal_df["location"]
        .str.replace(r"[()]", "", regex=True)
        .str.replace(" - ", " ", regex=False)
        .str.replace(".", "", regex=False)
    )

    # case 4: build the bigram rows. Locations that yield no bigrams come
    # back as None and are removed by dropna() before exploding the lists.
    ngram_df = sal_df.copy()
    ngram_df["location"] = ngram_df["location"].apply(split_location_into_ngrams)
    ngram_df = ngram_df.dropna()[["location", "gcc"]].explode("location")

    # original rows first, then the bigram rows, with a fresh index
    return pd.concat([sal_df, ngram_df], ignore_index=True, axis=0)

In [13]:
# Location of the raw sal.json input; a relative path, so it is resolved
# against the current working directory.
sal_json_path = Path("../data/sal.json")

In [20]:
# Display the absolute path of the directory that contains sal.json.
sal_json_path.resolve().parent

PosixPath('/mnt/g/Unimelb-Master-2023-COMP90024-Assignment-2/4_Python_data_processing/data')

In [21]:
# Process sal.json once and reuse the resulting frame for both output files
# instead of re-reading and re-processing the input for each write.
sal_processed = sal_processor(sal_json_path)
output_dir = sal_json_path.resolve().parent

sal_processed.to_pickle(output_dir / 'sal.pickle')
# NOTE(review): this file is written in pickle format despite its ".json"
# name, so it has to be read back with pd.read_pickle. Confirm whether
# DataFrame.to_json was intended before changing the format.
sal_processed.to_pickle(output_dir / 'sal.processed.json')