# `Sal.json` Processing 

In [13]:
import pandas as pd
import re

Previwe sal.json file

In [70]:
df = pd.read_json("../data/sal.json", orient="index")
df.reset_index(names="locat", inplace=True)
df.tail()

Unnamed: 0,locat,ste,gcc,sal
15335,christmas island,9,9oter,90001
15336,home island,9,9oter,90002
15337,jervis bay,9,9oter,90003
15338,norfolk island,9,9oter,90004
15339,west island,9,9oter,90005


In [71]:
# convert data type for each feature
df.ste = df.ste.astype("int8")
df.sal = df.ste.astype("int16")
df.dtypes

locat    object
ste        int8
gcc      object
sal       int16
dtype: object

In [72]:
# drop any rural sal value, this won't be use in the future
df = df[~df.gcc.str.contains(r"\dr[a-z]{3}")]
df.tail()

Unnamed: 0,locat,ste,gcc,sal
15335,christmas island,9,9oter,9
15336,home island,9,9oter,9
15337,jervis bay,9,9oter,9
15338,norfolk island,9,9oter,9
15339,west island,9,9oter,9


In [73]:
df.shape

(3394, 4)

In [74]:
# Count number of index with state identifier, remove brackets around state abbreviations.
df[df.locat.str.contains("[()]")]

Unnamed: 0,locat,ste,gcc,sal
1,abbotsford (nsw),1,1gsyd,1
7,alison (central coast - nsw),1,1gsyd,1
12,annandale (nsw),1,1gsyd,1
14,appin (nsw),1,1gsyd,1
15,arcadia (nsw),1,1gsyd,1
...,...,...,...,...
15311,reid (act),8,8acte,8
15316,spence (act),8,8acte,8
15317,stirling (act),8,8acte,8
15322,theodore (act),8,8acte,8


In [75]:
# replace the () with an empty string
df.locat = df.agg(lambda x: re.sub(r"[()]", "", x.locat), axis=1)
df[df.locat.str.contains("[()]")]

Unnamed: 0,locat,ste,gcc,sal


In [76]:
# count number of index contains ' - '
df[df.locat.str.contains(" - ")]

Unnamed: 0,locat,ste,gcc,sal
7,alison central coast - nsw,1,1gsyd,1
198,colo hawkesbury - nsw,1,1gsyd,1
230,darlington sydney - nsw,1,1gsyd,1
255,dural hornsby - nsw,1,1gsyd,1
274,elderslie camden - nsw,1,1gsyd,1
286,enmore inner west - nsw,1,1gsyd,1
343,green point central coast - nsw,1,1gsyd,1
346,greendale liverpool - nsw,1,1gsyd,1
433,kingswood penrith - nsw,1,1gsyd,1
459,lansdowne canterbury-bankstown - nsw,1,1gsyd,1


In [78]:
# replace " - " with " "
df.locat = df.agg(lambda x: re.sub(" - ", " ", x.locat), axis=1)
df[df.locat.str.contains(" - ")]

Unnamed: 0,locat,ste,gcc,sal


In [83]:
# count number of index contains "."
df[df.locat.str.contains("\.")]

Unnamed: 0,locat,ste,gcc,sal
4543,abbotsford vic.,2,2gmel,2
4548,albert park vic.,2,2gmel,2
4549,albion vic.,2,2gmel,2
4551,altona vic.,2,2gmel,2
4555,armadale vic.,2,2gmel,2
...,...,...,...,...
14215,rose bay tas.,6,6ghob,6
14220,sandford tas.,6,6ghob,6
14226,south arm tas.,6,6ghob,6
14232,tranmere tas.,6,6ghob,6


In [84]:
# replace "\." with ""
df.locat = df.agg(lambda x: re.sub("\.", "", x.locat), axis=1)
df[df.locat.str.contains("\.")]

Unnamed: 0,locat,ste,gcc,sal


In [85]:
df.value_counts("gcc")

gcc
1gsyd    920
3gbri    647
2gmel    572
4gade    494
5gper    397
8acte    137
6ghob    120
7gdar    102
9oter      5
dtype: int64

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3394 entries, 0 to 15339
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   locat   3394 non-null   object
 1   ste     3394 non-null   int8  
 2   gcc     3394 non-null   object
 3   sal     3394 non-null   int16 
dtypes: int16(1), int8(1), object(2)
memory usage: 89.5+ KB


In [89]:
df.to_parquet("../data/processed/sal.parquet")

In [90]:
df = pd.read_parquet("../data/processed/sal.parquet")
df.tail()

Unnamed: 0,locat,ste,gcc,sal
15335,christmas island,9,9oter,9
15336,home island,9,9oter,9
15337,jervis bay,9,9oter,9
15338,norfolk island,9,9oter,9
15339,west island,9,9oter,9


In [92]:
from pathlib import Path
Path("./data/sal.json").parent

PosixPath('data')