# The following code is for external data processing

In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("MAST30034 Project 2 Preprocessing")
    .config("spark.driver.memory", '4g')
    .config("spark.executor.memory", '8g')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.parquet.enableVectorizedReader","false")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.parquet.writeLegacyFormat", 'true')
    .getOrCreate()
)

22/09/05 21:14:55 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.21.146.149 instead (on interface eth0)
22/09/05 21:14:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/05 21:14:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import re
from pyspark.sql import functions as F
import numpy as np
import pandas as pd

In [5]:
external_sdf = spark.read.option("header", "true").csv("../data/raw/external/income.csv")
external_sdf= external_sdf.withColumnRenamed(
    "INCP Total Personal Income (weekly)",
    "Income"
)

                                                                                

In [6]:
external_sdf

Counting,SA2 (UR),Income,Count
Persons Place of ...,Braidwood,Negative income,17
Persons Place of ...,Braidwood,Nil income,213
Persons Place of ...,Braidwood,"$1-$149 ($1-$7,799)",110
Persons Place of ...,Braidwood,"$150-$299 ($7,800...",224
Persons Place of ...,Braidwood,"$300-$399 ($15,60...",323
Persons Place of ...,Braidwood,"$400-$499 ($20,80...",333
Persons Place of ...,Braidwood,"$500-$649 ($26,00...",249
Persons Place of ...,Braidwood,"$650-$799 ($33,80...",244
Persons Place of ...,Braidwood,"$800-$999 ($41,60...",221
Persons Place of ...,Braidwood,"$1,000-$1,249 ($5...",256


In [85]:
income_factors = list(set(external_sdf.select(F.collect_list("Income")).first()[0]))

In [86]:
income_factors

['$150-$299 ($7,800-$15,599)',
 '$1,750-$1,999 ($91,000-$103,999)',
 'Total',
 '$1,250-$1,499 ($65,000-$77,999)',
 '$400-$499 ($20,800-$25,999)',
 'Not stated',
 'Not applicable',
 '$2,000-$2,999 ($104,000-$155,999)',
 '$650-$799 ($33,800-$41,599)',
 '$1-$149 ($1-$7,799)',
 '$800-$999 ($41,600-$51,999)',
 '$1,500-$1,749 ($78,000-$90,999)',
 '$3,000 or more ($156,000 or more)',
 '$500-$649 ($26,000-$33,799)',
 'Nil income',
 'Negative income',
 '$1,000-$1,249 ($52,000-$64,999)',
 '$300-$399 ($15,600-$20,799)']

As we can see, there seems to be a row per location regarding the total amount of 'Count'. We wish to extract this information and create a separate dataset for easier access to these numbers

In [87]:
location_total = external_sdf.filter(F.col("Income") == "Total")

In [88]:
location_total

Counting,SA2 (UR),Income,Count
Persons Place of ...,Braidwood,Total,3875
Persons Place of ...,Karabar,Total,8241
Persons Place of ...,Queanbeyan,Total,10838
Persons Place of ...,Queanbeyan - East,Total,4785
Persons Place of ...,Queanbeyan Region,Total,16945
Persons Place of ...,Queanbeyan West -...,Total,12642
Persons Place of ...,Bombala,Total,2391
Persons Place of ...,Cooma,Total,6737
Persons Place of ...,Cooma Region,Total,3282
Persons Place of ...,Jindabyne - Berri...,Total,6953


In [89]:
external_sdf = external_sdf.where(F.col("Income") != "Total")

# Use regular expression to find amount range

In [91]:
temp_df = external_sdf.select('Income').toPandas()

In [141]:
from readline import append_history_file


output_col = []
pattern = "\((\$\d*,?\d+-\$\d*,*\d*)|(\$\d*,?\d* or more)\)"

for income in temp_df["Income"]:
    matched = re.findall(pattern, income)
    if len(matched) > 0:
        output_col.append(matched[0][0])
    else:
        output_col.append(income)

In [142]:
temp_df['Income Parsed'] = output_col

In [147]:
output_col[14]

''

In [143]:
test = spark.createDataFrame(temp_df)
test

Income,Income Parsed
Negative income,Negative income
Nil income,Nil income
"$1-$149 ($1-$7,799)","$1-$7,799"
"$150-$299 ($7,800...","$7,800-$15,599"
"$300-$399 ($15,60...","$15,600-$20,799"
"$400-$499 ($20,80...","$20,800-$25,999"
"$500-$649 ($26,00...","$26,000-$33,799"
"$650-$799 ($33,80...","$33,800-$41,599"
"$800-$999 ($41,60...","$41,600-$51,999"
"$1,000-$1,249 ($5...","$52,000-$64,999"


Ben's preprocessing

In [46]:
income_df = pd.read_csv('../data/raw/external/income.csv')
income_df = income_df.drop(index=range(len(income_df) - 4, len(income_df)))

In [47]:
income_df = income_df.rename(columns = {"SA2 (UR)" : "Region", "INCP Total Personal Income (weekly)" : "Income"})

In [48]:
def convert_income(row):
    if row['Income'] in ['Negative income', 'Nil income', 'Total', 'Not stated', 'Not applicable']:
        row['weekly_income'] = row['Income']
        row['yearly_income'] = row['Income']
    else:
        matches = re.findall('(.*)\s\((.*)\)', row['Income'])
        row['weekly_income'] = matches[0][0]
        row['yearly_income'] = matches[0][1]
    return row

In [49]:
income_df = income_df.apply(convert_income, axis = 1)

    

In [50]:
income_df

Unnamed: 0,Counting,Region,Income,Count,weekly_income,yearly_income
0,Persons Place of Usual Residence,Braidwood,Negative income,17.0,Negative income,Negative income
1,Persons Place of Usual Residence,Braidwood,Nil income,213.0,Nil income,Nil income
2,Persons Place of Usual Residence,Braidwood,"$1-$149 ($1-$7,799)",110.0,$1-$149,"$1-$7,799"
3,Persons Place of Usual Residence,Braidwood,"$150-$299 ($7,800-$15,599)",224.0,$150-$299,"$7,800-$15,599"
4,Persons Place of Usual Residence,Braidwood,"$300-$399 ($15,600-$20,799)",323.0,$300-$399,"$15,600-$20,799"
...,...,...,...,...,...,...
41593,Persons Place of Usual Residence,Total,"$2,000-$2,999 ($104,000-$155,999)",961768.0,"$2,000-$2,999","$104,000-$155,999"
41594,Persons Place of Usual Residence,Total,"$3,000 or more ($156,000 or more)",596531.0,"$3,000 or more","$156,000 or more"
41595,Persons Place of Usual Residence,Total,Not stated,1706737.0,Not stated,Not stated
41596,Persons Place of Usual Residence,Total,Not applicable,4364610.0,Not applicable,Not applicable
