In [1]:
import pandas as pd
from random import randint

# Data sources
#
#   - Inhabitants' median disposable monetary income by postal code area, 2010-2021
#     https://pxdata.stat.fi:443/PxWeb/sq/39625562-d250-492a-a190-37bcc355e2a3
#
#   - Prices per square meter of old dwellings in housing companies and numbers
#     of transactions by postal code area, yearly, 2009-2022
#     https://pxdata.stat.fi:443/PxWeb/sq/41826b15-82a9-4c83-8be6-bd77f98b31ac
#

# Load the two CSV exports listed under "Data sources" above. Both files are
# decoded as Latin-1, and the first two lines of each file are skipped before
# parsing (presumably a title preamble ahead of the header row; verify against
# the raw files).
incomes = pd.read_csv(
    'data/003_12f1_2021_20230929-100110.csv',
    encoding = 'latin1',
    skiprows = [0, 1],
)
prices = pd.read_csv(
    'data/001_13mu_2022_20230929-105546.csv',
    encoding = 'latin1',
    skiprows = [0, 1],
)

In [2]:
# A few peeks into the data we just read: the shape of each table, followed by
# one randomly chosen row from each.

print(f'Income/Price data shape: {incomes.shape}/{prices.shape}\n')

# randint() includes BOTH endpoints, so the upper bound has to be the last valid
# row position (row count - 1). Passing the row count itself occasionally
# produced an out-of-bounds position and an IndexError from .iloc.
print(incomes.iloc[[randint(0, incomes.shape[0] - 1)]].to_string())
print(prices.iloc[[randint(0, prices.shape[0] - 1)]].to_string())

Income/Price data shape: (84, 14)/(81, 17)

              Postal code area                        Information   2010   2011   2012   2013   2014   2015   2016   2017   2018   2019   2020   2021
57  00730  Tapanila (Helsinki)  Median income of inhabitants (HR)  23211  23622  24321  24649  24731  24966  25246  25673  26319  27123  27223  28064
                  Postal code                   Building type                      Information  2009  2010  2011  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022
46  00630  Maunula (Helsinki)  Blocks of flats, two-room flat  Price per square meter (EUR/m2)  2352  2545  2670  3054  3182  3271  3346  3451  3608  3825  3823  4056  4038  4144


In [3]:
# Remove the descriptive columns that are not needed further on, leaving the
# postal code and the yearly value columns in each table.
#
# NOTE: every step below edits the frames in place. Re-running this cell on a
# live kernel raises KeyError because the columns are already gone; re-run the
# loading cell first.
incomes.drop(columns = 'Information', inplace = True)
prices.drop(columns = [ 'Building type', 'Information' ], inplace = True)

# Income data only covers 2010-2021, so the 2009 and 2022 price columns have no
# counterpart to compare against; remove them as well.
prices.drop(columns = [ '2009', '2022' ], inplace = True)

# Give the key column the same name in both tables.
incomes.rename(columns = { 'Postal code area': 'Postal code' }, inplace = True)

In [4]:
# A few more peeks into the data, which should now appear more uniform than before.

print(f'Income/Price data shape: {incomes.shape}/{prices.shape}\n')

# Pick the postal code of one random income row and show that area in both tables.
# randint() includes BOTH endpoints, so subtract 1 from the row count to stay
# within valid .iloc positions; the previous bound could land one past the end
# and raise IndexError.
# NOTE(review): the code is drawn from `incomes` only, so the `prices` lookup
# may come back empty if that area has no price rows -- confirm this is acceptable.
code = incomes['Postal code'].iloc[randint(0, min(incomes.shape[0], prices.shape[0]) - 1)]
print(incomes.loc[incomes['Postal code'] == code])
print(prices.loc[prices['Postal code'] == code])

Income/Price data shape: (84, 13)/(81, 13)

                    Postal code   2010   2011   2012   2013   2014   2015  \
51  00670  Paloheinä (Helsinki)  28460  29110  30303  30139  30416  31147   

     2016   2017   2018   2019   2020   2021  
51  31431  32090  33186  34225  34005  35431  
                    Postal code 2010 2011 2012 2013 2014 2015 2016 2017 2018  \
50  00670  Paloheinä (Helsinki)    .    .  ...  ...  ...    .  ...    .  ...   

   2019 2020 2021  
50  ...  ...    .  
