# GTHA housing market database
# OSEMN methodology Step 1: Obtain
# Obtain Postal Geography to census conversion
# from Platinum Postal Code Suite from DMTI

---

This notebook describes _Step 1: Obtain_ of the OSEMN methodology: obtaining the Postal Geography to census conversion file from the Platinum Postal Code Suite created by DMTI Spatial Inc.


Files used in this notebook were downloaded from the [Map and Data Library database](https://mdl.library.utoronto.ca/collections/geospatial-data/platinum-postal-code-suite-4) of the University of Toronto.

---

For a description of the OSEMN methodology, see `methodology/0.osemn/osemn.pdf`.

## Import dependencies

In [2]:
import pandas as pd
import os
from time import time
from simpledbf import Dbf5

In [3]:
# Folder holding the DMTI files, given relative to the notebook's working
# directory. The trailing slash matters: later cells build paths by plain
# string concatenation onto this value.
data_path = '../../data/dmti/'
# Display the folder contents as the cell output.
os.listdir(data_path)

['CANp2c.zip', 'CANfsa.zip', 'CANp2c', 'ONldu', 'ONldu.zip', 'CANfsa']

In [5]:
# Sub-folder with the CANp2c files. The trailing slash is kept because the
# load cell appends the file name to this string directly.
canp2c_path = data_path + 'CANp2c/'
# Display the sub-folder contents as the cell output.
os.listdir(canp2c_path)

['CANp2c.dbf.xml',
 'CANp2c_retired.htm',
 'CANp2c.dbf',
 'CANp2c.htm',
 'CANp2c_retired.dbf.xml',
 'CANp2c_retired.dbf']

## Load Postal Geography to census conversion

In [6]:
# Read CANp2c.dbf through simpledbf and convert it to a pandas DataFrame,
# measuring the wall-clock time the load takes.
t = time()
canp2c_dbf = Dbf5(canp2c_path + 'CANp2c.dbf')
canp2c_df = canp2c_dbf.to_dataframe()
elapsed = time() - t

# Assemble the report piece by piece: load time, table shape, and the
# heading that precedes the column index printed as the second argument.
load_report = "".join([
    "----- DataFrame loaded",
    "\nin {0:.2f} seconds ({1:.2f} minutes)".format(elapsed, elapsed / 60),
    "\nwith {0:,} rows\nand {1:,} columns".format(*canp2c_df.shape),
    "\n-- Column names:\n",
])
print(load_report, canp2c_df.columns)

----- DataFrame loaded
in 58.22 seconds (0.97 minutes)
with 1,944,698 rows
and 18 columns
-- Column names:
 Index(['MEP_ID', 'PRFEDEA_96', 'PRCDDA_01', 'PRCDDA_06', 'PRCDDA_11',
       'CMACT_96', 'CTNAME_01', 'CTNAME_06', 'CTNAME_11', 'PRCDCSD_96',
       'PRCDCSD_01', 'PRCDCSD_06', 'PRCDCSD_11', 'PRCD_96', 'PRCD_01',
       'PRCD_06', 'PRCD_11', 'PROV'],
      dtype='object')


In [8]:
# Count the records for each value of the PROV column.
prov_counts = canp2c_df['PROV'].value_counts()
prov_counts

ON    678218
QC    546792
BC    253887
AB    179318
NB     92113
NS     82442
MB     39484
SK     35882
NL     20800
PE     12755
YT      1903
NT      1066
NU        38
Name: PROV, dtype: int64

In [10]:
# Restrict the national table to the rows whose PROV value is "ON",
# then report how many records remain.
onp2c_df = canp2c_df.query('PROV == "ON"')
print("{0:,} records in the subset.".format(onp2c_df.shape[0]))

678,218 records in the subset.


In [12]:
# Preview the first five rows of the Ontario subset.
onp2c_df.head(n=5)

Unnamed: 0,MEP_ID,PRFEDEA_96,PRCDDA_01,PRCDDA_06,PRCDDA_11,CMACT_96,CTNAME_01,CTNAME_06,CTNAME_11,PRCDCSD_96,PRCDCSD_01,PRCDCSD_06,PRCDCSD_11,PRCD_96,PRCD_01,PRCD_06,PRCD_11,PROV
5651,7956353,35056253,35180367,35180367,35180367,5350812.0,5350812.0,5350812.0,5350812.0,3518005,3518005,3518005,3518005,3518,3518,3518,3518,ON
5652,7956354,35056104,35180591,35180591,35180591,5350805.01,5350805.04,5350805.04,5350805.04,3518005,3518005,3518005,3518005,3518,3518,3518,3518,ON
5653,7956355,35056458,35180355,35180887,35180887,5350806.0,5350805.07,5350805.08,5350805.08,3518005,3518005,3518005,3518005,3518,3518,3518,3518,ON
5654,7956356,35056104,35180591,35180591,35180591,5350805.01,5350805.04,5350805.04,5350805.04,3518005,3518005,3518005,3518005,3518,3518,3518,3518,ON
5655,7956358,35079106,35430312,35430827,35430827,5350485.0,5350485.02,5350485.02,5350485.02,3543007,3543007,3543007,3543007,3543,3543,3543,3543,ON


## Save results to a .csv file

In [11]:
# Write the Ontario subset to a CSV file in the DMTI data folder (without
# the DataFrame index) and measure how long the write takes.
save_path = data_path + 'onp2c.csv'
t = time()
onp2c_df.to_csv(save_path, index=False)
elapsed = time() - t

# The three pieces are passed to print separately, so they come out
# joined by single spaces.
save_report = "\ntook {0:.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60)
print("DataFrame saved to file:\n", save_path, save_report)

DataFrame saved to file:
 ../../data/dmti/onp2c.csv 
took 7.16 seconds (0.12 minutes)
