# GTHA housing market database
# OSEMN methodology Step 1: Obtain
# Obtain Postal Geography
# from Platinum Postal Code Suite from DMTI

---

This notebook describes _Step 1: Obtain_ of the OSEMN methodology: the process of obtaining Postal Geography from the Platinum Postal Code Suite created by DMTI Spatial Inc.


Files used in this notebook are downloaded from the [Map and Data Library database](https://mdl.library.utoronto.ca/collections/geospatial-data/platinum-postal-code-suite-4) of the University of Toronto.

---

For a description of the OSEMN methodology, see `methodology/0.osemn/osemn.pdf`.

## Import dependencies

In [18]:
%matplotlib inline
import pandas as pd 
import geopandas as gpd
import os
from time import time

In [19]:
# Directory that holds the DMTI Ontario postal geography shapefile (ONldu.*).
# The trailing slash matters: the load cell builds its file path by plain
# string concatenation onto this value.
dmti_pg_path = '../../data/dmti/ONldu/'
# Show the directory contents so the shapefile components can be checked by eye.
os.listdir(dmti_pg_path)

['ONldu.sbx',
 'ONldu.lyr',
 'ONldu.shp.xml',
 'ONldu.sbn',
 'ONldu.htm',
 'ONldu.shp',
 'ONldu.prj',
 '.ipynb_checkpoints',
 'ONldu.dbf',
 'nodup',
 'ONldu.shx']

## Load Postal Geography for Ontario

In [20]:
# Read the Ontario postal geography shapefile and time the load.
load_start = time()
ldu_gdf = gpd.read_file(dmti_pg_path + 'ONldu.shp')
elapsed = time() - load_start

# Assemble the whole report from a single template; the column index is passed
# to print() as a separate argument so it is rendered with its own repr.
n_rows, n_cols = ldu_gdf.shape
load_report = (
    "----- GeoDataFrame loaded"
    "\nin {0:.2f} seconds ({1:.2f} minutes)"
    "\nwith {2:,} rows\nand {3:,} columns"
    "\n-- Column names:\n"
).format(elapsed, elapsed / 60, n_rows, n_cols)
print(load_report, ldu_gdf.columns)

# Plotting is left disabled because the map takes several minutes to draw:
# ldu_gdf.plot();

----- GeoDataFrame loaded
in 52.79 seconds (0.88 minutes)
with 555,697 rows
and 12 columns
-- Column names:
 Index(['PCA_ID', 'POSTALCODE', 'PROV', 'MAF_ID', 'PREC_CODE', 'PCA_COUNT',
       'DOM_PCA', 'MULTI_PC', 'DEL_M_ID', 'LONGITUDE', 'LATITUDE', 'geometry'],
      dtype='object')


## Remove duplicated 'PCA_ID'

### Duplicated records

In [21]:
# Flag every record whose PCA_ID occurs more than once (keep=False marks all
# occurrences, not only the repeats) and list them ordered by PCA_ID.
dup_id_mask = ldu_gdf['PCA_ID'].duplicated(keep=False)
ldu_gdf.loc[dup_id_mask].sort_values(by='PCA_ID')

Unnamed: 0,PCA_ID,POSTALCODE,PROV,MAF_ID,PREC_CODE,PCA_COUNT,DOM_PCA,MULTI_PC,DEL_M_ID,LONGITUDE,LATITUDE,geometry
542177,180005777.0,N0H2R0,ON,350002318.0,1.0,244.0,1.0,0.0,LB0001,-81.679604,45.222073,POLYGON ((-81.67335698699992 45.25811306900005...
143664,180005777.0,N0H2R0,ON,350002318.0,1.0,244.0,1.0,0.0,LB0001,-81.679604,45.222073,POLYGON ((-81.70408278799994 45.23358081600009...
175571,180038952.0,K2G5E8,ON,350002120.0,3.0,2.0,1.0,0.0,LC0044,-75.763223,45.340997,POLYGON ((-75.76292857399994 45.34112117300003...
175570,180038952.0,K2G5E8,ON,350002120.0,3.0,2.0,1.0,0.0,LC0044,-75.763223,45.340997,POLYGON ((-75.76327538099991 45.34069477100007...
195504,180059274.0,L8J2N6,ON,350002220.0,2.0,2.0,0.0,0.0,LC0095,-79.792537,43.198196,POLYGON ((-79.79284299999995 43.19816200000008...
195272,180059274.0,L8J2N6,ON,350002220.0,2.0,2.0,0.0,0.0,LC0095,-79.792537,43.198196,"POLYGON ((-79.7933700989999 43.19829254700005,..."
199613,180063479.0,L8E3J2,ON,350002220.0,3.0,3.0,0.0,0.0,LC0028,-79.759786,43.233758,POLYGON ((-79.75968799999993 43.23353300100007...
199612,180063479.0,L8E3J2,ON,350002220.0,3.0,3.0,0.0,0.0,LC0028,-79.759786,43.233758,POLYGON ((-79.75907562299994 43.23338874700005...
215230,180079364.0,L2N3P3,ON,350002230.0,2.0,2.0,1.0,0.0,LC0055,-79.239723,43.200875,POLYGON ((-79.23906047599991 43.20065100800008...
215231,180079364.0,L2N3P3,ON,350002230.0,2.0,2.0,1.0,0.0,LC0055,-79.239723,43.200875,POLYGON ((-79.23912685199991 43.20189413500009...


### Remove duplicated records

In [22]:
# Two records count as duplicates when every attribute column matches;
# 'geometry' is deliberately left out of the comparison.
# NOTE(review): the duplicated PCA_ID pairs displayed above carry different
# polygons, so keeping only the first record of each pair discards the other
# polygon -- confirm this is acceptable for downstream spatial work.
dup_cols = ['PCA_ID', 'POSTALCODE', 'PROV', 'MAF_ID', 'PREC_CODE', 'PCA_COUNT',
       'DOM_PCA', 'MULTI_PC', 'DEL_M_ID', 'LONGITUDE', 'LATITUDE']
ldu_gdf_nodup = ldu_gdf.drop_duplicates(subset=dup_cols)

# Matching on all attribute columns does not by itself guarantee one row per
# PCA_ID (rows sharing a PCA_ID but differing in another attribute survive),
# so fail loudly instead of relying on a visual check of the table below.
remaining_dup_mask = ldu_gdf_nodup['PCA_ID'].duplicated(keep=False)
assert not remaining_dup_mask.any(), (
    "{0:,} rows still share a PCA_ID after de-duplication"
    .format(int(remaining_dup_mask.sum())))

# Expected to display an empty frame.
ldu_gdf_nodup[remaining_dup_mask].sort_values('PCA_ID')

Unnamed: 0,PCA_ID,POSTALCODE,PROV,MAF_ID,PREC_CODE,PCA_COUNT,DOM_PCA,MULTI_PC,DEL_M_ID,LONGITUDE,LATITUDE,geometry


### Validate results

In [23]:
# Row count after de-duplication; compare with the number of distinct PCA_ID values.
ldu_gdf_nodup.shape[0]

555668

In [24]:
# Number of distinct, non-missing PCA_ID values; should equal the row count.
len(ldu_gdf_nodup['PCA_ID'].dropna().unique())

555668

## Save results to a new shapefile

In [16]:
# Build the output path with os.path.join: dmti_pg_path already ends in '/',
# so concatenating '/nodup/...' onto it produced a doubled separator.
save_dir = os.path.join(dmti_pg_path, 'nodup')
save_path = os.path.join(save_dir, 'ONldu_nodup.shp')
# Create the target folder if needed so the write does not depend on the
# folder having been made by hand beforehand.
os.makedirs(save_dir, exist_ok=True)

# Write the de-duplicated GeoDataFrame as a shapefile and report the time taken.
t = time()
ldu_gdf_nodup.to_file(save_path, index=False)
elapsed = time() - t
print("GeoDataFrame saved to file:\n", save_path,
      "\ntook {0:.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60))

GeoDataFrame saved to file:
 ../../data/dmti/ONldu//nodup/ONldu_nodup.shp 
took 601.24 seconds (10.02 minutes)


## Save results to a .csv file

In [27]:
# Write the de-duplicated records to a CSV file one level above the shapefile
# directory and report how long the export took.
save_path = dmti_pg_path + '../ONldu_nodup.csv'
csv_start = time()
ldu_gdf_nodup.to_csv(save_path, index=False)
elapsed = time() - csv_start

timing_line = "\ntook {0:.2f} seconds ({1:,.2f} minutes)".format(elapsed, elapsed / 60)
print("DataFrame saved to file:\n", save_path, timing_line)

DataFrame saved to file:
 ../../data/dmti/ONldu/../ONldu_nodup.csv 
took 151.05 seconds (2.52 minutes)
