In [1]:
from shapely.geometry import Polygon, LineString, Point
import pandas as pd
import numpy as np
import re
import geopandas as gpd
import matplotlib.pyplot as plt

In [2]:
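# All layers must share one projected CRS: EPSG:32617 (UTM zone 17N) or EPSG:3857 (Web Mercator).
# EPSG:3857 gives a smaller area than EPSG:32617.
# EPSG:32617 is the closest match to LANDAREA (where that attribute exists), so it is used throughout.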

In [7]:
# Station table: read from an Excel workbook; later cells use its X, Y and
# opening_date columns.
station_data = pd.read_excel("../station-movers-V1.xlsx")

# Census attribute tables, one CSV per census year.
# The 2001-2021 tables are merged on GeoUID and the 1976-1996 tables on geosid
# (see find_data).
da2021 = pd.read_csv("../../data_raw/cleaned_data/2021_census_data.csv")
da2016 = pd.read_csv("../../data_raw/cleaned_data/2016_census_data.csv")
da2011 = pd.read_csv("../../data_raw/cleaned_data/2011_census_data.csv")
da2006 = pd.read_csv("../../data_raw/cleaned_data/2006_census_data.csv")
da2001 = pd.read_csv("../../data_raw/cleaned_data/2001_census_data.csv")
ct1996 = pd.read_csv("../../data_raw/cleaned_data/1996_census_data.csv")
ct1991 = pd.read_csv("../../data_raw/cleaned_data/1991_census_data.csv")
ct1986 = pd.read_csv("../../data_raw/cleaned_data/1986_census_data.csv")
ct1981 = pd.read_csv("../../data_raw/cleaned_data/1981_census_data.csv")
ct1976 = pd.read_csv("../../data_raw/cleaned_data/1976_census_data.csv")

# Boundary geometries matching each census table above.
# 2006-2021 come from shapefiles; 2001 is read from a .MID file
# (NOTE(review): presumably a MapInfo MIF/MID pair — confirm the .MIF sits beside it).
boundaries_2021 = gpd.read_file("../../data_raw/boundary_data/lda_000b21a_e/lda_000b21a_e.shp")
boundaries_2016 = gpd.read_file("../../data_raw/boundary_data/lda_000b16a_e/lda_000b16a_e.shp")
boundaries_2011 = gpd.read_file("../../data_raw/boundary_data/gda_000b11a_e/gda_000b11a_e.shp")
boundaries_2006 = gpd.read_file("../../data_raw/boundary_data/gda_000b06a_e/gda_000b06a_e.shp")
boundaries_2001 = gpd.read_file("../../data_raw/boundary_data/gda_000b02m_e/gda_000b02m_e.MID")
# 1976-1996 boundaries are GeoJSON files.
boundaries_1996 = gpd.read_file("../../data_raw/CT_data/1996/ct_1996.geojson")
boundaries_1991 = gpd.read_file("../../data_raw/CT_data/1991/ct_1991.geojson")
boundaries_1986 = gpd.read_file("../../data_raw/CT_data/1986/ct_1986.geojson")
boundaries_1981 = gpd.read_file("../../data_raw/CT_data/1981/ct_1981.geojson")
boundaries_1976 = gpd.read_file("../../data_raw/CT_data/1976/ct_1976.geojson")

  return ogr_read(


In [4]:
# Turn the station table into points: X/Y are interpreted as EPSG:4326
# coordinates, then reprojected to EPSG:32617 so distances are in CRS units.
station_points = gpd.points_from_xy(station_data['X'], station_data['Y'])
station_gdf = (
    gpd.GeoDataFrame(station_data, geometry=station_points)
    .set_crs(epsg=4326)
    .to_crs(epsg=32617)
)

# Catchment polygon around each station: a buffer of 800 CRS units
# (metres in EPSG:32617).
station_gdf["buffer"] = station_gdf.buffer(800)

In [5]:
# 2021 boundaries: integer DAUID (the merge key used in find_data), reprojected
# to EPSG:32617, reduced to the identifier and geometry columns.
boundaries_2021 = (
    boundaries_2021
    .assign(DAUID=boundaries_2021["DAUID"].astype(int))
    .to_crs(epsg=32617)[["DAUID", "geometry"]]
)

In [6]:
# 2016 boundaries: integer DAUID (the merge key used in find_data), reprojected
# to EPSG:32617, reduced to the identifier and geometry columns.
boundaries_2016 = (
    boundaries_2016
    .assign(DAUID=boundaries_2016["DAUID"].astype(int))
    .to_crs(epsg=32617)[["DAUID", "geometry"]]
)

In [7]:
# 2011 boundaries: integer DAUID (the merge key used in find_data), reprojected
# to EPSG:32617, reduced to the identifier and geometry columns.
boundaries_2011 = (
    boundaries_2011
    .assign(DAUID=boundaries_2011["DAUID"].astype(int))
    .to_crs(epsg=32617)[["DAUID", "geometry"]]
)

In [8]:
# 2006 boundaries: integer DAUID (the merge key used in find_data), reprojected
# to EPSG:32617, reduced to the identifier and geometry columns.
boundaries_2006 = (
    boundaries_2006
    .assign(DAUID=boundaries_2006["DAUID"].astype(int))
    .to_crs(epsg=32617)[["DAUID", "geometry"]]
)

In [9]:
# 2001 boundaries: integer DAUID (the merge key used in find_data), reprojected
# to EPSG:32617, reduced to the identifier and geometry columns.
boundaries_2001 = (
    boundaries_2001
    .assign(DAUID=boundaries_2001["DAUID"].astype(int))
    .to_crs(epsg=32617)[["DAUID", "geometry"]]
)

In [10]:
# 1996 boundaries: reproject to EPSG:32617, make geosid a float (the merge key
# used in find_data) and add an "area" column = areakm * 1,000,000
# (km² -> m², assuming areakm is in square kilometres).
boundaries_1996 = boundaries_1996.to_crs(epsg=32617).assign(
    geosid=lambda tracts: tracts["geosid"].astype(float),
    area=lambda tracts: tracts["areakm"] * 1000000,
)

In [11]:
# 1991 boundaries: reproject to EPSG:32617, make geosid a float (the merge key
# used in find_data) and add an "area" column = areakm * 1,000,000
# (km² -> m², assuming areakm is in square kilometres).
boundaries_1991 = boundaries_1991.to_crs(epsg=32617).assign(
    geosid=lambda tracts: tracts["geosid"].astype(float),
    area=lambda tracts: tracts["areakm"] * 1000000,
)

In [12]:
# 1986 boundaries: reproject to EPSG:32617, make geosid a float (the merge key
# used in find_data) and add an "area" column = areakm * 1,000,000
# (km² -> m², assuming areakm is in square kilometres).
boundaries_1986 = boundaries_1986.to_crs(epsg=32617).assign(
    geosid=lambda tracts: tracts["geosid"].astype(float),
    area=lambda tracts: tracts["areakm"] * 1000000,
)

In [13]:
# 1981 boundaries: reproject to EPSG:32617, make geosid a float (the merge key
# used in find_data) and add an "area" column = areakm * 1,000,000
# (km² -> m², assuming areakm is in square kilometres).
boundaries_1981 = boundaries_1981.to_crs(epsg=32617).assign(
    geosid=lambda tracts: tracts["geosid"].astype(float),
    area=lambda tracts: tracts["areakm"] * 1000000,
)

In [14]:
# 1976 boundaries: the rows labelled 2310 and 2484 are removed first
# (NOTE(review): the reason is not recorded in this notebook — confirm and document).
# Because the drop is by label, re-running this cell on the already-trimmed
# frame raises KeyError; reload the file before re-running.
# The remaining rows get the same treatment as the other pre-2001 years:
# reproject to EPSG:32617, float geosid, and area = areakm * 1,000,000.
boundaries_1976 = (
    boundaries_1976
    .drop(index=[2310, 2484])
    .to_crs(epsg=32617)
    .assign(
        geosid=lambda tracts: tracts["geosid"].astype(float),
        area=lambda tracts: tracts["areakm"] * 1000000,
    )
)

In [15]:
# Lookup from census year to [boundary GeoDataFrame, census table].
# Position 0 is the geometry layer and position 1 the attribute table; the
# per-station loop indexes the pair that way.
_year_table = (
    # 2001-2021 layers are keyed by DAUID
    (2021, boundaries_2021, da2021),
    (2016, boundaries_2016, da2016),
    (2011, boundaries_2011, da2011),
    (2006, boundaries_2006, da2006),
    (2001, boundaries_2001, da2001),
    # 1976-1996 layers are keyed by geosid
    (1996, boundaries_1996, ct1996),
    (1991, boundaries_1991, ct1991),
    (1986, boundaries_1986, ct1986),
    (1981, boundaries_1981, ct1981),
    (1976, boundaries_1976, ct1976),
)
mapping_years = {
    census_year: [boundary_layer, census_table]
    for census_year, boundary_layer, census_table in _year_table
}

## Functions

In [1]:
def ten_yr_span(dates, my_list):
    """
    Pick three years, five apart, around the entry of my_list nearest to dates.

    dates   : a single year (number) to locate
    my_list : candidate years to choose the nearest from

    Returns a 3-tuple (first, second, third):
    - nearest year is the smallest in my_list -> (nearest, nearest + 5, nearest + 10)
    - nearest year is the largest in my_list  -> (nearest - 10, nearest - 5, nearest)
    - otherwise                               -> (nearest - 5, nearest, nearest + 5)
    """
    earliest, latest = min(my_list), max(my_list)
    nearest = min(my_list, key=lambda candidate: abs(candidate - dates))

    # At either end of the list the window is shifted so it does not run past
    # that end; in the interior it is centred on the nearest year.
    if nearest == earliest:
        offsets = (0, 5, 10)
    elif nearest == latest:
        offsets = (-10, -5, 0)
    else:
        offsets = (-5, 0, 5)
    return tuple(nearest + offset for offset in offsets)
        

In [12]:
# Sanity check: 2009 is nearest to 2011, which is neither the first nor the last
# year in the list, so the span is centred on it.
ten_yr_span(2009, [1996, 2001, 2006, 2011, 2016])

(2006, 2011, 2016)

In [17]:
def get_dates(data, date_col):
    """
    Attach the three census years that surround each row's date.

    The year of data[date_col] is passed to ten_yr_span together with the
    census years 1976-2021 (every five years). The resulting triple is stored
    in three new columns: before_<date_col>, middle_<date_col> and
    after_<date_col>.

    data     : DataFrame (or GeoDataFrame) containing date_col
    date_col : name of a column that pd.to_datetime can parse

    Returns a new frame (data plus the three columns); data itself is not
    modified, so the caller must keep the return value.

    Rows whose date is missing are not filtered out here; check the result if
    date_col may contain blanks.
    """
    census_years = [1976, 1981, 1986, 1991, 1996, 2001, 2006, 2011, 2016, 2021]
    event_years = pd.to_datetime(data[date_col]).dt.year
    spans = event_years.apply(lambda year: ten_yr_span(year, census_years))

    years_df = pd.DataFrame(
        spans.tolist(),
        columns=[f"before_{date_col}", f"middle_{date_col}", f"after_{date_col}"],
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
        # fresh 0..n-1 index would misalign rows (and add NaN rows) whenever
        # data has been filtered, sorted or otherwise re-indexed.
        index=data.index,
    )
    return pd.concat([data, years_df], axis=1)

In [18]:
def find_overlap(buffer_area, boundary_data):
    """
    Select the boundary polygons touching a buffer and measure how much of each lies inside it.

    buffer_area   : geometry of the station buffer
    boundary_data : GeoDataFrame of census boundaries in the same CRS

    Returns the intersecting rows with the index reset (the previous index is
    kept as a leading column) plus two new trailing columns:
    - "overlap"    : area of the polygon's intersection with buffer_area
    - "proportion" : overlap / polygon area; any value that is not <= 1
                     (including NaN) is replaced by 1

    find_data slices the result by column position, so the column order
    produced here matters.
    """
    touches_buffer = boundary_data.intersects(buffer_area)
    overlapping = boundary_data[touches_buffer].reset_index()

    shared_area = overlapping.geometry.intersection(buffer_area).area
    overlapping.loc[:, "overlap"] = shared_area

    # overlapping.area is the geometric area of each polygon (the GeoDataFrame
    # property), not the "area" column that some boundary layers carry.
    share = overlapping["overlap"] / overlapping.area
    overlapping["proportion"] = share.where(share <= 1, 1)
    return overlapping

In [19]:
def find_data(boundary_data, census_data, year):
    """
    Area-weight the census values of the overlapping regions and total them.

    boundary_data : output of find_overlap (must contain a "proportion" column)
    census_data   : census table for the same year
    year          : census year; selects the join keys and column offsets

    Steps:
    1) Inner-merge boundaries with census data
       (DAUID = GeoUID for year >= 2001, geosid otherwise).
    2) Keep the columns from a fixed position onward (4 for year >= 2001,
       16 otherwise) and multiply each row by its "proportion".
    3) Sum down the rows and drop the leading entries of the result
       (3 for year >= 2001, 1 otherwise).

    The offsets are positional. NOTE(review): they presumably skip the
    identifier/geometry columns and the proportion itself — confirm against
    the boundary and census file layouts before changing either.

    Returns a Series indexed by census column name.
    """
    if year >= 2001:
        merge_keys = {"left_on": "DAUID", "right_on": "GeoUID"}
        first_value_col, leading_to_drop = 4, 3
    else:
        merge_keys = {"on": "geosid"}
        first_value_col, leading_to_drop = 16, 1

    merged = boundary_data.merge(census_data, **merge_keys)
    values = merged.iloc[:, first_value_col:]
    weighted = values.mul(values["proportion"], axis=0)
    return weighted.sum().iloc[leading_to_drop:]

In [20]:
def join_rows(array_of_series):
    """
    Build a DataFrame from a list of Series, one Series per row.

    The Series index labels become the column names.
    """
    return pd.DataFrame(array_of_series)

## Running Everything

In [21]:
# Adds before_/middle_/after_opening_date census-year columns to the stations.
# Not idempotent: re-running this cell on the already-augmented frame appends
# the same three columns again.
station_gdf = get_dates(station_gdf, "opening_date")

  dates = pd.to_datetime(data[date_col])


In [22]:
# For every station, compute the area-weighted census totals inside its buffer
# for the three census years chosen by get_dates, prefix the labels with the
# period name, and keep the result as one wide Series per station.
periods = (
    ("Before ", "before_opening_date"),
    ("Middle ", "middle_opening_date"),
    ("After ", "after_opening_date"),
)

each_row = []

for _, station in station_gdf.iterrows():
    catchment = station["buffer"]

    period_rows = []
    for prefix, year_column in periods:
        census_year = station[year_column]
        boundary_layer, census_table = mapping_years[census_year]

        overlap = find_overlap(catchment, boundary_layer)
        weighted = find_data(overlap, census_table, census_year)
        weighted.index = [prefix + str(label) for label in weighted.index]
        period_rows.append(weighted)

    each_row.append(pd.concat(period_rows))

In [23]:
# Stack the per-station Series into one DataFrame: one row per station, in the
# order the loop produced them.
full = join_rows(each_row)

In [24]:
# Output column order, grouped by period. Selecting with this list also drops
# every column of `full` that is not named here, and raises KeyError if a
# listed column is missing.

# Deliberately not selected: 'Before Other attached dwelling'.
before_columns = [
    'Before Population Density per square kilometre',
    'Before Dwellings',
    'Before Total Occupied Private Dwellings',
    'Before Single-detached house',
    'Before Semi-detached house',
    'Before Row house',
    'Before Apartment',
    'Before Apartment, duplex',
    'Before Apartment, building that has fewer than five storeys',
    'Before Apartment, building that has five or more storeys',
    'Before Other dwelling',
    'Before Other single-attached house',
    'Before Movable dwelling',
    'Before No bedrooms',
    'Before 0 to 1 bedroom',
    'Before 1 bedroom',
    'Before 2 bedrooms',
    'Before 3 bedrooms',
    'Before 4 bedrooms',
    'Before 4 or more bedrooms',
    'Before 5 or more bedrooms',
    'Before Average number of bedrooms per dwelling',
    'Before Owned',
    'Before Rented',
]

# Deliberately not selected: 'Middle Apartment'.
middle_columns = [
    'Middle Population Density per square kilometre',
    'Middle Dwellings',
    'Middle Total Occupied Private Dwellings',
    'Middle Single-detached house',
    'Middle Semi-detached house',
    'Middle Other attached dwelling',
    'Middle Row house',
    'Middle Apartment, duplex',
    'Middle Apartment, building that has fewer than five storeys',
    'Middle Apartment, building that has five or more storeys',
    'Middle Other dwelling',
    'Middle Other single-attached house',
    'Middle Movable dwelling',
    'Middle No bedrooms',
    'Middle 0 to 1 bedroom',
    'Middle 1 bedroom',
    'Middle 2 bedrooms',
    'Middle 3 bedrooms',
    'Middle 4 bedrooms',
    'Middle 4 or more bedrooms',
    'Middle 5 or more bedrooms',
    'Middle Average number of bedrooms per dwelling',
    'Middle Owned',
    'Middle Rented',
]

# Deliberately not selected: 'After 4 bedrooms', 'After 5 or more bedrooms'.
# 'After Apartment' is not selected either.
after_columns = [
    'After Population Density per square kilometre',
    'After Dwellings',
    'After Total Occupied Private Dwellings',
    'After Single-detached house',
    'After Semi-detached house',
    'After Other attached dwelling',
    'After Row house',
    'After Apartment, duplex',
    'After Apartment, building that has fewer than five storeys',
    'After Apartment, building that has five or more storeys',
    'After Other dwelling',
    'After Other single-attached house',
    'After Movable dwelling',
    'After No bedrooms',
    'After 0 to 1 bedroom',
    'After 1 bedroom',
    'After 2 bedrooms',
    'After 3 bedrooms',
    'After 4 or more bedrooms',
    'After Average number of bedrooms per dwelling',
    'After Owned',
    'After Rented',
]

ordered_columns = before_columns + middle_columns + after_columns

full = full[ordered_columns]

In [25]:
# Column-wise concat aligns rows by index label, not by position.
# NOTE(review): this assumes station_gdf still carries the default 0..n-1 index
# that join_rows gives `full` — confirm if stations are ever filtered or re-sorted.
combined_df = pd.concat([station_gdf, full], axis=1)

In [26]:
# Write the combined station + census table without the index column.
# The "geometry" and "buffer" columns from station_gdf are written too.
combined_df.to_csv('../data/tod-on-main.csv', index=False)