In this note book we will gather our infrastructure data and merge it to each house.

In [1]:
import pandas as pd
import os
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt
import matplotlib.pyplot as plt
from shapely.geometry import Polygon
import folium
import math

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False

First, let's import all the infrastructure csv files.

In [2]:
# importing the csv files
childcare = pd.read_csv('infrastructure_childcare.csv')
communities = pd.read_csv('infrastructure_communities.csv')
education = pd.read_csv('infrastructure_education.csv')
healthcare = pd.read_csv('infrastructure_healthcare.csv')
recreation = pd.read_csv('infrastructure_recreation.csv')
roads_bridges = pd.read_csv('infrastructure_roadsandbridges.csv')
transit = pd.read_csv('infrastructure_transit.csv')

We only care about a select few columns, so let's filter out the rest.  Then we can add all these data frames together.

In [3]:
# selecting relevant columns and concatinating all data into single df
columns_to_extract = ["Target Completion Date","Supporting Ministry","Project",
                      "Description","Area", "Status", "Estimated Total Budget", "Latitude", "Longitude"]


infrastructure_df = pd.concat([childcare, communities, education, healthcare, recreation, roads_bridges, transit]
                              , ignore_index=True)

infrastructure_df = infrastructure_df[columns_to_extract]

infrastructure_df.head()

Unnamed: 0,Target Completion Date,Supporting Ministry,Project,Description,Area,Status,Estimated Total Budget,Latitude,Longitude
0,Dec-18,Education,North Addington Education Centre Public Elemen...,Renovation of existing school,Lennox and Addington,Complete,945319,44.81607,-77.18565
1,Sep-18,Education,Roméo Dallaire Public Elementary School,Renovation of existing school,Durham,Complete,1953948,43.89823,-79.01472
2,Sep-17,Education,St Josephine Bakhita Catholic Elementary School,Expansion of existing school,Durham,Complete,2405382,43.89132,-79.03197
3,Sep-20,Education,Dr Roberta Bondar Public Elementary School,Renovation of existing school,Durham,Complete,958164,43.87113,-79.03333
4,Sep-20,Education,Lakeside Public Elementary School,Renovation of existing school,Durham,Complete,504171,43.82344,-79.02777


We only care about the regions relevant to Toronto, so let's filter for only projects in and around Toronto.

In [4]:
# filtering by regions in the GTA
areas_of_interest = ["Toronto", "Peel", "York", "Halton", "Durham"]
GTA_infrastructure = infrastructure_df[infrastructure_df['Area'].isin(areas_of_interest)]
GTA_infrastructure.head()

Unnamed: 0,Target Completion Date,Supporting Ministry,Project,Description,Area,Status,Estimated Total Budget,Latitude,Longitude
1,Sep-18,Education,Roméo Dallaire Public Elementary School,Renovation of existing school,Durham,Complete,1953948,43.89823,-79.01472
2,Sep-17,Education,St Josephine Bakhita Catholic Elementary School,Expansion of existing school,Durham,Complete,2405382,43.89132,-79.03197
3,Sep-20,Education,Dr Roberta Bondar Public Elementary School,Renovation of existing school,Durham,Complete,958164,43.87113,-79.03333
4,Sep-20,Education,Lakeside Public Elementary School,Renovation of existing school,Durham,Complete,504171,43.82344,-79.02777
5,Oct-20,Education,Roland Michener Public Elementary School,Renovation of existing school,Durham,Complete,252086,43.85407,-79.0382


How many of these projects are actually complete?

In [5]:
# counting the amount of complete projects there are in the infrastructure dataset
complete_count = (GTA_infrastructure['Status'] == 'Complete').sum()
total_count = len(GTA_infrastructure)

percentage_complete = (complete_count / total_count) * 100

print(f"Percentage of items marked as 'complete': {percentage_complete:.2f}%")
print(complete_count)

Percentage of items marked as 'complete': 63.17%
784


Let's assume that a project only influences a price once it is complete, so we can filter out the incomplete projects.

In [6]:
# removing any projects that are in the planning or construction phase
complete_inf = GTA_infrastructure[(GTA_infrastructure['Status'] == 'Complete')]


Now let's convert this infrastructure data into a datetime

In [7]:
# converting the dates in the infrastructure df into a datetime
def convert_to_datetime(date_str):
    """This function takes in a date in string formate and returns a date time"""
    month_dict = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
    }
    if '-' in date_str:
        parts = date_str.split('-')
        if len(parts) == 2:
            if len(parts[0]) == 3:  # Month comes first
                month, year = parts
            else:  # Year comes first
                year, month = parts
        else:
            return None  # Handle cases where the format is not as expected
    else:
        return None  # Handle cases where the format is not as expected

    year = int(year)
    month_num = month_dict[month]

    return datetime(2000 + year, month_num, 1)

# Convert 'Date' column to datetime
complete_inf['Target Completion Date'] = complete_inf['Target Completion Date'].apply(convert_to_datetime)


if pd.api.types.is_datetime64_any_dtype(complete_inf['Target Completion Date']):
    print("The 'DateColumn' is of datetime data type.")
else:
    print("The 'DateColumn' is not of datetime data type.")

# Display the updated GTA_infrastructure DataFrame
complete_inf.head()

The 'DateColumn' is of datetime data type.


Unnamed: 0,Target Completion Date,Supporting Ministry,Project,Description,Area,Status,Estimated Total Budget,Latitude,Longitude
1,2018-09-01,Education,Roméo Dallaire Public Elementary School,Renovation of existing school,Durham,Complete,1953948,43.89823,-79.01472
2,2017-09-01,Education,St Josephine Bakhita Catholic Elementary School,Expansion of existing school,Durham,Complete,2405382,43.89132,-79.03197
3,2020-09-01,Education,Dr Roberta Bondar Public Elementary School,Renovation of existing school,Durham,Complete,958164,43.87113,-79.03333
4,2020-09-01,Education,Lakeside Public Elementary School,Renovation of existing school,Durham,Complete,504171,43.82344,-79.02777
5,2020-10-01,Education,Roland Michener Public Elementary School,Renovation of existing school,Durham,Complete,252086,43.85407,-79.0382


To clean up our data set, let's remove any missing values

In [8]:
# removing missing date values from infrastructure dataframe
complete_inf=complete_inf.dropna()
has_missing_values = complete_inf.isna().any().any()


if has_missing_values:
    print("The DataFrame contains NaN or NaT values.")
else:
    print("The DataFrame does not contain NaN or NaT values.")

The DataFrame does not contain NaN or NaT values.


We need to convert this data frame into a GeoDataFrame, which is possible using the longitude and latitude of each project.

In [9]:
# converting lat and lon into one column called geometry
complete_inf['Latitude'] = pd.to_numeric(complete_inf['Latitude'], errors='coerce')
complete_inf['Longitude'] = pd.to_numeric(complete_inf['Longitude'], errors='coerce')

infrastructure_gdf = gpd.GeoDataFrame(complete_inf,geometry=gpd.points_from_xy(complete_inf['Longitude'],
                                                                               complete_inf['Latitude']))
infrastructure_gdf.head(20)


Unnamed: 0,Target Completion Date,Supporting Ministry,Project,Description,Area,Status,Estimated Total Budget,Latitude,Longitude,geometry
1,2018-09-01,Education,Roméo Dallaire Public Elementary School,Renovation of existing school,Durham,Complete,1953948,43.89823,-79.01472,POINT (-79.01472 43.89823)
2,2017-09-01,Education,St Josephine Bakhita Catholic Elementary School,Expansion of existing school,Durham,Complete,2405382,43.89132,-79.03197,POINT (-79.03197 43.89132)
3,2020-09-01,Education,Dr Roberta Bondar Public Elementary School,Renovation of existing school,Durham,Complete,958164,43.87113,-79.03333,POINT (-79.03333 43.87113)
4,2020-09-01,Education,Lakeside Public Elementary School,Renovation of existing school,Durham,Complete,504171,43.82344,-79.02777,POINT (-79.02777 43.82344)
5,2020-10-01,Education,Roland Michener Public Elementary School,Renovation of existing school,Durham,Complete,252086,43.85407,-79.0382,POINT (-79.03820 43.85407)
13,2020-09-01,Education,Aurora Children's,Expansion of existing building,York,Complete,3000000,43.962218,-79.467054,POINT (-79.46705 43.96222)
14,2019-09-01,Education,Social Enterprise for Canada Aurora EarlyON,Expansion of existing building,York,Complete,300000,43.990408,-79.455347,POINT (-79.45535 43.99041)
20,2021-01-01,Education,Hickory Wood Public Elementary School,Renovation of existing school,Peel,Complete,1326043,43.64629,-79.74281,POINT (-79.74281 43.64629)
22,2020-12-01,Education,Worthington Public Elementary School,Renovation of existing school,Peel,Complete,1085961,43.68444,-79.81877,POINT (-79.81877 43.68444)
24,2020-09-01,Education,Holy Spirit Catholic Elementary School,Renovation of existing school,Peel,Complete,1512511,43.78429,-79.72915,POINT (-79.72915 43.78429)


Now we can import the house data which was previously cleaned in an earlier notebook

In [10]:
#importing houses csv
houses = pd.read_csv('houses_data_final.csv')

# Create a geodataframe from the houses dataframe with longitude and latidue as geometry
houses = gpd.GeoDataFrame(houses, geometry = gpd.points_from_xy(houses['propertyLng'], houses['propertyLat']))
houses.crs = {'init': 'epsg:4326'}

In [11]:
# converting the dates in the houses df into a datetime
houses['Date'] = pd.to_datetime(houses['List Date'])
houses.drop(columns = ['List Date'], inplace = True)

if pd.api.types.is_datetime64_any_dtype(houses['Date']):
    print("The 'DateColumn' is of datetime data type.")
else:
    print("The 'DateColumn' is not of datetime data type.")

The 'DateColumn' is of datetime data type.


By changing this radius around each home we determined that a radius of 2 km yields the best results. We will create a cirlce as a radius of influence around each house.

In [12]:
# creating radius around each home address
def create_circle(point, radius_meters=2000):
    # Convert radius from meters to degrees (approximate)
    radius_degrees = radius_meters / (40008000 / 360)

    # Create a buffer (circle) around the point
    circle = point.buffer(radius_degrees)

    return circle

houses_gdf = gpd.GeoDataFrame(houses, geometry='geometry')
houses_gdf['circle'] = houses_gdf['geometry'].apply(create_circle)

In [13]:
#renamed columns so that geometry of houses_gdf is now a polygon
houses_gdf = houses_gdf.rename(columns={'geometry': 'point'})
houses_gdf = houses_gdf.rename(columns={'circle': 'geometry'})

houses_gdf.geometry.head()

0    POLYGON ((-79.54270 43.73370, -79.54279 43.731...
1    POLYGON ((-79.44310 43.76160, -79.44319 43.759...
2    POLYGON ((-79.44430 43.72140, -79.44439 43.719...
3    POLYGON ((-79.57010 43.72570, -79.57019 43.723...
4    POLYGON ((-79.39240 43.72660, -79.39249 43.724...
Name: geometry, dtype: geometry

In [14]:
infrastructure_gdf.geometry.head()

1    POINT (-79.01472 43.89823)
2    POINT (-79.03197 43.89132)
3    POINT (-79.03333 43.87113)
4    POINT (-79.02777 43.82344)
5    POINT (-79.03820 43.85407)
Name: geometry, dtype: geometry

We will only be able to work with numerical data, so let's remove any non-numeric data from the budget columns

In [15]:
# removing any non-numeric budget from estimated total budget column in infrastructure df
infrastructure_gdf['Estimated Total Budget'] = pd.to_numeric(infrastructure_gdf['Estimated Total Budget'],
                                                             errors='coerce')
infrastructure_gdf.dropna(subset=['Estimated Total Budget'], inplace=True)

infrastructure_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 696 entries, 1 to 4886
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Target Completion Date  696 non-null    datetime64[ns]
 1   Supporting Ministry     696 non-null    object        
 2   Project                 696 non-null    object        
 3   Description             696 non-null    object        
 4   Area                    696 non-null    object        
 5   Status                  696 non-null    object        
 6   Estimated Total Budget  696 non-null    int64         
 7   Latitude                694 non-null    float64       
 8   Longitude               694 non-null    float64       
 9   geometry                696 non-null    geometry      
dtypes: datetime64[ns](1), float64(2), geometry(1), int64(1), object(5)
memory usage: 59.8+ KB


In [16]:
houses_gdf.head()

Unnamed: 0.1,Unnamed: 0,address,isResidentialProperty,propertyLat,propertyLng,searchNeighborhood,Sold Price,Original Price,Type,Style,...,Parking Total,Furnished,Laundry Access,Private Entrance,Lease Term,Ensuite Laundry,Property Type,point,Date,geometry
0,0,82-riverdale-drive,"true,",43.7337,-79.5607,""""",",1100000.0,1195000.0,D.,2-Storey,...,,,,,,,,POINT (-79.56070 43.73370),2013-11-29,"POLYGON ((-79.54270 43.73370, -79.54279 43.731..."
1,1,185-honiton-street,"true,",43.7616,-79.4611,""""",",1008888.0,998000.0,D.,Sidesplit 4,...,,,,,,,,POINT (-79.46110 43.76160),2014-09-19,"POLYGON ((-79.44310 43.76160, -79.44319 43.759..."
2,2,75-mcadam-avenue,"true,",43.7214,-79.4623,""""",",830000.0,879000.0,D.,Bungaloft,...,,,,,,,,POINT (-79.46230 43.72140),2014-12-05,"POLYGON ((-79.44430 43.72140, -79.44439 43.719..."
3,3,155-mercury-road,"true,",43.7257,-79.5881,""""",",,,Detached,2-Storey,...,6.0,,,,,,,POINT (-79.58810 43.72570),2021-03-24,"POLYGON ((-79.57010 43.72570, -79.57019 43.723..."
4,4,227-woburn-avenue,"true,",43.7266,-79.4104,""""",",1425000.0,,Semi-Detached,2-Storey,...,1.0,,,,,,,POINT (-79.41040 43.72660),2020-08-18,"POLYGON ((-79.39240 43.72660, -79.39249 43.724..."


Now add the required columns to the houses_gdf, for now will the values with 0

In [17]:
houses_gdf["Education investment in area"] = 0
houses_gdf["Transportation investment in area"] = 0
houses_gdf["Colleges/Universities investment in area"] = 0
houses_gdf["Health investment in area"] = 0
houses_gdf["Other investment in area"] = 0

Finally, we can loop through all the infrastructure projects for all the houses and check if there are in that house's radius of influence and completed before that house was listed.  Then we can add the amount of budget, or money invested in that area, to that house.  Note: this takes a very long time. An iloc command has been added in lines 2 and 4 to only allow this loop to go through the first 50 infrastructure projects for the first 100 properties. If you would like to run on all the properties and projects, remove the iloc.  

In [18]:
# this loop checks if a project is in the radius of influence and before the houses list data
for index, inf_row in infrastructure_gdf.iloc[0:50].iterrows():
    is_within=[]
    for _, house_row in houses_gdf.iloc[0:100].iterrows():
        if inf_row['geometry'].within(house_row['geometry']) and (
            inf_row['Target Completion Date'] <= house_row['Date']):
            is_within.append(True)
        else:
            is_within.append(False)
            #indeces of the houses where the infrastructure exists in
            true_indices = [i for i, value in enumerate(is_within) if value]

# Adding the value to each column of the matching rows
for idx in true_indices:
    if inf_row['Supporting Ministry'] == "Education":
        houses_gdf.loc[idx, 'Education investment in area'] += inf_row['Estimated Total Budget']
    elif inf_row['Supporting Ministry'] == "Transportation":
        houses_gdf.loc[idx, 'Transportation investment in area'] += inf_row['Estimated Total Budget']
    elif inf_row['Supporting Ministry'] == "Colleges and Universities":
        houses_gdf.loc[idx, 'Colleges/Universities investment in area'] += inf_row['Estimated Total Budget']
    elif inf_row['Supporting Ministry'] == "Health":
        houses_gdf.loc[idx, 'Health investment in area'] += inf_row['Estimated Total Budget']
    else:
        houses_gdf.loc[idx, 'Other investment in area'] += inf_row['Estimated Total Budget']

houses_gdf.head()

Unnamed: 0.1,Unnamed: 0,address,isResidentialProperty,propertyLat,propertyLng,searchNeighborhood,Sold Price,Original Price,Type,Style,...,Ensuite Laundry,Property Type,point,Date,geometry,Education investment in area,Transportation investment in area,Colleges/Universities investment in area,Health investment in area,Other investment in area
0,0,82-riverdale-drive,"true,",43.7337,-79.5607,""""",",1100000.0,1195000.0,D.,2-Storey,...,,,POINT (-79.56070 43.73370),2013-11-29,"POLYGON ((-79.54270 43.73370, -79.54279 43.731...",0,0,0,0,0
1,1,185-honiton-street,"true,",43.7616,-79.4611,""""",",1008888.0,998000.0,D.,Sidesplit 4,...,,,POINT (-79.46110 43.76160),2014-09-19,"POLYGON ((-79.44310 43.76160, -79.44319 43.759...",0,0,0,0,0
2,2,75-mcadam-avenue,"true,",43.7214,-79.4623,""""",",830000.0,879000.0,D.,Bungaloft,...,,,POINT (-79.46230 43.72140),2014-12-05,"POLYGON ((-79.44430 43.72140, -79.44439 43.719...",0,0,0,0,0
3,3,155-mercury-road,"true,",43.7257,-79.5881,""""",",,,Detached,2-Storey,...,,,POINT (-79.58810 43.72570),2021-03-24,"POLYGON ((-79.57010 43.72570, -79.57019 43.723...",0,0,0,0,0
4,4,227-woburn-avenue,"true,",43.7266,-79.4104,""""",",1425000.0,,Semi-Detached,2-Storey,...,,,POINT (-79.41040 43.72660),2020-08-18,"POLYGON ((-79.39240 43.72660, -79.39249 43.724...",0,0,0,0,0


Now let's export our csv.  Note that the completed Houses_Infrastructure_2km.csv is provided in the repository, so there is no need to overwrite it if you did not go through all the houses and projects in the above loop.

In [None]:
# select relevent columns
houses_gdf = houses_gdf['address', 'Education investment in area', 'Transportation investment in area', 
                        'Colleges/Universities investment in area', 'Health investment in area',
                        'Other investment in area']
houses_gdf.to_csv('Houses_Infrastructure_2km.csv')