In [195]:
# Loading the required libraries
import pandas as pd
import csv
import json
import matplotlib.pyplot as plt
import shapely
import numpy as np
from geojson import Feature, FeatureCollection, Point
from shapely import wkt
import ogr
from shapely.geometry import *
from shapely.geometry import Polygon, mapping
from shapely import affinity
from pandas.io.json import json_normalize
import pysal
import libpysal
from libpysal.weights import Queen, Rook, KNN, Kernel, DistanceBand
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [196]:
# Building footprint data of Zurich
with open('Intelligent-Architectural-Briefs\data\osm-Zurich.json', 'r') as f:
    # Loading the json data
     data = json.load(f)
# normalizing it and converting to pandas data frame
df = pd.json_normalize(data['building'])

# check the useful columns 
print(df.columns)

Index(['primitive', 'points', 'type', 'units.points',
       'attributes.materialProperties.color',
       'attributes.materialProperties.linewidth', 'attributes.latitude',
       'attributes.longitude', 'attributes.type', 'attributes.postcode',
       'attributes.street', 'attributes.number', 'attributes.name'],
      dtype='object')


In [171]:
# Converting the points into shapely geometries
df['geometry'] = df['points'].apply(lambda x : (Polygon(x)))

# Extracting various features using "geometry"
## Find the perimeter of the built area
df['perimeter'] = df['geometry'].apply(lambda x : x.length)
## Building a site around the built area with minimum bounding box
df['Site'] = df['geometry'].apply(lambda x : x.minimum_rotated_rectangle)
## Find the perimeter of the site
df['Site_Perimeter'] = df['Site'].apply(lambda x : x.length)
## Find the total area of the site
df['Site_Area'] = df['Site'].apply(lambda x : x.area)
## Find the area of the built area
df['area'] = df['geometry'].apply(lambda x : x.area)


In [172]:
# View the data for inspection 
df

Unnamed: 0,primitive,points,type,units.points,attributes.materialProperties.color,attributes.materialProperties.linewidth,attributes.latitude,attributes.longitude,attributes.type,attributes.postcode,attributes.street,attributes.number,attributes.name,geometry,centroid,perimeter,Site,Site_Perimeter,Site_Area,area
0,polyline,"[[267.10011759676763, 822.1515493830386, 0], [...",building,meters,#ff0000,2,47.372677,8.541634,university,8092,Clausiusstrasse,,,POLYGON Z ((267.1001175967676 822.151549383038...,POINT (262.5948768335204 850.8652517394615),136.684773,"POLYGON ((246.1876463333146 872.9311087873746,...",137.858872,769.554333,736.628958
1,polyline,"[[198.26282673674294, 831.0014489009072, 0], [...",building,meters,#ff0000,2,47.372677,8.541634,university,8092,Haldeneggsteig,4,,POLYGON Z ((198.2628267367429 831.001448900907...,POINT (221.9218649991518 841.425139350535),229.186815,"POLYGON ((201.169244773844 808.9818166776527, ...",190.682629,2233.313002,1290.511838
2,polyline,"[[407.11174456558854, 360.9103712297666, 0], [...",building,meters,#ff0000,2,47.372677,8.541634,university,,Leonhardstrasse,3436,,POLYGON Z ((407.1117445655885 360.910371229766...,POINT (388.670650134147 384.1357712134738),275.400619,"POLYGON ((406.0462750512248 425.5525551806486,...",270.355557,4449.633528,3763.043509
3,polyline,"[[369.1309898450515, 434.62613803315287, 0], [...",building,meters,#ff0000,2,47.372677,8.541634,university,,Leonhardstrasse,34,,POLYGON Z ((369.1309898450515 434.626138033152...,POINT (360.9668668988073 441.9778624599625),60.046485,"POLYGON ((352.8010530732262 449.3338328020314,...",60.056104,209.258831,209.223854
4,polyline,"[[736.235565294488, 419.0080134750564, 0], [74...",building,meters,#ff0000,2,47.372677,8.541634,,8091,Gloriastrasse,29,,POLYGON Z ((736.235565294488 419.0080134750564...,POINT (761.3655117874878 434.2419529028348),235.460872,"POLYGON ((745.4535947729886 398.4915356516141,...",238.947627,3413.529070,1714.459847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,polyline,"[[217.06473270592105, 1041.6067935323495, 0], ...",building,meters,#ff0000,2,47.372677,8.541634,,,Sumatrastrasse,40,,POLYGON Z ((217.064732705921 1041.606793532349...,POINT (230.0525554548207 1037.812822975954),72.082840,"POLYGON ((243.2048544302884 1035.664636668452,...",75.293962,349.339994,313.578665
1095,polyline,"[[693.7691914674982, 410.62565581844643, 0], [...",building,meters,#ff0000,2,47.372677,8.541634,hospital,,Gloriastrasse,27b,,POLYGON Z ((693.7691914674982 410.625655818446...,POINT (664.9989502495351 408.6389725522163),216.492629,"POLYGON ((656.9522115875819 374.6750584429823,...",218.543632,2792.125059,1972.702843
1096,polyline,"[[205.89968750122625, 877.466204357613, 0], [2...",building,meters,#ff0000,2,47.372677,8.541634,apartments,8006,Weinbergstrasse,46,,POLYGON Z ((205.8996875012263 877.466204357613...,POINT (195.6057645321793 882.2252818655875),56.684564,"POLYGON ((205.8996875012263 877.4662043576129,...",62.336981,242.080293,183.584754
1097,polyline,"[[495.4294384025294, 124.5790922759327, 0], [5...",building,meters,#ff0000,2,47.372677,8.541634,roof,,,,,POLYGON Z ((495.4294384025294 124.579092275932...,POINT (500.8839228664368 130.6848845342281),45.450707,"POLYGON ((495.1467933311993 124.4933527303446,...",45.898593,122.540952,120.473152


In [173]:
# Finding the number of neighbors at different distances using pySAL library
## The number of immediate Neighbors - sharing edges
rW = Rook.from_dataframe(df)
df['neighbors'] = rW.cardinalities.values()

## Number of neigbhors in 200 mtr proximity
dis_band = DistanceBand.from_dataframe(df, threshold = 200)
df['neighbors_in_200'] = dis_band.cardinalities.values()

## Number of neigbhors in 100 mtr proximity
dis_band = DistanceBand.from_dataframe(df, threshold = 100)
df['neighbors_in_100'] = dis_band.cardinalities.values()

## Number of neigbhors in 50 mtr proximity
dis_band = DistanceBand.from_dataframe(df, threshold = 50)
df['neighbors_in_50'] = dis_band.cardinalities.values()

 There are 493 disconnected components.
 There are 306 islands with ids: 0, 1, 3, 6, 10, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28, 31, 34, 35, 44, 45, 46, 48, 49, 50, 51, 53, 56, 57, 59, 61, 63, 64, 65, 66, 69, 73, 75, 77, 78, 80, 91, 93, 94, 95, 99, 100, 101, 102, 103, 104, 106, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 127, 131, 133, 134, 136, 141, 143, 144, 145, 148, 154, 155, 158, 162, 171, 172, 174, 182, 185, 186, 236, 265, 266, 269, 270, 271, 272, 274, 277, 280, 282, 283, 298, 331, 332, 333, 335, 338, 341, 342, 345, 347, 348, 349, 350, 358, 359, 361, 362, 363, 365, 366, 367, 370, 372, 376, 377, 378, 379, 381, 382, 386, 387, 388, 395, 403, 406, 407, 412, 417, 421, 422, 423, 425, 428, 436, 444, 447, 454, 462, 469, 470, 476, 480, 486, 488, 489, 490, 493, 495, 498, 501, 505, 506, 508, 509, 510, 511, 513, 515, 516, 517, 518, 520, 521, 527, 530, 532, 535, 536, 540, 542, 543, 544, 546, 547, 548, 552, 553, 555, 557, 558, 561, 564, 566, 568

In [174]:
# Select the Important and required features from the dataset
df = df[['attributes.type','area', 'attributes.street', 'attributes.postcode', 'neighbors_in_200', 'neighbors', 'Site_Perimeter', 'Site_Area', 'perimeter', 'neighbors_in_100','neighbors_in_50']]

In [175]:
# Check the number of NA values in each column
df.isnull().sum()

attributes.type        975
area                     0
attributes.street      134
attributes.postcode     89
neighbors_in_200         0
neighbors                0
Site_Perimeter           0
Site_Area                0
perimeter                0
neighbors_in_100         0
neighbors_in_50          0
dtype: int64

In [176]:
# Remove the NA values from the dataset
df1 = df.dropna()

In [178]:
# Convert the character variables (Building Programs, Streets, Postcode) into first categorical
# and then street and postcode to numerical too, we will not the target variable (Building Program / attributes.type) 
# to numerical

df1['attributes.type'] = df1['attributes.type'].astype('category')

df1['attributes.street'] = df1['attributes.street'].astype('category')
df1['attributes.street'] = df1['attributes.street'].cat.codes

df1['attributes.postcode'] = df1['attributes.postcode'].astype('category')
df1['attributes.postcode'] = df1['attributes.postcode'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy o

In [179]:
# Inspect the Format of the features of the new Data
df1.dtypes

attributes.type        category
area                    float64
attributes.street          int8
attributes.postcode        int8
neighbors_in_200          int32
neighbors                 int32
Site_Perimeter          float64
Site_Area               float64
perimeter               float64
neighbors_in_100          int32
neighbors_in_50           int32
dtype: object

In [180]:
# Divide the data into features and labels(Building Programs)
y = df1['attributes.type']
df1.drop(['attributes.type'],axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [181]:
# Divide the dataset into training and testing data; X_train - training features, X_test - testing features, y_train - training labels, y_test - testing features
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.2)

In [182]:
clf = RandomForestClassifier(random_state=0)
param_grid = {'n_estimators': np.arange(50, 200)}
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, verbose=3, return_train_score=True)

In [183]:
# perform grid search, to find the optimal parameter
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[CV] n_estimators=50 .................................................
[CV] . n_estimators=50, score=(train=1.000, test=0.333), total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] . n_estimators=50, score=(train=1.000, test=0.429), total=   0.1s
[CV] n_estimators=50 .................................................
[CV] . n_estimators=50, score=(train=1.000, test=0.643), total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] . n_estimators=50, score=(train=1.000, test=0.500), total=   0.1s
[CV] n_estimators=50 .................................................
[CV] . n_estimators=50, score=(train=1.000, test=0.429), total=   0.1s
[CV] n_estimators=51 .................................................
[CV] . n_estimators=51, score=(train=1.000, test=0.333), total=   0.1s
[CV] n_estimators=51 .................................................
[CV] . n_estimators=51, score=(train=1.000, test=0.429), total=   0.1s
[CV] n_estimators=51 .................................................
[CV] . n_estimators=51, score=(train=1.000, test=0.643), total=   0.1s
[CV] n_estimators=51 .................................................
[CV] . n_estimators=51, score=(train=1.000, test=0.500), total=   0.1s
[CV] n_estimators=51 .................................................
[CV] . n_estimators=51, score=(train=1.000, test=0.429), total=   0.1s
[CV] n_estimators=52 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done 750 out of 750 | elapsed:  4.1min finished


In [185]:
# what was the best parameter ?
print(grid_search.best_params_)

{'n_estimators': 53}


In [194]:
# Predict the building programs for test data set
y_pred = grid_search.predict(X_test)

# Get the Accuracy Score
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

Accuracy: 0.6111


In [187]:
# to get a detailed classification report of the ML model we can use classification_report() from the metric module
classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n  apartments       0.71      1.00      0.83         5\n  commercial       0.00      0.00      0.00         2\n    hospital       1.00      0.67      0.80         3\n      office       1.00      0.50      0.67         2\n      public       0.00      0.00      0.00         1\n      school       0.40      0.67      0.50         3\n  university       0.33      0.50      0.40         2\n\n    accuracy                           0.61        18\n   macro avg       0.49      0.48      0.46        18\nweighted avg       0.58      0.61      0.57        18\n'

In [188]:
# Confusion matrix for mapping the actual values to the predicted ones
confusion_matrix(y_test, y_pred)

array([[5, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0],
       [0, 0, 2, 0, 0, 0, 1],
       [1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 2, 1],
       [1, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [190]:
# 2 columns showing the real building program and the predicted one
output = pd.DataFrame({'Real-Programs': y_test, 'Predicted-Programs': y_pred})
output

Unnamed: 0,Real-Programs,Predicted-Programs
275,school,school
9,apartments,apartments
116,apartments,apartments
566,university,apartments
1085,hospital,university
616,commercial,school
1083,hospital,hospital
580,hospital,hospital
6,office,apartments
694,school,school
