### The datasets in this article describe the Green View Index of the city of Helsinki, Finland, determined using Google Street View images. The datasets are in GeoPackage format, which is an open standard format for transferring geospatial information.

In [101]:
# The raw datasets came in geopackage format

import geopandas as gpd
import pandas as pd
fp = "C:/Users/Rasha/Downloads/greenery_points.gpkg"
data = gpd.read_file(fp)
type(data)

geopandas.geodataframe.GeoDataFrame

In [102]:
data.head(10)

Unnamed: 0,panoID,panoDate,longitude,lattitude,Gvi_Mean,geometry
0,0reUq0bM0lUe-4qtxnkL7A,2014-07,24.859928,60.207972,57.474167,POINT (24.85993 60.20797)
1,03RhquCGbfKkdvrcK2awoQ,2009-06,24.852589,60.183729,42.480104,POINT (24.85259 60.18373)
2,0RxL9q5IxsZlTGl-OY4Yvg,2009-06,24.867728,60.188102,47.016979,POINT (24.86773 60.18810)
3,0O18VDfOf43j-JiA1j5aIQ,2014-07,24.882412,60.212695,59.398021,POINT (24.88241 60.21270)
4,0HqCb--cxO4-G1ld4qft6Q,2014-07,24.855513,60.208596,39.601042,POINT (24.85551 60.20860)
5,0-7PhgghIwZOs-Im4phbbA,2009-07,24.858713,60.168149,51.988958,POINT (24.85871 60.16815)
6,00vEYd2d3G6d3S8iRxbK-w,2014-07,24.866081,60.217752,15.755313,POINT (24.86608 60.21775)
7,0L03PbOqF01axGdWO5eHJg,2014-07,24.855912,60.206752,12.361458,POINT (24.85591 60.20675)
8,045xK5goZO63e2Y6-tAGiA,2009-06,24.877635,60.199098,37.298646,POINT (24.87763 60.19910)
9,0eeLDgr4v0hdQdLlGvYcPw,2014-07,24.8546,60.217572,28.015208,POINT (24.85460 60.21757)


In [103]:
#Converting to csv

data.to_csv("C:/Users/Rasha/Downloads/greenery_points.csv")

In [104]:
data.shape

(92126, 6)

### Initially our raw datasets had 92126 instances and 6 features; the pano ID represents the unique ID assigned for the pano image from google, pano date represents the year and month the panorama was taken , longitude and latitude tells the location where the panorama was taken and green view index(GVI) mean describes the average greenery in that particular location. 

## Data Cleaning

In [105]:
#removing unwanted columns/features (basic data cleaning)

data = data.drop('panoID', axis = 1)
data = data.drop('geometry', axis = 1)

### The software that we would be using to plot and locate in our map supports only limited amount of data. That's why we would be removing some instances and also we've removed panoId and geometry column as it didn't allow us to coordinate with our map

In [106]:
data.head(5)

Unnamed: 0,panoDate,longitude,lattitude,Gvi_Mean
0,2014-07,24.859928,60.207972,57.474167
1,2009-06,24.852589,60.183729,42.480104
2,2009-06,24.867728,60.188102,47.016979
3,2014-07,24.882412,60.212695,59.398021
4,2014-07,24.855513,60.208596,39.601042


In [53]:
data.shape
#2 features are removed. So now we only have 4 features

(92126, 4)

In [54]:
data.dropna(how='any').shape
#This proves that we have no missing data in any of our instances

(92126, 4)

In [55]:
type(data)

geopandas.geodataframe.GeoDataFrame

In [56]:
data = data.sort_values(by = 'panoDate', ascending=True)
data.head(10)

Unnamed: 0,panoDate,longitude,lattitude,Gvi_Mean
60984,2009-05,25.078503,60.25476,25.386979
85649,2009-05,25.119039,60.205377,0.140521
77757,2009-05,25.158941,60.208746,15.35625
19448,2009-05,25.084323,60.264524,37.638542
42944,2009-05,25.12201,60.204892,13.208021
5129,2009-05,25.079586,60.255649,33.331146
86715,2009-05,25.078123,60.25473,26.276042
19456,2009-05,25.08444,60.264243,31.609896
58825,2009-05,25.070371,60.263041,49.571354
59608,2009-05,25.084536,60.264308,33.310208


In [57]:
cols = ['Pano_Date','Longitude','Latitude','Green_View_Index(Mean)']
data.columns = cols

In [58]:
data.index.rename('index', inplace=True)
data.to_csv("C:/Users/Rasha/Downloads/final_gvi.csv")

In [59]:
#Renaming indices
data = gpd.read_file("C:/Users/Rasha/Downloads/final_gvi.csv")
data.head()

Unnamed: 0,index,Pano_Date,Longitude,Latitude,Green_View_Index(Mean),geometry
0,60984,2009-05,25.078503,60.25476,25.3869791667,
1,85649,2009-05,25.119039,60.205377,0.140520833333,
2,77757,2009-05,25.158941,60.208746,15.35625,
3,19448,2009-05,25.084323,60.264524,37.6385416667,
4,42944,2009-05,25.12201,60.204892,13.2080208333,


In [66]:
data = data.drop('geometry', axis = 1)

In [67]:
data = data.drop('index', axis = 1)

In [69]:
data = data.drop('Unnamed: 0', axis = 1)

In [70]:
data.head()

Unnamed: 0,Pano_Date,Longitude,Latitude,Green_View_Index(Mean)
0,2009-05,25.078503,60.25476,25.386979
1,2009-05,25.119039,60.205377,0.140521
2,2009-05,25.158941,60.208746,15.35625
3,2009-05,25.084323,60.264524,37.638542
4,2009-05,25.12201,60.204892,13.208021


In [71]:
#brief description of each feature
data.dtypes

Pano_Date                  object
Longitude                 float64
Latitude                  float64
Green_View_Index(Mean)    float64
dtype: object

In [87]:
data = data.drop(data.index[4000:])
data.shape

(4000, 4)

In [88]:
cols = ['PanoDate','Longitude','Latitude','gvi']
data.columns = cols

In [89]:
data = data.sort_values(by = 'gvi', ascending=True)

In [91]:
#exporting csv file with proper indices
data.to_csv("C:/Users/Rasha/Downloads/final_gvi.csv")

In [92]:
data = pd.read_csv("C:/Users/Rasha/Downloads/final_gvi.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,PanoDate,Longitude,Latitude,gvi
0,0,2009-05,25.119039,60.205377,0.140521
1,1,2009-05,25.119803,60.205237,0.159375
2,2,2009-05,25.119497,60.205293,0.181354
3,3,2009-06,25.145106,60.207533,0.256979
4,4,2009-05,25.11965,60.205265,0.292604


In [107]:
data = pd.read_csv("C:/Users/Rasha/Downloads/final_gvi.csv")
data.head()

Unnamed: 0,PanoDate,Longitude,Latitude,gvi,Status
0,2009-05,25.119039,60.205377,0.140521,Yes
1,2009-05,25.119803,60.205237,0.159375,Yes
2,2009-05,25.119497,60.205293,0.181354,Yes
3,2009-06,25.145106,60.207533,0.256979,Yes
4,2009-05,25.11965,60.205265,0.292604,Yes


## Training

In [110]:
X = data.iloc[:, 3].values
y = data.iloc[:, -1].values

In [111]:
print(X)

[ 0.14052083  0.159375    0.18135417 ... 63.49416667 65.00291667
 65.40854167]


In [112]:
print(y)

['Yes' 'Yes' 'Yes' ... 'No' 'No' 'No']


In [122]:
print(type(X))


<class 'numpy.ndarray'>


In [114]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [115]:
print(y)

[1 1 1 ... 0 0 0]


### Splitting the dataset into training and test test

In [116]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [123]:
import numpy as np
X_train = np.reshape(X_train, (-1,1))
X_test = np.reshape(X_test, (-1,1))
y_train = np.reshape(y_train, (-1,1))
y_test = np.reshape(y_test, (-1,1))

### Training using KNN algorithm

In [124]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)

  return self._fit(X, y)


KNeighborsClassifier()

In [126]:
print(knn.predict([[28.92]]))

[1]


In [127]:
print(knn.predict([[30.99]]))

[0]


In [128]:
y_pred = knn.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[0 0]
 [1 1]
 [0 0]
 ...
 [0 0]
 [1 1]
 [1 1]]


### Confusion Matrix

In [130]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[432   1]
 [  0 367]]


### Accuracy

In [131]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.99875