## Cluster roads ##

For each street, calculate the historical average speed per time step.

Use k-means to cluster the streets based on these averages.

In [None]:
import yaml
import numpy as np
import pandas as pd

# Make the parent directory importable so that the project-local `src` package can be found.
# NOTE(review): this relies on sys.path[0] being the notebook's directory — confirm for the kernel in use.
import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))

from src import db_requests

In [None]:
## CONFIGURATIONS #####

# Database credentials are loaded from ../db.yaml
with open("../db.yaml") as credentials_file:
    db_credentials = yaml.safe_load(credentials_file)

# Clustering setup
source_city = "wolfsburg"
min_measurements = 3000
n_clusters = 15

# Directory the label files are written to
save_path = "../data"

# Time window handed to the traffic data request
train_date_begin, train_date_end = "2018-12-01", "2019-01-16"
#test_date_begin = "2019-01-17"
#test_date_end = "2019-01-31"
hour_from, hour_to = "7", "21"

In [None]:
# Fetch the traffic measurements, keep the full timestamp as "datetime"
# and add a separate "time" column holding only the time of day.

traffic = (
    db_requests.getTrafficDataMinMeasurements(
        source_city,
        min_measurements,
        train_date_begin,
        train_date_end,
        hour_from,
        hour_to,
        db_credentials,
    )
    .rename(columns={'time': 'datetime'})
    .assign(time=lambda frame: frame['datetime'].dt.time)
)

In [None]:
# Combined key "<day of week>-<time of day>" used later as the pivot columns
day_of_week = traffic['datetime'].dt.dayofweek.astype(str)
time_of_day = traffic["time"].astype(str)
traffic["dow-tod"] = day_of_week + "-" + time_of_day

In [None]:
# Display the traffic frame including the derived "time" and "dow-tod" columns
traffic

### Get historical averages of Roads ###

In [None]:
# We need the average speed for each street at each timestep:
# one row per id, one column per "dow-tod" slot, values are the mean speed.
# aggfunc is passed as the string "mean" because handing the np.mean callable to
# pivot_table is deprecated in recent pandas versions and emits a FutureWarning;
# the computed averages are the same.
traffic_pivot = pd.pivot_table(
    traffic,
    index=["id"],
    columns="dow-tod",
    values=["speed"],
    aggfunc="mean",
)


In [None]:
# Display the pivot: ids as rows, "dow-tod" slots as columns
traffic_pivot

### Cluster Roads ###

In [None]:
from sklearn.cluster import KMeans

In [None]:
# There are NaN values, which lead to errors with k-means, so fill them before clustering.
# NOTE(review): the fill value is 0 (this comment previously said 60). k-means treats a 0 as a
# real speed value — confirm that 0 is the intended imputation.
traffic_pivot = traffic_pivot.fillna(0)

In [None]:
# Feature matrix: one row per id, one column per "dow-tod" slot
X = traffic_pivot

# random_state is fixed for reproducibility. n_init is pinned explicitly because its default
# changed across scikit-learn releases (10 -> 'auto'); without it the resulting labels depend
# on the installed version and some versions emit a FutureWarning.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)

In [None]:
# One row per id (same index as the pivot) with the assigned k-means label in "cluster"
traffic_clustered = pd.DataFrame({'cluster': kmeans.labels_}, index=traffic_pivot.index)

In [None]:
# Write the id -> cluster mapping; the file name carries the city and the number of clusters
labels_with_k_path = save_path + f'/labels_{source_city}_{n_clusters}.csv'
traffic_clustered[['cluster']].to_csv(labels_with_k_path)

### Plot clusters ###

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Default figure size (width, height) for all following matplotlib plots
plt.rcParams['figure.figsize'] = [15, 5]

In [None]:
# Disabled exploration: fit k-means for k = 1..10 and plot the cluster centroids for each k.
# NOTE: if re-enabled, this loop overwrites the global `n_clusters` and `kmeans`.
#for i in range(1,11):
#    n_clusters = i


#    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(traffic_pivot)
#    centroids = pd.DataFrame(np.transpose(kmeans.cluster_centers_))
#    fig = centroids.plot()
#    fig.grid()
#    fig.set_title("k=%s" % i)

### Check Clusters ###

1. Get geometry column of streetgraph
2. Merge with traffic_clustered
3. Print clusters on map

In [None]:
# Load the street geometries from the db, indexed by id so they can be joined directly
roads_geom = db_requests.getGeometry(db_credentials).set_index('id')

# Right join: keep exactly the ids that were clustered and attach their geometry
traffic_clustered = roads_geom.join(traffic_clustered, how='right')

In [None]:
# Cluster labels as strings, so the map treats them as categories instead of numbers
traffic_clustered['cluster'] = traffic_clustered['cluster'].map(str)

# Map settings
explore_options = dict(
    column="cluster",   # colour the roads by their cluster label
    tooltip="cluster",  # show the cluster label on hover
    popup=True,         # show all values in popup (on click)
    cmap="Set1",        # use "Set1" matplotlib colormap
)
m = traffic_clustered.explore(**explore_options)

m

In [None]:
#m.save(str(n_clusters) + "_clusters.html")

In [None]:
# Check by looking at data

# Join the per-slot average speeds onto the clustered roads for inspection.
# traffic_pivot has two column levels (from values=["speed"] in the pivot) while
# traffic_clustered has only one; joining frames with different column levels warns in
# older pandas and raises a MergeError in recent versions. So the pivot columns are reduced
# to their innermost level ("dow-tod") first. A copy is used so traffic_pivot stays unchanged.
speed_profiles = traffic_pivot.copy()
if isinstance(speed_profiles.columns, pd.MultiIndex):
    speed_profiles.columns = speed_profiles.columns.get_level_values(-1)
traffic_clustered = traffic_clustered.join(speed_profiles)


In [None]:
# Inspect the roads assigned to one cluster (labels are strings at this point)
in_cluster_4 = traffic_clustered['cluster'] == "4"
traffic_clustered.loc[in_cluster_4]

The traffic clusters are mostly driven by similar speed levels, not by specific behaviour.

Idea: normalize the speed by the speed limit, i.e. cluster on the relative difference between speed and speed limit?

### Save the labels ###

Save the id and the corresponding cluster.

In [None]:
# Write the id -> cluster mapping for the source city.
# NOTE(review): unlike the earlier label export, this file name has no n_clusters suffix, so
# runs with different n_clusters write to the same file — confirm this is intended.
labels_path = save_path + f'/labels_{source_city}.csv'
traffic_clustered[['cluster']].to_csv(labels_path)