After deriving a traffic density estimate from our Envirocar data, we can now build a model using open traffic data of Münster city (found [here](https://traffics.codeformuenster.org/)). Several models will be tested.

# LOAD DATA
For better accessibility all data is saved as shape files.

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as cx
import pandas as pd
from mpl_toolkits.axes_grid1 import make_axes_locatable
from shapely.geometry import Point
import os

In [None]:
# Root folder of all input and result files: "<current working directory>/data".
working_dir = os.getcwd()
filepath = os.path.join(working_dir, "data")

## Traffic density estimation

In [None]:
# Load the 2018 traffic density estimate that was derived from the EnviroCar tracks.
density_file = os.path.join(filepath, "results", "muenster_2018_est_traffic_density.shp")
results = gpd.read_file(density_file)

# The column headers got clipped when the file was saved -> restore the full names.
# NOTE(review): the clipped names are mapped to weekend / weekday / total in this order;
# confirm against the notebook that wrote the file.
full_column_names = {
    "num_tracks": "num_tracks_weekend",
    "num_trac_1": "num_tracks_weekday",
    "num_trac_2": "num_tracks_total",
}
results = results.rename(columns=full_column_names)
results.head()

In [None]:
# To see more clearly, keep only the streets that have at least one track on them.
# .copy() makes the filtered frame independent of the unfiltered one, so that adding
# columns to `results` later in the notebook does not raise pandas'
# SettingWithCopyWarning.
results = results.loc[results["num_tracks_total"] > 0].copy()

## Public DTV data

We will use publicly available DTV data for Münster ([here](https://traffics.codeformuenster.org/)) to calibrate our model. In total, there are 10 measuring stations.

In [None]:
# Public DTV counts (cars per day) of the measuring stations, used for comparison.
DTV = pd.read_csv(os.path.join(filepath, "dtv_muenster.csv"), sep=";")

# The count columns hold text: remove the blanks and turn the decimal comma into a
# decimal point, then cast the cleaned strings to float.
for count_col in ("dtv_weekend", "dtv_weekdays", "dtv"):
    without_blanks = DTV[count_col].str.replace(" ", "")
    DTV[count_col] = without_blanks.str.replace(",", ".").astype(float)

DTV

# PREPARE DTV DATA

## Retrieve the exact location of DTV measuring stations in Münster 

In [None]:
# Crossings where the DTV data was measured.
# NOTE(review): names and coordinates are assigned by position, so both lists must
# follow the row order of the DTV table -- confirm against dtv_muenster.csv.
DTV["crossings"] = [
    "Neutor / Wilhelmstr.",
    "Wolbecker Str. / Dortmunder Str.",
    "Hüfferstr. / Badestr.",
    "Hammer Str. / Geiststr.",
    "Eisenbahnstr. / Eisenbahnstr.",
    "Gartenstr. / Bohlweg",
    "Warendorfer Str. / Piusallee",
    "Hafenstr. / Von-Steuben-Str.",
    "Weseler Str. / Kolde-Ring",
    "Hansaring / Albersloher Weg",
]

# Spell out the abbreviated street names. regex=False forces literal matching:
# depending on the pandas version the default is regex matching, where the "." in
# "Str." / "str." would match ANY character instead of only the abbreviation dot.
DTV["crossings"] = (
    DTV["crossings"]
    .str.replace("Str.", "Straße", regex=False)
    .str.replace("str.", "straße", regex=False)
)

# Station locations as (x, y) pairs; the notebook later declares them as EPSG:4326,
# i.e. x is the longitude and y the latitude.
station_xy = [
    (7.61508, 51.96729),
    (7.64259, 51.95656),
    (7.61474, 51.96196),
    (7.6231, 51.94485),
    (7.63521, 51.96101),
    (7.63425, 51.96564),
    (7.63631, 51.96188),
    (7.63139, 51.95398),
    (7.6166, 51.94953),
    (7.63661, 51.95289),
]
coordinates = [Point(x, y) for x, y in station_xy]
DTV["geometry"] = coordinates
DTV.head(10)

In [None]:
# Turn the table into a GeoDataFrame, declare the station points as EPSG:4326
# and reproject them to EPSG:3857.
DTV = (
    gpd.GeoDataFrame(DTV)
    .set_crs(4326, allow_override=True)
    .to_crs(3857)
)
DTV.crs

In [None]:
# Interactive map showing where the measuring points are located.
# NOTE(review): the Stamen-hosted tile sets were retired in 2023; newer releases of
# the mapping stack may no longer resolve "Stamen TonerLite" -- confirm and switch
# to another tile provider if the map fails to render.
station_marker = {"radius": 5}
DTV.explore(color="red", tiles="Stamen TonerLite", marker_kwds=station_marker)

All measuring points lie within the city centre and on fairly big roads.

## Match the crossing (measuring station) to estimated traffic density (one road segment)

In [None]:
# Find the road segment belonging to each crossing by buffering the roads.
# The buffer distance (15) is given in units of the layer's CRS.
# NOTE(review): assumes `results` uses a metric CRS that matches DTV (EPSG:3857) --
# confirm, otherwise the buffer size and the spatial join are not meaningful.
matched_roads_buffered = results.copy(deep=True)
buffered_geometry = matched_roads_buffered["geometry"].buffer(distance=15, cap_style=2)
matched_roads_buffered["geometry"] = buffered_geometry

# One spatial join attaches the total / weekday / weekend track counts of every
# intersecting buffered road to the stations; how="right" keeps all rows of DTV.
crossing_concise = gpd.sjoin(
    matched_roads_buffered,
    DTV,
    how="right",
    predicate="intersects",
)
crossing_concise.head()

In [None]:
# Average the track counts (total, weekend, weekday) of all road segments matched to
# the same measuring point -- this is our estimated traffic density per station and
# provides the x values of the linear regression.
track_cols = ["num_tracks_weekend", "num_tracks_weekday", "num_tracks_total"]
est = crossing_concise.groupby("road")[track_cols].mean()
est

In [None]:
# Attach the single-segment estimates to the DTV table for easier handling.
# Mark the columns as "_single" first so they can be told apart from other estimates.
est.columns = [
    "num_tracks_weekend_single",
    "num_tracks_weekday_single",
    "num_tracks_total_single",
]
# `est` is indexed by road name, so it is matched against DTV's "road" column.
DTV = DTV.join(other=est, on="road", how="left")
DTV

For 6 of the 10 measuring locations, the number of tracks on the corresponding road segment could be determined. Let's take a look at the unmatched points:

In [None]:
# Draw all road segments (with tracks) whose name belongs to the station in row 1
# and point an arrow at the station itself.
fig, ax = plt.subplots(1, 1, figsize=(20, 15))

station_roads = DTV["road"][1].split(" / ")
on_station_roads = results["name"].isin(station_roads)
results.loc[on_station_roads].plot(ax=ax, color="blue")
#cx.add_basemap(ax=ax, crs=results.crs, source=cx.providers.OpenStreetMap.BlackAndWhite)

# The annotation text is invisible (color="None"); only the red arrow is shown.
station_point = DTV.geometry[1]
arrow_style = {"arrowstyle": "->", "connectionstyle": "arc3", "color": "r"}
ax.annotate(
    ".",
    xy=(station_point.x, station_point.y),
    xytext=(20, 20),
    color="None",
    textcoords="offset points",
    arrowprops=arrow_style,
)

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side overview of the measuring stations and the road segments with tracks.
# NOTE(review): the Stamen-hosted tiles were retired in 2023; newer contextily /
# xyzservices releases may no longer offer `providers.Stamen` -- confirm.
fig, ax = plt.subplots(1, 2, figsize=(15, 30))
left_panel, right_panel = ax
station_color = "#1f78b4"
road_color = "#b2df8a"

# Left panel: stations, basemap, wide semi-transparent roads, stations again on top.
DTV.plot(ax=left_panel, color=station_color)
cx.add_basemap(ax=left_panel, crs=DTV.crs, source=cx.providers.Stamen.TonerLite)
results.plot(ax=left_panel, alpha=0.5, color=road_color, linewidth=5)
DTV.plot(ax=left_panel, color=station_color)

# Right panel: roads, basemap, stations on top.
results.plot(ax=right_panel, color=road_color, linewidth=3)
cx.add_basemap(ax=right_panel, crs=results.crs, source=cx.providers.Stamen.TonerLite)
DTV.plot(ax=right_panel, color=station_color)

Clearly, there have been tracks on this road. However, not on the section we are interested in.


## Match the crossing (measuring station) to estimated traffic density (all road segments that correspond to both the roads on the crossing)

In [None]:
# Measuring points sit at the intersection of two streets, written as
# "<street a> / <street b>". Split every entry at " / " and keep the resulting list
# of street names in a new column.
street_lists = DTV["crossings"].str.split(" / ")
DTV["single_street"] = street_lists
DTV["single_street"]

In [None]:
# Expand the table along the new column: every street name in a "single_street" list
# gets its own row, with the values of all other columns repeated.
DTV_isol = DTV.explode("single_street")
DTV_isol

In [None]:
# Join the road segments to the exploded stations via the street name. The key is a
# regular column on both sides (not the index), hence merge instead of join.
# how="right" keeps every row of DTV_isol.
joined = results.merge(
    DTV_isol,
    how="right",
    left_on="name",
    right_on="single_street",
)

# Same aggregation as for the single-segment case: group by the original station
# name (the one with "/") and average the track counts.
track_cols = ["num_tracks_weekend", "num_tracks_weekday", "num_tracks_total"]
est = joined.groupby("road")[track_cols].mean()
est

In [None]:
# Mark the averaged counts as "_multi" and attach them to the DTV table.
est.columns = [
    "num_tracks_weekend_multi",
    "num_tracks_weekday_multi",
    "num_tracks_total_multi",
]
# `est` is indexed by road name, so it is matched against DTV's "road" column.
DTV = DTV.join(other=est, on="road", how="left")
DTV

# EXECUTE LINEAR REGRESSION

## Single segment
The first linear model will use only the segment on which the measuring station is located. We will compare models for total, weekend and weekday data, plus one combining weekday and weekend data, so four models in total.

1. dtv = a + b * num_tracks_total_single
2. dtv_weekdays = c + d * num_tracks_weekday_single
3. dtv_weekend = e + f * num_tracks_weekend_single
4. dtv = g + h * num_tracks_weekday_single + i * num_tracks_weekend_single

In [None]:
# Work on an independent copy: stations without a single-segment track count are
# dropped here, but the full DTV table is needed again later.
data = DTV.dropna(subset=["num_tracks_total_single"]).copy()
data

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the four single-segment models described above and store their predictions.
# The predictors are passed as DataFrames (double brackets), not as Series.

# model 1: dtv ~ total tracks
X_total = data[["num_tracks_total_single"]]
model_1 = LinearRegression().fit(X_total, data["dtv"])
data["model_1_pred"] = model_1.predict(X_total)

# model 2: dtv_weekdays ~ weekday tracks
X_weekday = data[["num_tracks_weekday_single"]]
model_2 = LinearRegression().fit(X_weekday, data["dtv_weekdays"])
data["model_2_pred"] = model_2.predict(X_weekday)

# model 3: dtv_weekend ~ weekend tracks
X_weekend = data[["num_tracks_weekend_single"]]
model_3 = LinearRegression().fit(X_weekend, data["dtv_weekend"])
data["model_3_pred"] = model_3.predict(X_weekend)

# model 4: dtv ~ weekday tracks + weekend tracks
X_both = data[["num_tracks_weekday_single", "num_tracks_weekend_single"]]
model_4 = LinearRegression().fit(X_both, data["dtv"])
data["model_4_pred"] = model_4.predict(X_both)

In [None]:
# Show the regression table, which now also holds the model_1_pred ... model_4_pred columns.
data

In [None]:
# Plot the observations (scatter) and the fitted values (line) of every model in a
# 2 x 2 grid; each panel title shows the score of its model on the training data.
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
ax = ax.flatten()
fig.suptitle("Models using a single street segment", fontsize=25)

point_color = "#1f78b4"
line_color = "#b2df8a"

# model 1
r2_model_1 = model_1.score(X=data[["num_tracks_total_single"]], y=data["dtv"])
data.plot(kind="scatter", x="num_tracks_total_single", y="dtv", ax=ax[0], color=point_color)
data.plot(
    kind="line",
    x="num_tracks_total_single",
    y="model_1_pred",
    ax=ax[0],
    color=line_color,
    title="Model 1, r² = %.4f" % r2_model_1,
)

# model 2
r2_model_2 = model_2.score(X=data[["num_tracks_weekday_single"]], y=data["dtv_weekdays"])
data.plot(kind="scatter", x="num_tracks_weekday_single", y="dtv_weekdays", ax=ax[1], color=point_color)
data.plot(
    kind="line",
    x="num_tracks_weekday_single",
    y="model_2_pred",
    ax=ax[1],
    color=line_color,
    title="Model 2, r² = %.4f" % r2_model_2,
)

# model 3
r2_model_3 = model_3.score(X=data[["num_tracks_weekend_single"]], y=data["dtv_weekend"])
data.plot(kind="scatter", x="num_tracks_weekend_single", y="dtv_weekend", ax=ax[2], color=point_color)
data.plot(
    kind="line",
    x="num_tracks_weekend_single",
    y="model_3_pred",
    ax=ax[2],
    color=line_color,
    title="Model 3, r² = %.4f" % r2_model_3,
)

# model 4
# NOTE(review): model 4 has two predictors but is drawn against the total track
# count, with the line connecting the predictions in the row order of `data`.
r2_model_4 = model_4.score(
    X=data[["num_tracks_weekday_single", "num_tracks_weekend_single"]],
    y=data["dtv"],
)
data.plot(kind="scatter", x="num_tracks_total_single", y="dtv", ax=ax[3], color=point_color)
data.plot(
    kind="line",
    x="num_tracks_total_single",
    y="model_4_pred",
    ax=ax[3],
    color=line_color,
    title="Model 4, r² = %.4f" % r2_model_4,
)

plt.tight_layout(pad=3.5)

## Multi segments
The models we will test are the same as above, just this time we will use our track data that was averaged over all street segments on streets that run through intersections (where the measuring station is located).

In [None]:
# Again work on an independent copy of the FULL DTV table and drop the stations
# without a multi-segment track count.
# Fix: the rows were previously taken from `data`, i.e. from the stations that
# survived the single-segment filter, which silently discarded every station that
# only has a multi-segment estimate.
data_multi = DTV.dropna(subset=["num_tracks_total_multi"]).copy()
data_multi

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the four multi-segment models (same structure as models 1-4) and store their
# predictions. The predictors are passed as DataFrames (double brackets), not Series.

# model 5: dtv ~ total tracks
X_total_multi = data_multi[["num_tracks_total_multi"]]
model_5 = LinearRegression().fit(X_total_multi, data_multi["dtv"])
data_multi["model_5_pred"] = model_5.predict(X_total_multi)

# model 6: dtv_weekdays ~ weekday tracks
X_weekday_multi = data_multi[["num_tracks_weekday_multi"]]
model_6 = LinearRegression().fit(X_weekday_multi, data_multi["dtv_weekdays"])
data_multi["model_6_pred"] = model_6.predict(X_weekday_multi)

# model 7: dtv_weekend ~ weekend tracks
X_weekend_multi = data_multi[["num_tracks_weekend_multi"]]
model_7 = LinearRegression().fit(X_weekend_multi, data_multi["dtv_weekend"])
data_multi["model_7_pred"] = model_7.predict(X_weekend_multi)

# model 8: dtv ~ weekday tracks + weekend tracks
X_both_multi = data_multi[["num_tracks_weekday_multi", "num_tracks_weekend_multi"]]
model_8 = LinearRegression().fit(X_both_multi, data_multi["dtv"])
data_multi["model_8_pred"] = model_8.predict(X_both_multi)

In [None]:
# Plot the observations (scatter) and the fitted values (line) of every multi-segment
# model in a 2 x 2 grid; each panel title shows the score of its model on the
# training data.
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
ax = ax.flatten()
fig.suptitle("Models using all street segments of streets on intersection", fontsize=25)

point_color = "#1f78b4"
line_color = "#b2df8a"

# model 5
r2_model_5 = model_5.score(X=data_multi[["num_tracks_total_multi"]], y=data_multi["dtv"])
data_multi.plot(kind="scatter", x="num_tracks_total_multi", y="dtv", ax=ax[0], color=point_color)
data_multi.plot(
    kind="line",
    x="num_tracks_total_multi",
    y="model_5_pred",
    ax=ax[0],
    color=line_color,
    title="Model 5, r² = %.4f" % r2_model_5,
)

# model 6
r2_model_6 = model_6.score(X=data_multi[["num_tracks_weekday_multi"]], y=data_multi["dtv_weekdays"])
data_multi.plot(kind="scatter", x="num_tracks_weekday_multi", y="dtv_weekdays", ax=ax[1], color=point_color)
data_multi.plot(
    kind="line",
    x="num_tracks_weekday_multi",
    y="model_6_pred",
    ax=ax[1],
    color=line_color,
    title="Model 6, r² = %.4f" % r2_model_6,
)

# model 7
r2_model_7 = model_7.score(X=data_multi[["num_tracks_weekend_multi"]], y=data_multi["dtv_weekend"])
data_multi.plot(kind="scatter", x="num_tracks_weekend_multi", y="dtv_weekend", ax=ax[2], color=point_color)
data_multi.plot(
    kind="line",
    x="num_tracks_weekend_multi",
    y="model_7_pred",
    ax=ax[2],
    color=line_color,
    title="Model 7, r² = %.4f" % r2_model_7,
)

# model 8
# NOTE(review): model 8 has two predictors but is drawn against the total track
# count, with the line connecting the predictions in the row order of `data_multi`.
r2_model_8 = model_8.score(
    X=data_multi[["num_tracks_weekday_multi", "num_tracks_weekend_multi"]],
    y=data_multi["dtv"],
)
data_multi.plot(kind="scatter", x="num_tracks_total_multi", y="dtv", ax=ax[3], color=point_color)
data_multi.plot(
    kind="line",
    x="num_tracks_total_multi",
    y="model_8_pred",
    ax=ax[3],
    color=line_color,
    title="Model 8, r² = %.4f" % r2_model_8,
)

plt.tight_layout(pad=3.5)

# APPLY THE MODEL

After applying the model, each street (also those with 0 tracks on them) is assigned an estimated DTV. Results will only be plotted for those streets with num_tracks > 0. We will use the best model found so far for total DTV, model 4.

In [None]:
# Apply model 4 (dtv ~ weekday tracks + weekend tracks) to every street segment.
# model_4 was fitted on the columns "num_tracks_weekday_single" and
# "num_tracks_weekend_single". scikit-learn compares the feature names seen during
# fit with those passed to predict (recent versions raise on a mismatch), so the
# per-segment columns are renamed to the fitted names, in the same order.
model_4_input = results[["num_tracks_weekday", "num_tracks_weekend"]].rename(
    columns={
        "num_tracks_weekday": "num_tracks_weekday_single",
        "num_tracks_weekend": "num_tracks_weekend_single",
    }
)
results["dtv_est"] = model_4.predict(model_4_input)
results.head(10)

In [None]:
# Map of the estimated DTV per street segment, coloured on a red scale.
# NOTE(review): `OpenStreetMap.BlackAndWhite` may not be available in newer
# contextily / xyzservices releases -- confirm the provider still exists.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

results.plot(column="dtv_est", cmap="Reds", legend=True, ax=ax)
basemap_source = cx.providers.OpenStreetMap.BlackAndWhite
cx.add_basemap(ax=ax, crs=results.crs, source=basemap_source)
ax.set_title("Schätzung des DTV anhand der linearen Regression", fontsize=15)

plt.tight_layout()
plt.show()

Apply model 1 (almost as good, but simpler):

In [None]:
# Apply model 1 (almost as good, but simpler): dtv ~ total tracks.
# Fix: this cell used to call model_4 again, so the "model 1" map showed the same
# estimate as the model 4 map.
# model_1 was fitted on the column "num_tracks_total_single"; the per-segment column
# is renamed accordingly because scikit-learn compares the feature names seen during
# fit with those passed to predict.
model_1_input = results[["num_tracks_total"]].rename(
    columns={"num_tracks_total": "num_tracks_total_single"}
)
# NOTE: this overwrites the "dtv_est" column written when model 4 was applied.
results["dtv_est"] = model_1.predict(model_1_input)
results.head(10)

# Plot results:
# NOTE(review): the Stamen-hosted tiles were retired in 2023; newer contextily /
# xyzservices releases may no longer offer `providers.Stamen` -- confirm.
fig, ax = plt.subplots(1, 1, figsize=(30, 10))

# Stations are drawn first, then the basemap, then the coloured roads, and finally
# the stations again so they stay visible on top.
DTV.plot(ax=ax, color="#1f78b4")
cx.add_basemap(ax=ax, crs=DTV.crs, source=cx.providers.Stamen.TonerLite)
# The colour scale is capped at 40000 (vmax).
results.plot(ax=ax, alpha=0.5, column="dtv_est", cmap="Reds", vmax=40000, legend=True, linewidth=5)
DTV.plot(ax=ax, color="#1f78b4")