In [1]:
import pandas as pd
import geopandas as gpd
from tqdm.notebook import tqdm

In [2]:
# Default file locations, used when the notebook is run interactively
iris_input_file = "../data/raw/iris_2017/CONTOURS-IRIS.shp"
uber_spatial_input_file = "../data/raw/uber/paris_iris.json"
uber_monthly_input_file = "../data/raw/uber/paris-iris-2019-3-OnlyWeekdays-MonthlyAggregate.csv"
uber_hourly_input_file = "../data/raw/uber/paris-iris-2019-3-OnlyWeekdays-HourlyAggregate.csv"

daily_output_file = "../data/uber_daily.csv"
hourly_output_file = "../data/uber_hourly.csv"
zones_output_file = "../data/uber_zones.gpkg"

# If a `snakemake` object is present in the namespace, its declared inputs and
# outputs take precedence over the defaults above
if "snakemake" in locals():
    (
        iris_input_file,
        uber_spatial_input_file,
        uber_monthly_input_file,
        uber_hourly_input_file,
    ) = (snakemake.input[key] for key in ("iris", "spatial", "monthly", "hourly"))

    (
        daily_output_file,
        hourly_output_file,
        zones_output_file,
    ) = (snakemake.output[key] for key in ("daily", "hourly", "zones"))

In [27]:
# Load the IRIS contours, keep only the municipality code (INSEE_COM) and the
# geometry, and project everything to EPSG:2154
df_iris = (
    gpd.read_file(iris_input_file)
    .rename(columns = {"INSEE_COM": "municipality_id"})
    [["municipality_id", "geometry"]]
    .to_crs("epsg:2154")
)

In [28]:
# Load the Uber zones. Their IDs differ from the IRIS IDs, so each zone is
# reduced to a representative point that can later be located inside an IRIS polygon.
df_spatial = gpd.read_file(uber_spatial_input_file)
df_spatial = (
    df_spatial
    .assign(uber_id = df_spatial["MOVEMENT_ID"].astype(int))
    [["uber_id", "geometry"]]
    .to_crs("epsg:2154")
)
df_spatial["geometry"] = df_spatial["geometry"].representative_point()

In [29]:
# Match the IDs: attach to every Uber zone the municipality of the IRIS polygon
# that contains the zone's representative point.
# `predicate` replaces the `op` keyword, which geopandas deprecated in 0.10 and
# removed in later releases.
df_spatial = gpd.sjoin(df_spatial, df_iris, predicate = "within")
df_spatial = df_spatial[["uber_id", "municipality_id"]]
df_spatial

Unnamed: 0,uber_id,municipality_id
0,1,75101
1,2,75101
2,3,75101
3,4,75101
4,5,75101
...,...,...
5254,5255,95680
5255,5256,95680
5256,5257,95680
5257,5258,95682


In [6]:
# Every Uber zone must have been matched to at most one municipality
assert df_spatial["uber_id"].is_unique

In [7]:
# Load monthly data, keeping only the OD zone IDs and the mean travel time
monthly_columns = {
    "sourceid": "origin_uber_id",
    "dstid": "destination_uber_id",
    "mean_travel_time": "travel_time",
}

df_monthly = pd.read_csv(uber_monthly_input_file).rename(columns = monthly_columns)
df_monthly = df_monthly[["origin_uber_id", "destination_uber_id", "travel_time"]]

# Zone IDs are handled as integers so they match df_spatial["uber_id"]
df_monthly = df_monthly.astype({
    "origin_uber_id": int,
    "destination_uber_id": int,
})

In [8]:
# Merge in municipality information, once for the origin zone and once for the
# destination zone. The inner join drops records whose zone has no municipality.
for uber_column, municipality_column in (
    ("origin_uber_id", "origin_municipality_id"),
    ("destination_uber_id", "destination_municipality_id"),
):
    df_lookup = df_spatial.rename(columns = {
        "uber_id": uber_column,
        "municipality_id": municipality_column,
    })
    df_monthly = df_monthly.merge(df_lookup, on = uber_column, how = "inner")

In [9]:
# Average the travel time over each municipality OD pair.
# Only travel_time is aggregated (as is done for the hourly data): a bare
# .mean() would also average the Uber zone ID columns, which is meaningless
# and wasteful.
df_monthly = df_monthly.groupby([
    "origin_municipality_id", "destination_municipality_id"
])["travel_time"].mean().reset_index(name = "travel_time")

In [10]:
# Write the municipality-level travel times as a semicolon-separated CSV
df_monthly.to_csv(
    daily_output_file,
    columns = ["origin_municipality_id", "destination_municipality_id", "travel_time"],
    sep = ";",
    index = False,
)

In [11]:
# Load hourly data in chunks of one million rows so that progress can be shown
hourly_reader = pd.read_csv(
    uber_hourly_input_file,
    usecols = ["sourceid", "dstid", "hod", "mean_travel_time"],
    dtype = {"sourceid": int, "dstid": int, "hod": int, "mean_travel_time": float},
    chunksize = int(1e6),
)

# total = 44 is the hard-coded number of chunks announced to the progress bar
df_hourly = pd.concat([df_chunk for df_chunk in tqdm(hourly_reader, total = 44)])

  0%|          | 0/44 [00:00<?, ?it/s]

In [12]:
# Use the same column naming as for the monthly data; "hod" becomes "hour"
hourly_columns = {
    "sourceid": "origin_uber_id",
    "dstid": "destination_uber_id",
    "hod": "hour",
    "mean_travel_time": "travel_time",
}
df_hourly = df_hourly.rename(columns = hourly_columns)

In [13]:
# Count the hourly records available for each OD zone pair and keep only the
# pairs that have exactly 24 of them
records_per_pair = df_hourly.groupby(["origin_uber_id", "destination_uber_id"]).size()
df_hours = records_per_pair.reset_index(name = "hours")

has_full_day = df_hours["hours"] == 24
df_hours = df_hours[has_full_day]

In [14]:
# Keep only the hourly records whose OD pair is listed in df_hours.
# Each (origin, destination) pair is folded into a single integer key so the
# membership test is one isin() call. The keys are kept as standalone Series:
# nothing is assigned to df_hours (a filtered slice, where adding a column can
# trigger SettingWithCopyWarning), and no temporary column has to be added to
# and dropped from df_hourly.
key_multiplier = int(1e6)

# The combined key is only unambiguous if destination IDs fit below the multiplier
assert df_hourly["destination_uber_id"].between(0, key_multiplier - 1).all()

hourly_keys = key_multiplier * df_hourly["origin_uber_id"] + df_hourly["destination_uber_id"]
complete_keys = key_multiplier * df_hours["origin_uber_id"] + df_hours["destination_uber_id"]

df_hourly = df_hourly[hourly_keys.isin(complete_keys.unique())]

# Free the helpers, they are not needed anymore
del df_hours, hourly_keys, complete_keys

In [15]:
# Merge in municipality information, once for the origin zone and once for the
# destination zone. The inner join drops records whose zone has no municipality.
for uber_column, municipality_column in (
    ("origin_uber_id", "origin_municipality_id"),
    ("destination_uber_id", "destination_municipality_id"),
):
    df_lookup = df_spatial.rename(columns = {
        "uber_id": uber_column,
        "municipality_id": municipality_column,
    })
    df_hourly = df_hourly.merge(df_lookup, on = uber_column, how = "inner")

In [16]:
# Average the travel time per municipality OD pair and hour; the grouping keys
# are kept as regular columns
df_hourly = df_hourly.groupby(
    ["origin_municipality_id", "destination_municipality_id", "hour"],
    as_index = False,
)["travel_time"].mean()

In [17]:
# Write the hourly municipality-level travel times as a semicolon-separated CSV
df_hourly.to_csv(
    hourly_output_file,
    columns = ["origin_municipality_id", "destination_municipality_id", "hour", "travel_time"],
    sep = ";",
    index = False,
)

In [36]:
# Write the zones for matching: every IRIS row whose municipality was matched
# to at least one Uber zone, with the municipality ID converted to an integer.
# NOTE(review): df_iris is not dissolved per municipality here, so the output
# presumably has one geometry per IRIS rather than per municipality - confirm
# this is intended.
is_matched = df_iris["municipality_id"].isin(df_spatial["municipality_id"])

df_municipalities = df_iris[is_matched].copy()
df_municipalities["municipality_id"] = df_municipalities["municipality_id"].astype(int)

df_municipalities.to_file(zones_output_file, driver = "GPKG")