In [124]:
# Import required libraries
import numpy as np, pandas as pd
from scipy import stats
from h3 import h3
from scipy import stats
import json
from shapely.ops import unary_union
from shapely.geometry import shape, Point, Polygon, MultiPolygon, LineString
import geopandas as gpd
from geopandas import GeoDataFrame
import warnings
import hashlib
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import itertools
from datetime import datetime


# Filter out annoying warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [126]:
#!wget https://www.duscloud.eu/s/pEGQo8yTKMpDd3G/download -O taxi_trips_with_weather_cleaned.csv
dataset = pd.read_csv("taxi_trips_with_weather_cleaned.csv")
dataset.head()

Unnamed: 0,trip_id,taxi_id,timestamp_start,timestamp_end,duration,distance,fare,tips,tolls,extras,...,dropoff_long,dropoff_location,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,precipitation_accumulation
0,0cd46aa628ea5f871630113eb8ba92e4,09a6eaba341c43fc653756deff43a50f,2020-01-01 00:00:00,2020-01-01 00:15:00,180.0,0.64,4.5,0.0,0.0,1.0,...,-87.626211,POINT (-87.6262105324 41.8991556134),-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,0.0
1,4fbe84436a116f11c51ef3306e7adf0c,0f66b306ebea0f05dd124bf958729b64,2020-01-01 00:00:00,2020-01-01 00:30:00,1723.0,1.13,12.0,0.0,0.0,0.0,...,-87.626215,POINT (-87.6262149064 41.8925077809),-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,0.0
2,08a197de2703972bad8877628b18bb30,5a3410255ea0041b967e74090838b14d,2020-01-01 00:00:00,2020-01-01 00:00:00,420.0,0.97,6.0,0.0,0.0,1.0,...,-87.618868,POINT (-87.6188683546 41.8909220259),-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,0.0
3,94f8c8a4624a061988e0f32e12be04c5,3049f212da83687eb7ea5cb9d3046d89,2020-01-01 00:00:00,2020-01-01 00:30:00,1320.0,8.21,16.75,3.85,0.0,2.0,...,-87.676356,POINT (-87.6763559892 41.9012069941),-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,0.0
4,dd25d5229b1ce93ea03b7c9357e6f4f9,d9d9d16f34141de38105a060abd8a90c,2020-01-01 00:00:00,2020-01-01 00:15:00,504.0,1.19,6.25,0.0,0.0,1.0,...,-87.626215,POINT (-87.6262149064 41.8925077809),-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,0.0


In [128]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3129456 entries, 0 to 3129455
Data columns (total 27 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   trip_id                     object 
 1   taxi_id                     object 
 2   timestamp_start             object 
 3   timestamp_end               object 
 4   duration                    float64
 5   distance                    float64
 6   fare                        float64
 7   tips                        float64
 8   tolls                       float64
 9   extras                      float64
 10  total                       float64
 11  payment_type                object 
 12  company                     object 
 13  pickup_lat                  float64
 14  pickup_long                 float64
 15  pickup_location             object 
 16  dropoff_lat                 float64
 17  dropoff_long                float64
 18  dropoff_location            object 
 19  temperature          

In [129]:
columns_to_keep = [
    "timestamp_start", "pickup_lat", "pickup_long",
    "temperature", "dew_point", "humidity", "wind_speed", "wind_gust", "pressure", "precipitation_rate"
]

# Copy the dataset and keep only the specified columns
d_c = dataset.copy()
d_c = d_c.loc[:, columns_to_keep]

# Display the first few rows of the new DataFrame to verify the changes
print(d_c.head())

       timestamp_start  pickup_lat  pickup_long  temperature  dew_point  \
0  2020-01-01 00:00:00   41.898332   -87.620763        -2.83      -5.33   
1  2020-01-01 00:00:00   41.892042   -87.631864        -2.83      -5.33   
2  2020-01-01 00:00:00   41.895033   -87.619711        -2.83      -5.33   
3  2020-01-01 00:00:00   41.965812   -87.655879        -2.83      -5.33   
4  2020-01-01 00:00:00   41.898332   -87.620763        -2.83      -5.33   

   humidity  wind_speed  wind_gust  pressure  precipitation_rate  
0      83.0        0.48       0.64    1.0088                 0.0  
1      83.0        0.48       0.64    1.0088                 0.0  
2      83.0        0.48       0.64    1.0088                 0.0  
3      83.0        0.48       0.64    1.0088                 0.0  
4      83.0        0.48       0.64    1.0088                 0.0  


In [130]:
# Convert pickup and dropoff locations to hexagons
d_c_8 = d_c.copy()
d_c_7 = d_c.copy()
d_c_6 = d_c.copy()
d_c_8['pickup_hex'] = d_c.apply(lambda row: h3.geo_to_h3(row['pickup_lat'], row['pickup_long'], resolution=8), axis=1)
d_c_7['pickup_hex'] = d_c.apply(lambda row: h3.geo_to_h3(row['pickup_lat'], row['pickup_long'], resolution=7), axis=1)
d_c_6['pickup_hex'] = d_c.apply(lambda row: h3.geo_to_h3(row['pickup_lat'], row['pickup_long'], resolution=6), axis=1)

# https://www.chicago.gov/content/dam/city/depts/cdph/clinic/general/WIC/WIC%20Clinics%20Holiday%20Hours%202020.pdf
public_holidays_2020 = [
    "2020-01-01",  # New Year's Day
    "2020-01-20",  # Martin Luther King Jr. Day
    "2020-02-12",  # Abraham Lincoln's Birthday
    "2020-02-17",  # Presidents' Day
    "2020-03-02",  # Casimir Pulaski Day
    "2020-05-25",  # Memorial Day
    "2020-07-04",  # Independence Day
    "2020-09-07",  # Labor Day
    "2020-10-12",  # Columbus Day
    "2020-11-11",  # Veterans Day
    "2020-11-26",  # Thanksgiving Day
    "2020-12-25"   # Christmas Day
]
public_holidays_2020 = [pd.to_datetime(date).date() for date in public_holidays_2020]

def addTimeFeatures(d_c, public_holidays_2020 = public_holidays_2020):
    # Convert timestamp columns to datetime
    d_c['timestamp_start'] = pd.to_datetime(d_c['timestamp_start'])
    
    # Extract time features
    d_c['hour_of_day'] = d_c['timestamp_start'].dt.hour
    d_c['day_of_week'] = d_c['timestamp_start'].dt.dayofweek
    d_c['date'] = d_c['timestamp_start'].dt.date
    d_c['IsWeekday'] = d_c['day_of_week'].apply(lambda x: 1 if x < 5 else 0)
    d_c['IsHoliday'] = d_c['date'].apply(lambda x: 1 if x in public_holidays_2020 else 0)
    
    d_c.drop(columns=['timestamp_start', 'pickup_lat', 'pickup_long'], inplace=True)

addTimeFeatures(d_c_8)
addTimeFeatures(d_c_7)
addTimeFeatures(d_c_6)

d_c_8.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday
0,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1
1,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1
2,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1
3,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1
4,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1


In [131]:
def weatherMean(d_c):
    d_c['temperature'] = d_c.groupby(['hour_of_day', 'date'])['temperature'].transform('mean')
    d_c['dew_point'] = d_c.groupby(['hour_of_day', 'date'])['dew_point'].transform('mean')
    d_c['humidity'] = d_c.groupby(['hour_of_day', 'date'])['humidity'].transform('mean')
    d_c['wind_speed'] = d_c.groupby(['hour_of_day', 'date'])['wind_speed'].transform('mean')
    d_c['pressure'] = d_c.groupby(['hour_of_day', 'date'])['pressure'].transform('mean')
    d_c['precipitation_rate'] = d_c.groupby(['hour_of_day', 'date'])['precipitation_rate'].transform('mean')

    return d_c

d_c_8 = weatherMean(d_c_8)
d_c_7 = weatherMean(d_c_7)
d_c_6 = weatherMean(d_c_6) 
d_c_8.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1
4,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1


In [136]:
def calculateDemand(d_c):
    # Calculate demand per hexagon per hour per date
    taxi_demand = d_c.groupby(['pickup_hex', 'date', 'hour_of_day']).size().reset_index(name='demand')
    
    # Merge the demand back into the original DataFrame, dropping duplicates
    d_c = d_c.merge(taxi_demand, on=['pickup_hex', 'date', 'hour_of_day'], how='left')
    d_c = d_c.drop_duplicates(subset=['pickup_hex', 'date', 'hour_of_day'])

    return d_c

d_c_8 = calculateDemand(d_c_8)
d_c_7 = calculateDemand(d_c_7)
d_c_6 = calculateDemand(d_c_6)
d_c_8.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday,demand
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1,37
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1,88
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1,16
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1,14
5,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664520bfffff,0,2,2020-01-01,1,1,6


In [137]:
# save as pickle
with open('taxi_8.pkl', 'wb') as file:
    pickle.dump(d_c_8, file)

with open('taxi_7.pkl', 'wb') as file:
    pickle.dump(d_c_7, file)

with open('taxi_6.pkl', 'wb') as file:
    pickle.dump(d_c_6, file)


In [144]:
!curl -k -T taxi_6.pkl -u "TRZ9AAxbZGTsQ4t:Duracell_AAA_2024!" -H 'X-Requested-With: XMLHttpRequest' https://www.duscloud.eu/public.php/webdav/taxi_6.pkl

!curl -k -T taxi_7.pkl -u "TRZ9AAxbZGTsQ4t:Duracell_AAA_2024!" -H 'X-Requested-With: XMLHttpRequest' https://www.duscloud.eu/public.php/webdav/taxi_7.pkl

!curl -k -T taxi_8.pkl -u "TRZ9AAxbZGTsQ4t:Duracell_AAA_2024!" -H 'X-Requested-With: XMLHttpRequest' https://www.duscloud.eu/public.php/webdav/taxi_8.pkl



## ADD POI

In [240]:
with open('8_pois.pkl', 'rb') as file:
    df_poi_8 = pickle.load(file)
with open('7_pois.pkl', 'rb') as file:
    df_poi_7 = pickle.load(file)
with open('6_pois.pkl', 'rb') as file:
    df_poi_6 = pickle.load(file)

In [241]:
df_poi_8.head()

Unnamed: 0,geometry,community,NO_AMENITY,bar,cafe,cinema,clinic,library,restaurant,school,theatre,university
0,"POLYGON ((-87.74020 41.74101, -87.74621 41.739...",ASHBURN,1,0,0,0,0,0,0,0,0,0
1,"POLYGON ((-87.79815 41.78150, -87.80417 41.779...",CLEARING,1,0,0,0,0,0,0,0,0,0
2,"POLYGON ((-87.79150 41.78796, -87.79752 41.786...",CLEARING,1,0,0,0,0,0,0,0,0,0
3,"POLYGON ((-87.78675 41.78048, -87.79277 41.778...",CLEARING,0,0,0,0,0,0,1,0,0,0
4,"POLYGON ((-87.73354 41.74746, -87.73956 41.745...",ASHBURN,1,0,0,0,0,0,0,0,0,0


In [242]:
def polygon_to_hex(polygon, resolution):
    # Get the centroid of the polygon
    centroid = polygon.centroid
    # Convert the centroid to an H3 hexagon ID
    hex_id = h3.geo_to_h3(centroid.y, centroid.x, resolution)
    return hex_id

# Apply the function to the 'geometry' column
df_poi_8['hex_id'] = df_poi_8['geometry'].apply(polygon_to_hex, resolution = 8)
df_poi_7['hex_id'] = df_poi_7['geometry'].apply(polygon_to_hex, resolution = 7)
df_poi_6['hex_id'] = df_poi_6['geometry'].apply(polygon_to_hex, resolution = 6)

df_poi_6.head()

Unnamed: 0,geometry,community,bar,cafe,cinema,clinic,library,restaurant,school,theatre,university,hex_id
0,"POLYGON ((-87.70793 41.72398, -87.75002 41.711...",MORGAN PARK,5,1,0,0,0,1,2,0,0,862664577ffffff
1,"POLYGON ((-87.69449 41.82154, -87.73667 41.808...",CHICAGO LAWN,11,4,0,4,1,15,20,0,0,862664cd7ffffff
2,"POLYGON ((-87.77440 41.82876, -87.81657 41.816...",CLEARING,7,9,0,0,0,10,9,0,0,862664527ffffff
3,"POLYGON ((-87.76108 41.92644, -87.80334 41.913...",WEST GARFIELD PARK,4,3,0,1,1,2,27,0,0,862664c87ffffff
4,"POLYGON ((-87.79449 41.97892, -87.83678 41.966...",AUSTIN,15,14,0,2,2,13,16,1,0,862664cb7ffffff


In [243]:
merged_df_8 = pd.merge(d_c_8, df_poi_8, how='left', left_on='pickup_hex', right_on='hex_id')
merged_df_7 = pd.merge(d_c_7, df_poi_7, how='left', left_on='pickup_hex', right_on='hex_id')
merged_df_6 = pd.merge(d_c_6, df_poi_6, how='left', left_on='pickup_hex', right_on='hex_id')

merged_df_8 = merged_df_8.drop(columns=["community", "geometry", "pickup_hex"])
merged_df_7 = merged_df_7.drop(columns=["community", "geometry", "pickup_hex"])
merged_df_6 = merged_df_6.drop(columns=["community", "geometry", "pickup_hex"])

In [244]:
merged_df_8

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,hour_of_day,day_of_week,date,...,bar,cafe,cinema,clinic,library,restaurant,school,theatre,university,hex_id
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,,,,,,,,,,
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,35.0,17.0,0.0,0.0,0.0,94.0,0.0,0.0,1.0,882664c1e7fffff
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,4.0,9.0,1.0,1.0,1.0,14.0,1.0,0.0,0.0,882664c1ebfffff
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,3.0,4.0,0.0,1.0,0.0,12.0,1.0,0.0,0.0,882664d897fffff
4,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,882664520bfffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340869,-2.940000,-7.500000,71.000000,0.16000,0.16,1.0030,0.0,23,3,2020-12-31,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,882664cc4dfffff
340870,-2.940000,-7.500000,71.000000,0.16000,0.16,1.0030,0.0,23,3,2020-12-31,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,882664cce7fffff
340871,-2.940000,-7.500000,71.000000,0.16000,0.16,1.0030,0.0,23,3,2020-12-31,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,882664c80bfffff
340872,-2.940000,-7.500000,71.000000,0.16000,0.16,1.0030,0.0,23,3,2020-12-31,...,0.0,1.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,882664cf55fffff


In [247]:
merged_df_8[merged_df_8["bar"].isna()]

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,hour_of_day,day_of_week,date,...,bar,cafe,cinema,clinic,library,restaurant,school,theatre,university,hex_id
0,-3.037468,-5.522874,83.291892,0.210450,0.64,1.008800,0.0,0,2,2020-01-01,...,,,,,,,,,,
46,-3.037468,-5.522874,83.291892,0.210450,0.32,1.008800,0.0,0,2,2020-01-01,...,,,,,,,,,,
62,-3.599012,-5.952331,83.721070,0.031354,0.32,1.008800,0.0,1,2,2020-01-01,...,,,,,,,,,,
92,-3.599012,-5.952331,83.721070,0.031354,0.00,1.008800,0.0,1,2,2020-01-01,...,,,,,,,,,,
139,-4.005184,-6.197798,85.000000,0.000000,0.00,1.009533,0.0,2,2,2020-01-01,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340325,-2.940000,-7.500000,71.000000,0.160000,0.16,1.003000,0.0,12,3,2020-12-31,...,,,,,,,,,,
340371,-2.940000,-7.500000,71.000000,0.160000,0.16,1.003000,0.0,13,3,2020-12-31,...,,,,,,,,,,
340447,-2.940000,-7.500000,71.000000,0.160000,0.16,1.003000,0.0,14,3,2020-12-31,...,,,,,,,,,,
340528,-2.940000,-7.500000,71.000000,0.160000,0.16,1.003000,0.0,16,3,2020-12-31,...,,,,,,,,,,


In [249]:
merged_df_7.isnull().sum()

temperature              0
dew_point                0
humidity                 0
wind_speed               0
wind_gust                0
pressure                 0
precipitation_rate       0
hour_of_day              0
day_of_week              0
date                     0
IsWeekday                0
IsHoliday                0
demand                   0
NO_AMENITY            8423
bar                   8423
cafe                  8423
cinema                8423
clinic                8423
library               8423
restaurant            8423
school                8423
theatre               8423
university            8423
hex_id                8423
dtype: int64

In [194]:
merged_df_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406797 entries, 0 to 406796
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   temperature         406797 non-null  float64
 1   dew_point           406797 non-null  float64
 2   humidity            406797 non-null  float64
 3   wind_speed          406797 non-null  float64
 4   wind_gust           406797 non-null  float64
 5   pressure            406797 non-null  float64
 6   precipitation_rate  406797 non-null  float64
 7   hour_of_day         406797 non-null  int32  
 8   day_of_week         406797 non-null  int32  
 9   date                406797 non-null  object 
 10  IsWeekday           406797 non-null  int64  
 11  IsHoliday           406797 non-null  int64  
 12  demand              406797 non-null  int64  
 13  NO_AMENITY          403076 non-null  float64
 14  bar                 403076 non-null  float64
 15  cafe                403076 non-nul

In [150]:
with open('merged_8.pkl', 'wb') as file:
    pickle.dump(merged_df_8, file)