In [2]:
# Import required libraries
import numpy as np, pandas as pd
from scipy import stats
from h3 import h3
from scipy import stats
import json
from shapely.ops import unary_union
from shapely.geometry import shape, Point, Polygon, MultiPolygon, LineString
import geopandas as gpd
from geopandas import GeoDataFrame
import warnings
import hashlib
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import itertools
from datetime import datetime


# Silence FutureWarning messages only; every other warning category is still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Uncomment the next line to download the cleaned trips + weather CSV before the first run.
#!wget https://www.duscloud.eu/s/pEGQo8yTKMpDd3G/download -O taxi_trips_with_weather_cleaned.csv
# Read the whole CSV into memory; every later cell derives its data from `dataset`.
# NOTE(review): the saved output of this cell is a ParserError ("Calling read(nbytes)
# on source failed"), presumably because the local file was incomplete or unreadable
# at that time -- re-download the file and re-run before trusting later cells.
dataset = pd.read_csv("taxi_trips_with_weather_cleaned.csv")
dataset.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [256]:
# Summarize the raw table: row count, column names and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3129456 entries, 0 to 3129455
Data columns (total 27 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   trip_id                     object 
 1   taxi_id                     object 
 2   timestamp_start             object 
 3   timestamp_end               object 
 4   duration                    float64
 5   distance                    float64
 6   fare                        float64
 7   tips                        float64
 8   tolls                       float64
 9   extras                      float64
 10  total                       float64
 11  payment_type                object 
 12  company                     object 
 13  pickup_lat                  float64
 14  pickup_long                 float64
 15  pickup_location             object 
 16  dropoff_lat                 float64
 17  dropoff_long                float64
 18  dropoff_location            object 
 19  temperature          

In [261]:
# Columns used downstream: pickup time, pickup coordinates and the weather readings.
columns_to_keep = [
    "timestamp_start", "pickup_lat", "pickup_long",
    "temperature", "dew_point", "humidity", "wind_speed", "wind_gust", "pressure", "precipitation_rate"
]

# Select the columns first and copy only that slice. Copying the whole frame
# before selecting would duplicate every unused column in memory.
d_c = dataset.loc[:, columns_to_keep].copy()

# Show the first rows to verify the selection (rich display instead of print).
d_c.head()

       timestamp_start  pickup_lat  pickup_long  temperature  dew_point  \
0  2020-01-01 00:00:00   41.898332   -87.620763        -2.83      -5.33   
1  2020-01-01 00:00:00   41.892042   -87.631864        -2.83      -5.33   
2  2020-01-01 00:00:00   41.895033   -87.619711        -2.83      -5.33   
3  2020-01-01 00:00:00   41.965812   -87.655879        -2.83      -5.33   
4  2020-01-01 00:00:00   41.898332   -87.620763        -2.83      -5.33   

   humidity  wind_speed  wind_gust  pressure  precipitation_rate  
0      83.0        0.48       0.64    1.0088                 0.0  
1      83.0        0.48       0.64    1.0088                 0.0  
2      83.0        0.48       0.64    1.0088                 0.0  
3      83.0        0.48       0.64    1.0088                 0.0  
4      83.0        0.48       0.64    1.0088                 0.0  


In [262]:
# Convert pickup locations to H3 hexagon ids at resolutions 8, 7 and 6.
# (Only pickups are binned; the dropoff columns are not part of d_c.)
def _pickup_hexes(frame, resolution):
    """Return the H3 cell id of every pickup point in `frame` at `resolution`.

    Iterates the two coordinate columns directly instead of using
    DataFrame.apply(axis=1), which builds a Series object for every row.
    """
    return [
        h3.geo_to_h3(lat, lng, resolution=resolution)
        for lat, lng in zip(frame['pickup_lat'], frame['pickup_long'])
    ]

# assign() returns a new frame, so d_c itself is left unchanged.
d_c_8 = d_c.assign(pickup_hex=_pickup_hexes(d_c, 8))
d_c_7 = d_c.assign(pickup_hex=_pickup_hexes(d_c, 7))
d_c_6 = d_c.assign(pickup_hex=_pickup_hexes(d_c, 6))


# Public holidays in 2020. Source (City of Chicago, WIC clinics holiday hours):
# https://www.chicago.gov/content/dam/city/depts/cdph/clinic/general/WIC/WIC%20Clinics%20Holiday%20Hours%202020.pdf
public_holidays_2020 = [
    "2020-01-01",  # New Year's Day
    "2020-01-20",  # Martin Luther King Jr. Day
    "2020-02-12",  # Abraham Lincoln's Birthday
    "2020-02-17",  # Presidents' Day
    "2020-03-02",  # Casimir Pulaski Day
    "2020-05-25",  # Memorial Day
    "2020-07-04",  # Independence Day
    "2020-09-07",  # Labor Day
    "2020-10-12",  # Columbus Day
    "2020-11-11",  # Veterans Day
    "2020-11-26",  # Thanksgiving Day
    "2020-12-25"   # Christmas Day
]
# Convert the ISO strings to datetime.date so they compare equal to the values
# of the 'date' column created in addTimeFeatures.
public_holidays_2020 = [pd.to_datetime(date).date() for date in public_holidays_2020]

def addTimeFeatures(d_c, public_holidays_2020 = public_holidays_2020):
    """Add calendar features derived from 'timestamp_start' to `d_c`, in place.

    Adds the columns 'hour_of_day', 'day_of_week' (Monday = 0), 'date'
    (datetime.date objects), 'IsWeekday' (1 for Monday-Friday, else 0) and
    'IsHoliday' (1 when the date is in `public_holidays_2020`, else 0), then
    drops 'timestamp_start', 'pickup_lat' and 'pickup_long'.

    Parameters
    ----------
    d_c : pandas.DataFrame
        Must contain 'timestamp_start', 'pickup_lat' and 'pickup_long'.
        The frame is modified in place.
    public_holidays_2020 : iterable of datetime.date
        Dates flagged as holidays; defaults to the list defined above.

    Returns
    -------
    None
        Because the source columns are dropped, calling this twice on the same
        frame raises KeyError.
    """
    # Convert timestamp column to datetime
    d_c['timestamp_start'] = pd.to_datetime(d_c['timestamp_start'])
    pickup_time = d_c['timestamp_start'].dt

    # Extract time features
    d_c['hour_of_day'] = pickup_time.hour
    d_c['day_of_week'] = pickup_time.dayofweek
    d_c['date'] = pickup_time.date

    # Vectorized flags instead of a Python-level lambda per row.
    d_c['IsWeekday'] = (d_c['day_of_week'] < 5).astype('int64')
    d_c['IsHoliday'] = d_c['date'].isin(list(public_holidays_2020)).astype('int64')

    d_c.drop(columns=['timestamp_start', 'pickup_lat', 'pickup_long'], inplace=True)

# addTimeFeatures modifies each frame in place and returns None, so nothing is reassigned.
# Re-running this cell on the same frames raises KeyError because 'timestamp_start'
# has already been dropped; re-create d_c_8/7/6 first.
addTimeFeatures(d_c_8)
addTimeFeatures(d_c_7)
addTimeFeatures(d_c_6)

d_c_8.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday
0,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1
1,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1
2,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1
3,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1
4,-2.83,-5.33,83.0,0.48,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1


In [263]:
def weatherMean(d_c, columns=None):
    """Replace weather readings with their mean per (hour_of_day, date) group.

    Parameters
    ----------
    d_c : pandas.DataFrame
        Must contain 'hour_of_day', 'date' and every column in `columns`.
        The frame is modified in place.
    columns : iterable of str, optional
        Weather columns to average. Defaults to temperature, dew_point,
        humidity, wind_speed, pressure and precipitation_rate.
        NOTE(review): 'wind_gust' is not in the default list, so it keeps its
        per-row values; pass it explicitly if it should be averaged as well.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if columns is None:
        columns = ['temperature', 'dew_point', 'humidity', 'wind_speed',
                   'pressure', 'precipitation_rate']
    columns = list(columns)
    if not columns:
        return d_c

    # One groupby shared by all columns instead of re-grouping the frame once per column.
    d_c[columns] = d_c.groupby(['hour_of_day', 'date'])[columns].transform('mean')

    return d_c

# Replace the weather readings with their per-(date, hour) mean at every resolution.
# weatherMean returns the same frame it was given (modified in place), so each
# reassignment rebinds the name to the same object.
# NOTE(review): 'wind_gust' is not averaged by weatherMean and keeps its per-row values.
d_c_8 = weatherMean(d_c_8)
d_c_7 = weatherMean(d_c_7)
d_c_6 = weatherMean(d_c_6) 
d_c_8.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1
4,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1


In [264]:
def calculateDemand(d_c):
    """Return one row per (pickup_hex, date, hour_of_day) with its trip count.

    The number of input rows in each group is stored in a new 'demand' column.
    For every group only the first input row is kept, so the other columns hold
    that first row's values. The input frame is not modified.
    """
    cell_hour_keys = ['pickup_hex', 'date', 'hour_of_day']

    # Number of trips per hexagon, date and hour.
    trips_per_cell_hour = d_c.groupby(cell_hour_keys).size().rename('demand').reset_index()

    # Attach the count to every trip row, then keep a single row per group.
    with_demand = pd.merge(d_c, trips_per_cell_hour, how='left', on=cell_hour_keys)
    return with_demand.drop_duplicates(subset=cell_hour_keys)

# Collapse each frame to one row per (pickup_hex, date, hour_of_day), with the
# number of trips in that group stored in 'demand'.
d_c_8 = calculateDemand(d_c_8)
d_c_7 = calculateDemand(d_c_7)
d_c_6 = calculateDemand(d_c_6)
d_c_8.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday,demand
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1,37
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1,88
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1,16
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1,14
5,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664520bfffff,0,2,2020-01-01,1,1,6


In [137]:
# Disabled on purpose (kept as a string literal so nothing here runs): pickles
# the three per-resolution demand frames and uploads them to the duscloud.eu share.
# SECURITY: credentials must never be written into the notebook. The curl
# commands read them from the environment variables DUSCLOUD_SHARE_TOKEN and
# DUSCLOUD_SHARE_PASSWORD (`$$NAME` is how IPython passes a literal `$NAME` to
# the shell). Any share password that was ever committed in this cell should be
# rotated. The `-k` flag is omitted so curl verifies the server certificate
# before sending the credentials.
"""
# save as pickle
with open('taxi_8.pkl', 'wb') as file:
    pickle.dump(d_c_8, file)

with open('taxi_7.pkl', 'wb') as file:
    pickle.dump(d_c_7, file)

with open('taxi_6.pkl', 'wb') as file:
    pickle.dump(d_c_6, file)

!curl -T taxi_6.pkl -u "$$DUSCLOUD_SHARE_TOKEN:$$DUSCLOUD_SHARE_PASSWORD" -H 'X-Requested-With: XMLHttpRequest' https://www.duscloud.eu/public.php/webdav/taxi_6.pkl

!curl -T taxi_7.pkl -u "$$DUSCLOUD_SHARE_TOKEN:$$DUSCLOUD_SHARE_PASSWORD" -H 'X-Requested-With: XMLHttpRequest' https://www.duscloud.eu/public.php/webdav/taxi_7.pkl

!curl -T taxi_8.pkl -u "$$DUSCLOUD_SHARE_TOKEN:$$DUSCLOUD_SHARE_PASSWORD" -H 'X-Requested-With: XMLHttpRequest' https://www.duscloud.eu/public.php/webdav/taxi_8.pkl
"""

In [23]:
# Disabled (string literal, not executed): reloads d_c_8 / d_c_7 / d_c_6 from
# taxi_8.pkl, taxi_7.pkl and taxi_6.pkl instead of recomputing them.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# enable this for pickles you created yourself.
"""
with open('taxi_8.pkl', 'rb') as file:
    d_c_8 = pickle.load(file)
with open('taxi_7.pkl', 'rb') as file:
    d_c_7 = pickle.load(file)
with open('taxi_6.pkl', 'rb') as file:
    d_c_6 = pickle.load(file)

d_c_8.head()
"""

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,pickup_hex,hour_of_day,day_of_week,date,IsWeekday,IsHoliday,demand
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e9fffff,0,2,2020-01-01,1,1,37
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1e7fffff,0,2,2020-01-01,1,1,88
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664c1ebfffff,0,2,2020-01-01,1,1,16
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664d897fffff,0,2,2020-01-01,1,1,14
5,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,882664520bfffff,0,2,2020-01-01,1,1,6


## Add POI (point-of-interest) features

In [43]:
# Load the POI tables, one per H3 resolution (8, 7 and 6); they are joined to
# the demand frames on the hexagon id in the next cell.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load pickles from a trusted source.
# NOTE(review): the resolution-6 file uses a different naming scheme
# ('POI_Hex6.pkl') than the other two -- confirm it is the intended file.
with open('hex_size_8_pois.pkl', 'rb') as file:
    df_poi_8 = pickle.load(file)
with open('hex_size_7_pois.pkl', 'rb') as file:
    df_poi_7 = pickle.load(file)
with open('POI_Hex6.pkl', 'rb') as file:
    df_poi_6 = pickle.load(file)

In [57]:
# Left-join the POI table of the matching resolution onto each demand frame,
# then drop the join key from the left side and the columns not used afterwards.
merged_df_8 = (
    d_c_8
    .merge(df_poi_8, how='left', left_on='pickup_hex', right_on='hex_id')
    .drop(columns=["pickup_hex", "geometry", "community"])
)
merged_df_7 = (
    d_c_7
    .merge(df_poi_7, how='left', left_on='pickup_hex', right_on='hex_id')
    .drop(columns=["pickup_hex", "geometry", "community"])
)
# The resolution-6 POI table names its key 'hex_id_6'; rename it so all three
# merged frames expose the hexagon id as 'hex_id'.
merged_df_6 = (
    d_c_6
    .merge(df_poi_6, how='left', left_on='pickup_hex', right_on='hex_id_6')
    .rename(columns={'hex_id_6': 'hex_id'})
    .drop(columns=["pickup_hex"])
)

In [58]:
# Count missing values per column; the left join above leaves NaNs in the POI
# columns for any pickup hexagon that has no row in the POI table.
merged_df_8.isnull().sum()

temperature           0
dew_point             0
humidity              0
wind_speed            0
wind_gust             0
pressure              0
precipitation_rate    0
hour_of_day           0
day_of_week           0
date                  0
IsWeekday             0
IsHoliday             0
demand                0
hex_id                0
NO_AMENITY            0
bar                   0
cafe                  0
cinema                0
clinic                0
library               0
restaurant            0
school                0
theatre               0
university            0
dtype: int64

In [59]:
# Preview the first rows of the resolution-7 table after the POI merge.
merged_df_7.head()

Unnamed: 0,temperature,dew_point,humidity,wind_speed,wind_gust,pressure,precipitation_rate,hour_of_day,day_of_week,date,...,NO_AMENITY,bar,cafe,cinema,clinic,library,restaurant,school,theatre,university
0,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,0,101,111,3,4,1,295,3,3,1
1,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,0,11,17,0,4,0,52,4,1,0
2,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,0,2,1,0,0,0,1,4,0,0
3,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,0,28,14,1,4,0,80,3,2,0
4,-3.037468,-5.522874,83.291892,0.21045,0.64,1.0088,0.0,0,2,2020-01-01,...,0,39,31,0,1,0,138,5,1,0


In [60]:
# Column dtypes and non-null counts of the resolution-8 table after the POI merge.
merged_df_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340874 entries, 0 to 340873
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   temperature         340874 non-null  float64
 1   dew_point           340874 non-null  float64
 2   humidity            340874 non-null  float64
 3   wind_speed          340874 non-null  float64
 4   wind_gust           340874 non-null  float64
 5   pressure            340874 non-null  float64
 6   precipitation_rate  340874 non-null  float64
 7   hour_of_day         340874 non-null  int32  
 8   day_of_week         340874 non-null  int32  
 9   date                340874 non-null  object 
 10  IsWeekday           340874 non-null  int64  
 11  IsHoliday           340874 non-null  int64  
 12  demand              340874 non-null  int64  
 13  hex_id              340874 non-null  object 
 14  NO_AMENITY          340874 non-null  int64  
 15  bar                 340874 non-nul

In [61]:
# Persist the three merged frames as pickles, one file per H3 resolution.
for pickle_name, frame in (
    ('merged_df_8.pkl', merged_df_8),
    ('merged_df_7.pkl', merged_df_7),
    ('merged_df_6.pkl', merged_df_6),
):
    with open(pickle_name, 'wb') as file:
        pickle.dump(frame, file)