In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
from pathlib import Path

In [2]:
from bikesharing.ml_logic.data import get_raw_data, get_weather_data, get_polygons
from bikesharing.ml_logic.encoders import encode_district_label,encode_temporal_features
from bikesharing.ml_logic.preprocessor import group_rental_data_by_hour,preprocess_features
from bikesharing.ml_logic.feature_engineering import is_holiday, is_weekend, feature_selection
from bikesharing.interface.main import preprocess
from bikesharing.params import *

In [3]:
# Full-table pull of the raw MVG rental records from BigQuery; project and
# dataset names come from bikesharing.params.
query = f'''
    SELECT *
    FROM `{GCP_PROJECT}.{BQ_DATASET}.raw_data_mvg`
'''

# get_raw_data receives both the query and a local CSV cache path; in the
# recorded run below the data was loaded from that local CSV.
rental_cache_path = Path(f'{LOCAL_DATA_PATH}/raw/mvg_rentals_from_{START_YEAR}_to_{END_YEAR}.csv')
rental_data_df = get_raw_data(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=rental_cache_path,
)

[34m
Load data from local CSV...[0m


  df = pd.read_csv(cache_path, header='infer' if data_has_header else None)


✅ Data loaded, with shape (2804147, 10)


In [4]:
# Keep only the rental start timestamp and start coordinates, then collapse
# rows that are exact duplicates across those three columns.
start_columns = ['STARTTIME', 'STARTLAT', 'STARTLON']
rental_relavent_cols_df = rental_data_df[start_columns].drop_duplicates()

In [5]:
# Load the previously encoded rentals from CSV and discard the 'Unnamed: 0'
# column (presumably the index written by an earlier to_csv -- TODO confirm).
encoded_df = pd.read_csv('../raw_data/encode_df.csv').drop(columns=['Unnamed: 0'])

In [6]:
# Aggregate the encoded rentals with the project helper. The preview below has
# one row per `rent_date_hour` and one numeric column per district name
# (these appear to be hourly rental counts -- confirm against the helper).
grouped_df = group_rental_data_by_hour(encoded_df)
grouped_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34627,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,3.0,5.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0
34628,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
34629,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34630,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [7]:
# NOTE(review): 'histotical' is misspelled, but it is part of the cache file
# name on disk; renaming it here would require renaming the file as well.
weather_cache_path = Path(f'{LOCAL_DATA_PATH}/raw/histotical_weather_data_{START_YEAR}_to_{END_YEAR}.csv')
weather_data_df = get_weather_data(cache_path=weather_cache_path)

# Parse `time` into datetimes so it can be joined against the hourly rental
# timestamps in a later cell.
weather_data_df['time'] = pd.to_datetime(weather_data_df['time'])
weather_data_df

[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (35064, 6)


Unnamed: 0,time,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation
0,2019-01-01 00:00:00,3.3,100,0.5,9.0,0.2
1,2019-01-01 01:00:00,3.4,99,0.4,9.7,0.1
2,2019-01-01 02:00:00,3.5,100,0.2,12.0,0.2
3,2019-01-01 03:00:00,3.5,99,0.0,13.5,0.1
4,2019-01-01 04:00:00,3.5,100,-0.0,14.1,0.0
...,...,...,...,...,...,...
35059,2022-12-31 19:00:00,6.5,83,3.9,8.0,0.0
35060,2022-12-31 20:00:00,5.9,83,3.4,6.8,0.0
35061,2022-12-31 21:00:00,5.8,81,3.1,7.2,0.0
35062,2022-12-31 22:00:00,6.1,78,3.1,8.8,0.0


In [8]:
# Join hourly rentals with hourly weather. The outer join keeps every hour
# that appears in either frame, so hours present on only one side survive
# with NaN in the other side's columns.
merged_df = grouped_df.merge(
    weather_data_df,
    left_on='rent_date_hour',
    right_on='time',
    how='outer',
)

# Collapse the two join keys into a single timestamp column. Prefer the
# weather `time`, but fall back to the rental timestamp for rows that have no
# weather match; blindly overwriting with `time` would turn those keys into
# NaT and lose the hour they belong to.
merged_df['rent_date_hour'] = merged_df['time'].fillna(merged_df['rent_date_hour'])
merged_df = merged_df.sort_values(by='rent_date_hour').drop(columns=['time'])

# NOTE(review): hours that exist only in the weather data carry NaN in the
# district columns after this join -- confirm that downstream preprocessing
# imputes or zero-fills them as intended.

In [9]:
# Display the joined rentals + weather frame to eyeball the result of the merge.
merged_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.3,100,0.5,9.0,0.2
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,3.4,99,0.4,9.7,0.1
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,3.5,100,0.2,12.0,0.2
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,3.5,99,0.0,13.5,0.1
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,100,-0.0,14.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34627,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,2.0,2.0,0.0,6.5,83,3.9,8.0,0.0
34628,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,5.9,83,3.4,6.8,0.0
34629,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.8,81,3.1,7.2,0.0
34630,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,6.1,78,3.1,8.8,0.0


In [10]:
# Each helper is handed only the timestamp column, and its result is joined
# back onto merged_df by that timestamp.
# NOTE: this cell reassigns merged_df from itself, so it is not idempotent --
# re-running it without first re-running the rentals/weather merge cell would
# join the feature columns onto a frame that already contains them.
hourly_join = dict(on='rent_date_hour', how='inner')

# Holiday feature.
holidays = is_holiday(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(holidays, **hourly_join)

# Weekend feature.
weekends = is_weekend(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(weekends, **hourly_join)

# Encoded temporal features derived from the timestamp.
encoded_date = encode_temporal_features(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(encoded_date, **hourly_join)

In [14]:
# Display merged_df again; the preview below now also lists is_holiday,
# is_weekend and the hour/month/day sin/cos columns.
merged_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,windspeed_10m,precipitation,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.2,1,0,2.588190e-01,0.965926,5.000000e-01,0.866025,2.012985e-01,0.97953
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,9.7,0.1,1,0,5.000000e-01,0.866025,5.000000e-01,0.866025,2.012985e-01,0.97953
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,12.0,0.2,1,0,7.071068e-01,0.707107,5.000000e-01,0.866025,2.012985e-01,0.97953
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.5,0.1,1,0,8.660254e-01,0.500000,5.000000e-01,0.866025,2.012985e-01,0.97953
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.1,0.0,1,0,9.659258e-01,0.258819,5.000000e-01,0.866025,2.012985e-01,0.97953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,8.0,0.0,0,1,-8.660254e-01,0.500000,-2.449294e-16,1.000000,-2.449294e-16,1.00000
35060,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,6.8,0.0,0,1,-7.071068e-01,0.707107,-2.449294e-16,1.000000,-2.449294e-16,1.00000
35061,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,7.2,0.0,0,1,-5.000000e-01,0.866025,-2.449294e-16,1.000000,-2.449294e-16,1.00000
35062,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,8.8,0.0,0,1,-2.588190e-01,0.965926,-2.449294e-16,1.000000,-2.449294e-16,1.00000


In [29]:
# Model inputs, grouped by origin. The concatenation order below is the column
# order handed to feature_selection.

# District columns.
# NOTE(review): several names look like overlapping variants of one district
# (e.g. 'Untergiesing' / 'Untergiesing-Harlaching', whose values are identical
# in every row of the merged_df preview) -- confirm that they are meant to be
# separate features.
district_columns = [
    'Altstadt-Lehel', 'Au - Haidhausen', 'Aubing-Lochhausen-Langwied',
    'Berg am Laim', 'Bogenhausen', 'Feldmoching', 'Hadern', 'Harlaching',
    'Hasenbergl-Lerchenau Ost', 'Laim', 'Lochhausen',
    'Ludwigsvorstadt-Isarvorstadt', 'Maxvorstadt', 'Milbertshofen-Am Hart',
    'Moosach', 'Neuhausen-Nymphenburg', 'Obergiesing', 'Obermenzing',
    'Obersendling', 'Pasing', 'Pasing-Obermenzing', 'Ramersdorf-Perlach',
    'Schwabing-Freimann', 'Schwabing-West', 'Schwanthalerhöhe', 'Sendling',
    'Sendling-Westpark', 'Südgiesing', 'Thalkirchen', 'Trudering',
    'Trudering-Riem', 'Untergiesing', 'Untergiesing-Harlaching',
    'Untermenzing-Allach',
]

# Hourly weather measurements.
weather_columns = [
    'temperature_2m', 'relativehumidity_2m', 'apparent_temperature',
    'windspeed_10m', 'precipitation',
]

# Calendar flags and sin/cos time columns.
calendar_columns = [
    'is_holiday', 'is_weekend',
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos',
]

features = district_columns + weather_columns + calendar_columns
selected_merged_df = feature_selection(merged_df, features)

In [30]:
# Run the project preprocessor on the selected features, then wrap the result
# in a DataFrame for inspection. No column labels are passed, so the columns
# are positional integers.
X_processed = preprocess_features(selected_merged_df)
preproc_df = pd.DataFrame(X_processed)

✅ X_processed, with shape (35064, 47)


In [31]:
# Preview the preprocessed matrix; its columns are labelled by position (0-46)
# rather than by feature name.
preproc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.013333,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.227848,0.017391,1.0,0.0,0.629410,0.982963,0.75,0.933013,0.600779,0.989739
1,0.000000,0.000000,0.0,0.000000,0.051282,0.0,0.000000,0.0,0.0,0.000000,...,0.245570,0.008696,1.0,0.0,0.750000,0.933013,0.75,0.933013,0.600779,0.989739
2,0.013333,0.028571,0.0,0.000000,0.051282,0.0,0.000000,0.0,0.0,0.125000,...,0.303797,0.017391,1.0,0.0,0.853553,0.853553,0.75,0.933013,0.600779,0.989739
3,0.000000,0.114286,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041667,...,0.341772,0.008696,1.0,0.0,0.933013,0.750000,0.75,0.933013,0.600779,0.989739
4,0.026667,0.028571,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041667,...,0.356962,0.000000,1.0,0.0,0.982963,0.629410,0.75,0.933013,0.600779,0.989739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,0.066667,0.114286,0.0,0.071429,0.025641,0.0,0.066667,0.0,0.0,0.000000,...,0.202532,0.000000,0.0,1.0,0.066987,0.750000,0.50,1.000000,0.500000,1.000000
35060,0.053333,0.028571,0.0,0.000000,0.025641,0.0,0.000000,0.0,0.0,0.000000,...,0.172152,0.000000,0.0,1.0,0.146447,0.853553,0.50,1.000000,0.500000,1.000000
35061,0.000000,0.085714,0.0,0.000000,0.025641,0.0,0.000000,0.0,0.0,0.000000,...,0.182278,0.000000,0.0,1.0,0.250000,0.933013,0.50,1.000000,0.500000,1.000000
35062,0.066667,0.085714,0.0,0.000000,0.128205,0.0,0.000000,0.0,0.0,0.000000,...,0.222785,0.000000,0.0,1.0,0.370590,0.982963,0.50,1.000000,0.500000,1.000000


In [3]:
# Run the packaged pipeline from bikesharing.interface.main. In the recorded
# output it loads the rental and weather data from local CSVs and reports
# X_processed with shape (35064, 47).
preprocess()

[34m
Preprocessing Data...[0m
[34m
Load rental_data from local CSV...[0m


  df = pd.read_csv(cache_path, header='infer' if data_has_header else None)


✅ Data loaded, with shape (2804147, 10)
[34m
Load weather_data from local CSV...[0m
✅ Data loaded, with shape (35064, 6)
✅ X_processed, with shape (35064, 47)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.013333,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.227848,0.017391,1.0,0.0,0.629410,0.982963,0.75,0.933013,0.600779,0.989739
1,0.000000,0.000000,0.0,0.000000,0.051282,0.0,0.000000,0.0,0.0,0.000000,...,0.245570,0.008696,1.0,0.0,0.750000,0.933013,0.75,0.933013,0.600779,0.989739
2,0.013333,0.028571,0.0,0.000000,0.051282,0.0,0.000000,0.0,0.0,0.125000,...,0.303797,0.017391,1.0,0.0,0.853553,0.853553,0.75,0.933013,0.600779,0.989739
3,0.000000,0.114286,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041667,...,0.341772,0.008696,1.0,0.0,0.933013,0.750000,0.75,0.933013,0.600779,0.989739
4,0.026667,0.028571,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041667,...,0.356962,0.000000,1.0,0.0,0.982963,0.629410,0.75,0.933013,0.600779,0.989739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,0.066667,0.114286,0.0,0.071429,0.025641,0.0,0.066667,0.0,0.0,0.000000,...,0.202532,0.000000,0.0,1.0,0.066987,0.750000,0.50,1.000000,0.500000,1.000000
35060,0.053333,0.028571,0.0,0.000000,0.025641,0.0,0.000000,0.0,0.0,0.000000,...,0.172152,0.000000,0.0,1.0,0.146447,0.853553,0.50,1.000000,0.500000,1.000000
35061,0.000000,0.085714,0.0,0.000000,0.025641,0.0,0.000000,0.0,0.0,0.000000,...,0.182278,0.000000,0.0,1.0,0.250000,0.933013,0.50,1.000000,0.500000,1.000000
35062,0.066667,0.085714,0.0,0.000000,0.128205,0.0,0.000000,0.0,0.0,0.000000,...,0.222785,0.000000,0.0,1.0,0.370590,0.982963,0.50,1.000000,0.500000,1.000000


In [3]:
# Call the packaged pipeline once more. In the recorded output for this run
# the preprocessed data is loaded from a local CSV.
# NOTE(review): this cell's execution count (In [3]) repeats an earlier one,
# so it was presumably run in a separate kernel session -- Restart & Run All
# to confirm the notebook still reproduces top to bottom.
preprocess()

[34m
Load preprocessed data from local CSV...[0m


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.013333,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.227848,0.017391,1.0,0.0,0.629410,0.982963,0.75,0.933013,0.600779,0.989739
1,0.000000,0.000000,0.0,0.000000,0.051282,0.0,0.000000,0.0,0.0,0.000000,...,0.245570,0.008696,1.0,0.0,0.750000,0.933013,0.75,0.933013,0.600779,0.989739
2,0.013333,0.028571,0.0,0.000000,0.051282,0.0,0.000000,0.0,0.0,0.125000,...,0.303797,0.017391,1.0,0.0,0.853553,0.853553,0.75,0.933013,0.600779,0.989739
3,0.000000,0.114286,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041667,...,0.341772,0.008696,1.0,0.0,0.933013,0.750000,0.75,0.933013,0.600779,0.989739
4,0.026667,0.028571,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041667,...,0.356962,0.000000,1.0,0.0,0.982963,0.629410,0.75,0.933013,0.600779,0.989739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,0.066667,0.114286,0.0,0.071429,0.025641,0.0,0.066667,0.0,0.0,0.000000,...,0.202532,0.000000,0.0,1.0,0.066987,0.750000,0.50,1.000000,0.500000,1.000000
35060,0.053333,0.028571,0.0,0.000000,0.025641,0.0,0.000000,0.0,0.0,0.000000,...,0.172152,0.000000,0.0,1.0,0.146447,0.853553,0.50,1.000000,0.500000,1.000000
35061,0.000000,0.085714,0.0,0.000000,0.025641,0.0,0.000000,0.0,0.0,0.000000,...,0.182278,0.000000,0.0,1.0,0.250000,0.933013,0.50,1.000000,0.500000,1.000000
35062,0.066667,0.085714,0.0,0.000000,0.128205,0.0,0.000000,0.0,0.0,0.000000,...,0.222785,0.000000,0.0,1.0,0.370590,0.982963,0.50,1.000000,0.500000,1.000000
