In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from dateutil.parser import parse
import os
from pathlib import Path
from bikesharing.ml_logic.data import get_raw_data, get_polygons
from bikesharing.ml_logic.encoders import encode_district_label
from bikesharing.params import *

In [2]:
# SQL text selecting every column of the `raw_data_mvg` table.
# GCP_PROJECT and BQ_DATASET are not defined in this notebook; presumably they
# arrive via the `from bikesharing.params import *` star import — confirm.
# The leading/trailing whitespace inside the literal is part of the string.
query =f'''
        SELECT *
        FROM `{GCP_PROJECT}.{BQ_DATASET}.raw_data_mvg`
    '''

In [3]:
# Fetch the raw rentals table. get_raw_data receives both the BigQuery query
# and a local CSV cache path keyed by the configured year range.
mvg_data = get_raw_data(
    GCP_PROJECT,
    query=query,
    cache_path=Path(
        f'{LOCAL_DATA_PATH}/raw/raw_{START_YEAR}_to_{END_YEAR}.csv'
    ),
)
# Last expression of the cell: display (rows, columns).
mvg_data.shape

[34m
Load data from local CSV...[0m


  df = pd.read_csv(cache_path, header='infer' if data_has_header else None)


✅ Data loaded, with shape (2804147, 10)


(2804147, 10)

In [4]:
# Print the column names, dtypes and memory footprint of the raw frame.
mvg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2804147 entries, 0 to 2804146
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   STARTTIME            object 
 1   ENDTIME              object 
 2   STARTLAT             float64
 3   STARTLON             float64
 4   ENDLAT               float64
 5   ENDLON               float64
 6   RENTAL_IS_STATION    float64
 7   RENTAL_STATION_NAME  object 
 8   RETURN_IS_STATION    float64
 9   RETURN_STATION_NAME  object 
dtypes: float64(6), object(4)
memory usage: 213.9+ MB


In [5]:
# Narrow the raw frame to the rental start timestamp and start coordinates,
# the only columns passed on to the district encoding step.
df = mvg_data[
    ['STARTTIME', 'STARTLAT', 'STARTLON']
]

In [6]:
# Fetch the polygons and encode each rental start against them.
# NOTE(review): the resulting columns look like one indicator column per
# district (see the head() output) — confirm in bikesharing.ml_logic.encoders.
polygons = get_polygons()
df_districts = encode_district_label(
    rental_df=df,
    polygons=polygons,
)
# Preview the first rows of the encoded frame.
df_districts.head()

Unnamed: 0,STARTTIME,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
2,2019-01-01 15:29:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,2019-01-05 12:19:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,2019-01-06 08:31:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,2019-01-07 17:32:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,2019-01-08 08:59:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Persist the encoded district frame; the row index is not written out.
df_districts.to_csv(
    f'{LOCAL_DATA_PATH}/processed/df_districts.csv',
    index=False,
)

In [12]:
# Attach the district columns to each ride by ROW INDEX instead of STARTTIME.
#
# Why: STARTTIME is not a unique key — the frame holds ~2.8M rides and the
# displayed timestamps carry no seconds — so `df.merge(df_districts,
# on='STARTTIME')` pairs every ride with every other ride that started at the
# same timestamp (a many-to-many join that multiplies rows and exhausts memory).
#
# NOTE(review): this assumes encode_district_label keeps the row labels of its
# input; the non-contiguous labels (2, 34, 37, ...) shown by
# df_districts.head() suggest it does — confirm in the encoder.
#
# Columns that df already provides (e.g. STARTTIME) are dropped from the right
# side so they are not duplicated; `how='inner'` keeps the original inner-join
# semantics (only rides that have a district row).
data = df.join(
    df_districts.drop(columns=df.columns, errors='ignore'),
    how='inner',
)

# Guard the index assumption: a one-to-one index join must yield exactly one
# output row per encoded ride.
assert len(data) == len(df_districts), (
    'index join is not one-to-one; df_districts row labels do not match df'
)

data.head()

: 

: 

In [None]:
# Persist the rides-with-districts frame; the row index is not written out.
data.to_csv(
    f'{LOCAL_DATA_PATH}/processed/data_with_districts.csv',
    index=False,
)

In [7]:
# Cut-off date, normalised to 'YYYY-MM-DD' text.
min_date = parse('2022-01-01').strftime('%Y-%m-%d')

# STARTTIME is stored as text (object dtype in the info() output), so this is
# a lexicographic string comparison; it orders correctly only while the
# timestamps keep the 'YYYY-MM-DD HH:MM:SS' layout seen in the data.
df_test = mvg_data.loc[mvg_data['STARTTIME'].ge(min_date)]

In [8]:
# Show the last rows of the subset whose STARTTIME is at or after min_date.
df_test.tail()

Unnamed: 0,STARTTIME,ENDTIME,STARTLAT,STARTLON,ENDLAT,ENDLON,RENTAL_IS_STATION,RENTAL_STATION_NAME,RETURN_IS_STATION,RETURN_STATION_NAME
2804142,2022-12-17 19:39:00,2022-12-17 19:42,48.161991,11.58654,48.161251,11.59206,1.0,Münchner Freiheit,0.0,
2804143,2022-12-20 19:40:00,2022-12-20 19:48,48.161991,11.58654,48.162029,11.56343,1.0,Münchner Freiheit,0.0,
2804144,2022-12-22 17:03:00,2022-12-22 17:15,48.161991,11.58654,48.180382,11.59943,1.0,Münchner Freiheit,0.0,
2804145,2022-12-28 09:42:00,2022-12-28 09:58,48.161991,11.58654,48.162739,11.57268,1.0,Münchner Freiheit,0.0,
2804146,2022-12-31 10:01:00,2022-12-31 10:19,48.161991,11.58654,48.157879,11.57722,1.0,Münchner Freiheit,0.0,


In [9]:
# Bare name: displays the function's repr (its signature). Exploratory
# inspection only — nothing downstream depends on this cell.
encode_district_label

<function bikesharing.ml_logic.encoders.encode_district_label(rental_df: pandas.core.frame.DataFrame, polygons: dict) -> pandas.core.frame.DataFrame>