In [1]:
import os
import pandas as pd

# Root folder of the project. Set the TAXI_PROJECT_PATH environment variable to
# run the notebook on another machine; otherwise the original location is used.
project_path = os.environ.get(
    'TAXI_PROJECT_PATH',
    '/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/',
)
# Later cells build paths by string concatenation (project_path + 'data/...'),
# so the value must always end with a slash.
if not project_path.endswith('/'):
    project_path += '/'

## 1. Data Transformation

In [2]:
def _add_zero_rows(hourly, missing_hours):
    """Return ``hourly`` plus one zero-count row per location for every missing hour.

    ``hourly`` is indexed by ``pickup_time`` and repeats each timestamp once per
    ``pickup_location``, so ``DataFrame.reindex`` cannot be used on it (pandas
    refuses to reindex an axis with duplicate labels). The rows are appended
    explicitly instead.
    """
    locations = hourly['pickup_location'].unique()
    value_columns = [col for col in hourly.columns if col != 'pickup_location']
    grid = pd.MultiIndex.from_product(
        [missing_hours, locations], names=['pickup_time', 'pickup_location']
    )
    # Integer 0 (not the string '0') keeps the count columns numeric.
    zero_rows = pd.DataFrame(0, index=grid, columns=value_columns).reset_index('pickup_location')
    return pd.concat([hourly, zero_rows]).sort_index()


def transform_to_timeseries(validated_files, validated_dir=None):
    """Aggregate validated parquet files into hourly counts per pickup location.

    Each file is read once with ``pd.read_parquet``; its rows are grouped by
    ``pickup_location`` and resampled on ``pickup_time`` into 1-hour bins, and
    the number of rows per bin is stored in ``count_pickup_loc``. Only the
    ``pickup_time`` and ``pickup_location`` columns of the files are used.

    Parameters
    ----------
    validated_files : iterable of str
        File names (not full paths) of the parquet files to process.
    validated_dir : str, optional
        Directory containing the files. When omitted, the notebook-level
        variable ``validated_path`` is used, which must exist at call time.

    Returns
    -------
    pandas.DataFrame
        Per-file results concatenated in input order. The index is
        ``pickup_time`` (labels repeat, one row per location and hour); the
        columns are ``pickup_location`` and ``count_pickup_loc``.

    Raises
    ------
    ValueError
        If ``validated_files`` yields no files.
    """
    base_dir = validated_path if validated_dir is None else validated_dir
    monthly_frames = []
    for f in validated_files:
        rides = pd.read_parquet(path=os.path.join(base_dir, f))

        # Count a copy of the location column rather than the grouping column
        # itself: pandas 2.2 deprecates resample operating on grouping columns.
        # '1h' is the non-deprecated spelling of the hourly alias '1H'.
        hourly = (
            rides.assign(count_pickup_loc=rides['pickup_location'])
            .set_index('pickup_time')
            .groupby('pickup_location')['count_pickup_loc']
            .resample('1h')
            .count()
            .reset_index()
            .set_index('pickup_time')
            .sort_index()
        )

        # Hours between the first and last observed bin that no location covers.
        full_range = pd.date_range(start=hourly.index.min(), end=hourly.index.max(), freq='1h')
        missing_hours = full_range.difference(hourly.index)
        if missing_hours.empty:
            print("No missing indexes in time series data from datafile :", f)
        else:
            print("Fill missing indexes with 0")
            hourly = _add_zero_rows(hourly, missing_hours)

        monthly_frames.append(hourly)

    if not monthly_frames:
        raise ValueError("No validated files to transform")
    return pd.concat(monthly_frames)

In [64]:
# Folder with the validated ride files. The trailing slash is required because
# transform_to_timeseries builds file paths by string concatenation.
validated_path = project_path + 'data/validated/'
# Match on the file extension: a substring test ('parquet' in f) would also pick
# up names such as 'parquet_notes.txt' or 'rides.parquet.bak'. Sorting by name
# gives chronological order for the rides_YYYY-MM.parquet naming used here.
validated_files = sorted(f for f in os.listdir(validated_path) if f.endswith('.parquet'))
df = transform_to_timeseries(validated_files)
df

No missing indexes in time series data from datafile : rides_2023-01.parquet
No missing indexes in time series data from datafile : rides_2023-02.parquet
No missing indexes in time series data from datafile : rides_2023-03.parquet
No missing indexes in time series data from datafile : rides_2023-04.parquet
No missing indexes in time series data from datafile : rides_2023-05.parquet
No missing indexes in time series data from datafile : rides_2023-06.parquet
No missing indexes in time series data from datafile : rides_2023-07.parquet
No missing indexes in time series data from datafile : rides_2023-08.parquet


Unnamed: 0_level_0,pickup_location,count_pickup_loc
pickup_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,74,13
2023-01-01 00:00:00,159,1
2023-01-01 00:00:00,158,53
2023-01-01 00:00:00,112,2
2023-01-01 00:00:00,89,1
...,...,...
2023-08-31 23:00:00,88,8
2023-08-31 23:00:00,48,165
2023-08-31 23:00:00,181,1
2023-08-31 23:00:00,236,27


In [66]:
# Take a real copy: plain assignment (copy_df = df) only creates a second name
# for the same DataFrame, so any in-place change would also alter df.
copy_df = df.copy()
copy_df

Unnamed: 0_level_0,pickup_location,count_pickup_loc
pickup_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,74,13
2023-01-01 00:00:00,159,1
2023-01-01 00:00:00,158,53
2023-01-01 00:00:00,112,2
2023-01-01 00:00:00,89,1
...,...,...
2023-08-31 23:00:00,88,8
2023-08-31 23:00:00,48,165
2023-08-31 23:00:00,181,1
2023-08-31 23:00:00,236,27


In [67]:
# Distinct timestamps currently present in the index of copy_df.
unique_taxi_series = copy_df.index.unique()
# Evenly spaced 30-minute timestamps from the first to the last index value.
date_series = pd.date_range(
    start=copy_df.index.min(),
    end=copy_df.index.max(),
    freq='30min',
).sort_values()
copy_df

Unnamed: 0_level_0,pickup_location,count_pickup_loc
pickup_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,74,13
2023-01-01 00:00:00,159,1
2023-01-01 00:00:00,158,53
2023-01-01 00:00:00,112,2
2023-01-01 00:00:00,89,1
...,...,...
2023-08-31 23:00:00,88,8
2023-08-31 23:00:00,48,165
2023-08-31 23:00:00,181,1
2023-08-31 23:00:00,236,27


In [71]:
# All timestamps that occur in either the data index or the 30-minute grid.
combined_index = copy_df.index.union(date_series)

In [None]:
# copy_df holds one row per (hour, location), so its index labels repeat and
# copy_df.reindex(combined_index) raises
# "ValueError: cannot reindex on an axis with duplicate labels".
# Build the equivalent result by hand: keep every existing row and append an
# all-NaN row for each timestamp in combined_index that copy_df lacks.
new_timestamps = combined_index.difference(copy_df.index)
nan_rows = pd.DataFrame(float('nan'), index=new_timestamps, columns=copy_df.columns)
df_combined = pd.concat([copy_df, nan_rows]).sort_index()