In [37]:
import os
import pandas as pd
import numpy as np
import copy

# Root directory of the project. Defaults to the original local checkout, but
# can be overridden with the TAXI_PROJECT_PATH environment variable so the
# notebook also runs on other machines. os.path.join(..., '') guarantees a
# trailing separator because later cells build paths by string concatenation.
project_path = os.path.join(
    os.environ.get('TAXI_PROJECT_PATH', '/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/'),
    '',
)

## 1. Data Transformation

In [16]:
def _hourly_pickup_counts(rides):
    """Count pickups per location per hour; the result is indexed by pickup_time."""
    counts = (
        rides.set_index('pickup_time')      # resample needs a DatetimeIndex
        # Count an explicit helper column instead of the grouping column itself:
        # groupby.resample operating on the grouping column is deprecated.
        .assign(count_pickup_loc=1)
        .groupby('pickup_location')['count_pickup_loc']
        .resample('1h')                     # '1H' is a deprecated alias
        .count()
    )
    return (
        counts.reset_index(name='count_pickup_loc')
        .set_index('pickup_time')
        .sort_index()
    )


def transform_to_timeseries(validated_files, validated_dir=None):
    """Turn validated ride files into one hourly pickup-count time series.

    Each file is read with ``pd.read_parquet`` and must provide the columns
    ``pickup_time`` and ``pickup_location``. Rides are counted per location
    and hour. Any hour between a file's first and last hour that has no row
    at all is added as a placeholder row whose columns are all 0 (so
    ``pickup_location`` is 0 in those rows).

    Parameters
    ----------
    validated_files : iterable of str
        File names relative to ``validated_dir``.
    validated_dir : str, optional
        Directory containing the files. When omitted, the notebook-level
        global ``validated_path`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``pickup_time`` with columns ``pickup_location`` and
        ``count_pickup_loc``; the per-file frames are concatenated in the
        order the files were given.

    Raises
    ------
    ValueError
        If ``validated_files`` is empty.
    """
    if validated_dir is None:
        # Backward-compatible fallback to the global defined in a later cell.
        validated_dir = validated_path

    all_dfs = []
    for f in validated_files:
        f_path = os.path.join(validated_dir, f)
        hourly = _hourly_pickup_counts(pd.read_parquet(path=f_path))

        if not hourly.empty:  # min()/max() of an empty index cannot seed date_range
            date_series = pd.date_range(start=hourly.index.min(), end=hourly.index.max(), freq='1h')
            # The index holds one row per (hour, location), so it has duplicate
            # timestamps and cannot be reindexed; compute the gap explicitly.
            missing_hours = date_series.difference(hourly.index)
            if len(missing_hours) == 0:
                print("No missing indexes in time series data from datafile :",f)
            else:
                print("Fill missing indexes with 0")
                filler = pd.DataFrame(0, index=missing_hours, columns=hourly.columns)
                filler.index.name = hourly.index.name
                hourly = pd.concat([hourly, filler]).sort_index()

        all_dfs.append(hourly)

    if not all_dfs:
        raise ValueError("validated_files is empty: no parquet files to transform")
    return pd.concat(all_dfs)

In [17]:
# Directory holding the validated ride files (trailing separator kept because
# transform_to_timeseries builds file paths from this string).
validated_path = os.path.join(project_path, 'data', 'validated', '')
# Match on the file extension: a substring test would also pick up names such
# as 'parquet_notes.txt' or 'rides.parquet.bak'. Sorted so files are processed
# in name order.
validated_files = sorted(f for f in os.listdir(validated_path) if f.endswith('.parquet'))
df = transform_to_timeseries(validated_files)
df

No missing indexes in time series data from datafile : rides_2023-01.parquet
No missing indexes in time series data from datafile : rides_2023-02.parquet
No missing indexes in time series data from datafile : rides_2023-03.parquet
No missing indexes in time series data from datafile : rides_2023-04.parquet
No missing indexes in time series data from datafile : rides_2023-05.parquet
No missing indexes in time series data from datafile : rides_2023-06.parquet
No missing indexes in time series data from datafile : rides_2023-07.parquet
No missing indexes in time series data from datafile : rides_2023-08.parquet
No missing indexes in time series data from datafile : rides_2023-09.parquet


Unnamed: 0_level_0,pickup_location,count_pickup_loc
pickup_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,74,13
2023-01-01 00:00:00,159,1
2023-01-01 00:00:00,158,53
2023-01-01 00:00:00,112,2
2023-01-01 00:00:00,89,1
...,...,...
2023-09-30 23:00:00,75,10
2023-09-30 23:00:00,37,1
2023-09-30 23:00:00,162,101
2023-09-30 23:00:00,25,10


In [78]:
# Independent deep copy of the hourly frame so the experiments below do not
# touch `df`.
copy_df = df.copy(deep=True)
copy_df

Unnamed: 0_level_0,pickup_location,count_pickup_loc
pickup_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,74,13
2023-01-01 00:00:00,159,1
2023-01-01 00:00:00,158,53
2023-01-01 00:00:00,112,2
2023-01-01 00:00:00,89,1
...,...,...
2023-09-30 23:00:00,75,10
2023-09-30 23:00:00,37,1
2023-09-30 23:00:00,162,101
2023-09-30 23:00:00,25,10


In [79]:
# Half-hourly grid spanning the first to the last pickup_time in copy_df.
date_series = pd.date_range(
    start=copy_df.index.min(),
    end=copy_df.index.max(),
    freq='30min',
)
# Zero-filled frame with copy_df's columns, one row per grid slot; the grid
# is then moved from the index into a regular column.
new_df = pd.DataFrame(0, index=date_series, columns=list(copy_df.columns)).reset_index()
new_df.columns = ['pickup_time','pickup_location','count_pickup_loc']
new_df

Unnamed: 0,pickup_time,pickup_location,count_pickup_loc
0,2023-01-01 00:00:00,0,0
1,2023-01-01 00:30:00,0,0
2,2023-01-01 01:00:00,0,0
3,2023-01-01 01:30:00,0,0
4,2023-01-01 02:00:00,0,0
...,...,...,...
13098,2023-09-30 21:00:00,0,0
13099,2023-09-30 21:30:00,0,0
13100,2023-09-30 22:00:00,0,0
13101,2023-09-30 22:30:00,0,0


In [80]:
# Move pickup_time from the index into a column. Guarded so the cell is
# idempotent: an unconditional in-place reset would add a spurious 'index'
# column if the cell were run a second time.
if 'pickup_time' not in copy_df.columns:
    copy_df = copy_df.reset_index()
copy_df

Unnamed: 0,pickup_time,pickup_location,count_pickup_loc
0,2023-01-01 00:00:00,74,13
1,2023-01-01 00:00:00,159,1
2,2023-01-01 00:00:00,158,53
3,2023-01-01 00:00:00,112,2
4,2023-01-01 00:00:00,89,1
...,...,...,...
1581601,2023-09-30 23:00:00,75,10
1581602,2023-09-30 23:00:00,37,1
1581603,2023-09-30 23:00:00,162,101
1581604,2023-09-30 23:00:00,25,10


In [81]:
# Stack the hourly counts on top of the zero-filled half-hourly rows, then
# index the combined frame by pickup_time.
final_df = pd.concat([copy_df, new_df]).set_index('pickup_time')
final_df

Unnamed: 0_level_0,pickup_location,count_pickup_loc
pickup_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,74,13
2023-01-01 00:00:00,159,1
2023-01-01 00:00:00,158,53
2023-01-01 00:00:00,112,2
2023-01-01 00:00:00,89,1
...,...,...
2023-09-30 21:00:00,0,0
2023-09-30 21:30:00,0,0
2023-09-30 22:00:00,0,0
2023-09-30 22:30:00,0,0
