In [15]:
import os
import pandas as pd
import plotly.express as px

# Root folder of the project. Later cells build sub-paths with
# `project_path + 'data/...'`, so the value must end with a path separator;
# joining with '' guarantees that without doubling an existing one.
# Set the TAXI_DEMAND_PROJECT_PATH environment variable to run the notebook
# from a different checkout; the original location remains the default.
project_path = os.path.join(
    os.environ.get(
        'TAXI_DEMAND_PROJECT_PATH',
        '/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/',
    ),
    '',
)

## 1. Data Transformation

In [16]:
def handle_missing_indexes_ts(copy_df):
    """Add zero-filled rows for every hour missing from an hourly time series.

    Parameters
    ----------
    copy_df : pandas.DataFrame
        Frame indexed by pickup hour (a DatetimeIndex). The same hour may
        appear on several rows. The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input rows plus one row for each hour between
        the first and last timestamp that does not occur in the index. Every
        column of an added row is 0 (so filler rows carry location 0 rather
        than a real pickup location). Rows are ordered by time, the index
        keeps the name of the input index, and rows sharing a timestamp keep
        their original relative order.
    """
    # min()/max() of an empty index are NaT and date_range would raise on them.
    if len(copy_df.index) == 0:
        return copy_df.copy()

    full_hours = pd.date_range(start=copy_df.index.min(), end=copy_df.index.max(), freq='h')

    # Hours of the full range that never occur in the data; keep the index
    # name so the concatenated result is still indexed by the same name.
    missing_hours = full_hours.difference(copy_df.index).rename(copy_df.index.name)

    if len(missing_hours) == 0:
        return copy_df.sort_index(kind='mergesort')

    filler_df = pd.DataFrame(0, index=missing_hours, columns=copy_df.columns)

    final_df = pd.concat([copy_df, filler_df])

    # mergesort is stable: filler rows slot into place by time without
    # reshuffling rows that share the same hour.
    return final_df.sort_index(kind='mergesort')

In [19]:
def transform_to_timeseries(validated_files,validated_path,transformed_path):
    """Turn validated ride files into hourly pickup counts per location.

    For each name in ``validated_files`` the parquet file is read from
    ``validated_path``, its rows are counted per pickup location and hour,
    hours absent from the resulting index are zero-filled through
    ``handle_missing_indexes_ts``, and the result is written under the same
    file name into ``transformed_path``. After all files have been written,
    the validated input files are deleted.

    Parameters
    ----------
    validated_files : iterable of str
        File names (not full paths) of the parquet files to transform.
    validated_path : str
        Directory holding the validated files; a trailing separator is
        optional.
    transformed_path : str
        Directory the transformed files are written to; a trailing separator
        is optional.

    Returns
    -------
    None
    """
    for file_name in validated_files:
        rides = pd.read_parquet(path=os.path.join(validated_path, file_name))

        # Index by pickup time so resample can bin the rides into hours.
        rides = rides.set_index('pickup_time')

        # NOTE(review): the count is taken on the grouping column itself, which
        # relies on that column being kept in the resampled result; recent
        # pandas releases deprecate this - confirm against the installed version.
        hourly = rides.groupby('pickup_location').resample('h').count()
        hourly = hourly.rename(columns={'pickup_location': 'count_pickup_loc'})

        # Flatten the (pickup_location, pickup_time) index and order by time.
        hourly = hourly.reset_index().set_index('pickup_time').sort_index()

        expected_hours = pd.date_range(start=hourly.index.min(),end=hourly.index.max(),freq='h')
        if hourly.index.unique().equals(expected_hours):
            print("No missing indexes in time series data from datafile :",file_name)
        else:
            print("For times with no rides i.e., missing indexes, fill rows with 0 :",file_name)
            hourly = handle_missing_indexes_ts(hourly)

        # Save each transformed frame back to disk under the same file name.
        hourly.to_parquet(path=os.path.join(transformed_path, file_name))

    # Inputs are removed only after every file was transformed and saved, so a
    # failure part-way through leaves all validated files in place.
    for file_name in validated_files:
        os.remove(os.path.join(validated_path, file_name))

In [20]:
# Locate the validated parquet files (processed in name order) and the output
# folder, then run the transformation over all of them.
validated_path = project_path + 'data/validated/'
validated_files = sorted(
    entry for entry in os.listdir(validated_path) if 'parquet' in entry
)

transformed_path = project_path + 'data/transformed/'

transform_to_timeseries(validated_files, validated_path, transformed_path)

No missing indexes in time series data from datafile : rides_2023-01.parquet
No missing indexes in time series data from datafile : rides_2023-02.parquet
No missing indexes in time series data from datafile : rides_2023-03.parquet
No missing indexes in time series data from datafile : rides_2023-04.parquet
No missing indexes in time series data from datafile : rides_2023-05.parquet
No missing indexes in time series data from datafile : rides_2023-06.parquet
No missing indexes in time series data from datafile : rides_2023-07.parquet
No missing indexes in time series data from datafile : rides_2023-08.parquet
No missing indexes in time series data from datafile : rides_2023-09.parquet


## 2. Data Visualization

In [27]:
def visualize_ts_data(df,pickup_loc_id):
    """Show a line chart of `count_pickup_loc` over the index for one location.

    Only rows whose `pickup_location` equals `pickup_loc_id` are plotted; the
    frame's index is used as the x axis.
    """
    location_mask = df['pickup_location'] == pickup_loc_id
    location_rides = df.loc[location_mask]
    px.line(location_rides, x=location_rides.index, y='count_pickup_loc').show()

In [30]:
# Load the first transformed file (by name order) and plot one location.
transformed_path = project_path + 'data/transformed/'
transformed_files = sorted(
    entry for entry in os.listdir(transformed_path) if 'parquet' in entry
)

first_transformed_file = transformed_path + transformed_files[0]
df = pd.read_parquet(path=first_transformed_file)
visualize_ts_data(df, pickup_loc_id=43)