In [2]:
# The NYC TLC Yellow Taxi open dataset holds historical trip records from 2009 to 2018.
# Each month is loaded as a pandas DataFrame, aggregated, and appended to a CSV file.
from azureml.opendatasets import NycTlcYellow
from datetime import datetime
import calendar
import pandas as pd

# Initialize isFileCreated, set it as false
isFileCreated = False

# Iterate data by year and month each time
for year in range(2009,2019):

    for month in range(1, 13):
        start_date = datetime(year, month, 1)

        # Extract the number of days in the current month
        num_days = calendar.monthrange(year, month)[1]
        end_date = datetime(year, month, num_days)

        # Extract the dataset
        nyc_tlc = NycTlcYellow(start_date=start_date, end_date=end_date)

        # Convert dataset to a pandas data frame
        df = nyc_tlc.to_pandas_dataframe()

        # Convert pickup date time to desired date type to extract year and month
        df['tpepPickupDateTime'] = pd.to_datetime(df['tpepPickupDateTime'])
        df['year'] = df['tpepPickupDateTime'].dt.year
        df['month'] = df['tpepPickupDateTime'].dt.month

        # Clean data
        df.loc[df['paymentType'].isin(['Credit','CREDIT','CRD','CRE','Cre', '1']), 'paymentType'] = 'Credit card'
        df.loc[df['paymentType'].isin(['CAS','CASH','CSH', 'Cash','Cas','2']), 'paymentType'] = 'Cash'
        df.loc[df['paymentType'].isin(['No Charge','NOC','No', '3']), 'paymentType'] = 'No Charge'
        df.loc[df['paymentType'].isin(['Dispute','DIS', 'Dis','4']), 'paymentType'] = 'Dispute'
        df.loc[df['paymentType'].isin(['Unknown','UNK','NA', '5']), 'paymentType'] = 'Unknown'
        df.loc[df['paymentType'].isin(['Voided trip', '6']), 'paymentType'] = 'Voided trip'
        mask_unknown = df['paymentType'].str.contains('40.|0|NA')
        df.loc[mask_unknown, 'paymentType'] = 'Unknown'
        mask_no = df['paymentType'].str.contains('No')
        df.loc[mask_no, 'paymentType'] = 'No Charge'

        # Imputation to replace missing data with 0
        df['fareAmount'] = df['fareAmount'].fillna(0)

        # Optimize datatype to reduce memory usage       
        for col in df.select_dtypes(include=['float64']).columns:
            df[col] = df[col].astype('float32')         
        df['paymentType'] = df['paymentType'].astype('category')

        # Perform Aggregation
        result_df = df.groupby(by=['paymentType','year','month'])[['fareAmount','totalAmount','passengerCount']].agg(['mean','median']).round(2)
        # Flatten the multiIndex columns
        result_df.columns = [f'{col}_{agg}' for col, agg in result_df.columns]

        # Rename the columns for better clarity
        result_df.rename(columns={"fareAmount_mean": 'mean_costAmount',
                                    'fareAmount_median': 'median_costAmount',
            'totalAmount_mean': 'mean_priceAmount',
            'totalAmount_median': 'median_priceAmount',
            'passengerCount_mean':'mean_passengerCount',	
            'passengerCount_median':'median_passengerCount'
            },inplace=True)

        # if file is not created 
        if not isFileCreated:
            # output the data frame to a csv file
            result_df.to_csv('ts.csv',index=True)
            # Set isFileCreated to true because the file is created
            isFileCreated = True
        else:
            # Append data frame to existing csv
            result_df.to_csv('ts.csv', mode='a', header=False, index=True)
        
        # Release memory
        del df
        del nyc_tlc

{'infer_column_types': 'False', 'activity': 'download'}
{'infer_column_types': 'False', 'activity': 'download', 'activityApp': 'FileDataset'}
[Info] read from C:\Users\tt861\AppData\Local\Temp\tmpr5gh4zx4\https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2009/puMonth=1/part-00000-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426339-6.c000.snappy.parquet
[Info] read from C:\Users\tt861\AppData\Local\Temp\tmpr5gh4zx4\https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2009/puMonth=1/part-00012-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426337-5.c000.snappy.parquet
[Info] read from C:\Users\tt861\AppData\Local\Temp\tmpr5gh4zx4\https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2009/puMonth=1/part-00001-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426336-5.c000.snappy.parquet
[Info] read from C:\Users\tt861\AppData\Local\Temp\tmpr5gh4zx4\https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/p