# In [None]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from azureml.opendatasets import NycTlcGreen
from functools import reduce 
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import mean, median

print("start")

# Configure a session against master "local[*]" and request the hadoop-azure
# and azure-storage packages through spark.jars.packages.
session_builder = SparkSession.builder.master("local[*]").config(
    "spark.jars.packages",
    "org.apache.hadoop:hadoop-azure:3.3.6,com.microsoft.azure:azure-storage:8.6.6",
)
spark = session_builder.getOrCreate()

# Diagnostic output: report whether an active session exists.
print('yes' if spark.getActiveSession() else 'no')

# Diagnostic output: echo the package list as seen by the SparkContext conf.
print(spark.sparkContext.getConf().get("spark.jars.packages"))

# Azure Blob Storage coordinates of the dataset to read.
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"
# NOTE(review): this is the one-character string "r", not an empty raw string
# (r""). Confirm that this value is intended as the SAS token.
blob_sas_token = "r"

# Build the wasbs:// URL and the matching per-container SAS configuration key,
# then register the token on the session so Spark can read the blob remotely.
blob_host = f"{blob_account_name}.blob.core.windows.net"
wasbs_path = f"wasbs://{blob_container_name}@{blob_host}/{blob_relative_path}"
sas_conf_key = f"fs.azure.sas.{blob_container_name}.{blob_host}"
spark.conf.set(sas_conf_key, blob_sas_token)
print(f"Remote blob path: {wasbs_path}")

# Diagnostic output (same checks as right after session creation).
print('yes' if spark.getActiveSession() else 'no')

print(spark.sparkContext.getConf().get("spark.jars.packages"))

# Define the DataFrame over the remote Parquet data. Per the original note,
# no data is loaded at this point.
taxi_df = spark.read.parquet(wasbs_path)
print('taxi_df is created')

# Derive the calendar year and month of each trip from its pickup timestamp,
# then replace missing fare amounts with 0.
pickup_timestamp = F.col("tpepPickupDateTime")
taxi_df = (
    taxi_df
    .withColumn("year", F.year(pickup_timestamp))
    .withColumn("month", F.month(pickup_timestamp))
    .fillna(0, subset=["fareAmount"])
)

# Normalize the raw paymentType codes to a fixed set of labels.
#
# Branches are evaluated in order: the exact-match lists first, then two
# pattern-based fallbacks. A when() chain without otherwise() yields NULL for
# rows that match no branch, which silently discards the raw code; the
# trailing otherwise() keeps the original value instead, so unrecognized
# codes remain visible downstream (NULL inputs still stay NULL).
payment_type = F.col("paymentType")
normalized_payment_type = (
    F.when(payment_type.isin(['Credit', 'CREDIT', 'CRD', 'CRE', 'Cre', '1']), "Credit Card")
    .when(payment_type.isin(['CAS', 'CASH', 'CSH', 'Cash', 'Cas', '2']), "Cash")
    .when(payment_type.isin(['No Charge', 'NOC', 'No', '3']), "No Charge")
    .when(payment_type.isin(['Dispute', 'DIS', 'Dis', '4']), "Dispute")
    .when(payment_type.isin(['Unknown', 'UNK', 'NA', '5']), "Unknown")
    .when(payment_type.isin(['Voided trip', '6']), "Voided trip")
    # Fallback for values that merely contain "No" (not an exact match above).
    .when(payment_type.contains('No'), 'No Charge')
    # NOTE(review): in this pattern '.' is a regex wildcard and the bare '0'
    # alternative looks like it matches any value containing a 0 - confirm
    # that this breadth is intended.
    .when(payment_type.rlike('40.|0|NA'), 'Unknown')
    .otherwise(payment_type)
)
taxi_df = taxi_df.withColumn("paymentType", normalized_payment_type)



# Mean and median of fare, total amount and passenger count, computed for
# every (paymentType, year, month) group.
summary_statistics = [
    mean("fareAmount").alias("mean_costAmount"),
    median("fareAmount").alias("median_costAmount"),
    mean("totalAmount").alias("mean_priceAmount"),
    median("totalAmount").alias("median_priceAmount"),
    mean("passengerCount").alias("mean_passengerCount"),
    median("passengerCount").alias("median_passengerCount"),
]
trips_by_group = taxi_df.groupBy("paymentType", "year", "month")
result_df = trips_by_group.agg(*summary_statistics)
    
# Write the aggregated result as Parquet, partitioned by year and month
# Configure the writer in "overwrite" mode with year/month partitioning,
# then write Parquet to the relative path "NYC_T&L_Yellow".
result_writer = result_df.write.mode("overwrite").partitionBy("year", "month")
result_writer.parquet("NYC_T&L_Yellow")
print('result_df parquet files are created')

# Stop the Spark session now that the aggregated output has been written.
spark.stop()