## Conectando a fonte de dados AWS S3

In [0]:
# Connecting to S3 Bucket
import pandas as pd
import os

cred = pd.read_csv('./../src/databricks_user_accessKeys.csv') # Arquivo gerado automaticamente pelo AWS IAM

access_key = cred['access_key_ID'].iloc[0]
secret_key = cred['secret_access_key'].iloc[0]
bucket = "ny-taxi-case"

os.environ["AWS_ACCESS_KEY_ID"] = access_key
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key

## Perguntas

1. Qual a média de valor total (total_amount) recebido em um mês
considerando todos os yellow táxis da frota?

In [0]:
from pyspark.sql import functions as F

df = spark.read.format("parquet") \
      .option("header", "true") \
      .option("inferSchema", "true") \
      .load(f"s3://{bucket}/metrics_data/yellow/*") # Restricao para somente taxi yellow

df = (df.withColumn("month", F.month(F.col("date")))
            .withColumn("year", F.year(F.col("date")))
            .groupBy('month', 'year').agg(
                  F.sum(F.col("sum_total_amount")).alias('sum_total_amount'),
                  F.sum(F.col("total_rides")).alias('total_rides'))
                  .withColumn("avg_total_amount", F.col("sum_total_amount") / F.col("total_rides"))
            .orderBy("year", "month")
      )

df = df.select("month", "year", "avg_total_amount")
display(df)

month,year,avg_total_amount
1,2023,27.463529553924776
2,2023,27.37013170397401
3,2023,28.2890940801001
4,2023,28.78439609716948
5,2023,29.453385141507365


2. Qual a média de passageiros (passenger\_count) por cada hora do dia
que pegaram táxi no mês de maio considerando todos os táxis da
frota?

In [0]:
from pyspark.sql import functions as F

df = spark.read.format("parquet") \
      .option("header", "true") \
      .option("inferSchema", "true") \
      .load(f"s3://{bucket}/metrics_data/*/*") # Sem restricao de tipo de taxi

df = (df.filter(F.month(df.date) == 5) # Restricao de mes: Maio
        .select("hour", "sum_passenger_count")
        .groupBy("hour").agg(F.mean("sum_passenger_count").alias("avg_passenger_count_month"))
        .orderBy("hour")
        )

display(df)

hour,avg_passenger_count_month
0,2062.0967741935483
1,1349.5806451612902
2,881.0
3,573.9677419354839
4,363.8064516129032
5,384.483870967742
6,945.6290322580644
7,1945.258064516129
8,2677.870967741936
9,3045.6935483870966
