# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# URL (host and port) of the Spark master this driver connects to.
spark_master_url = "spark://spark-master:7077"

# Create the Spark session, or reuse an existing one. The HDFS namenode at
# spark-master:9000 is configured as the default Hadoop filesystem.
spark = (
    SparkSession.builder
    .master(spark_master_url)
    .appName("Spark with Hadoop and Spark Master")
    .config("spark.hadoop.fs.defaultFS", "hdfs://spark-master:9000")
    .getOrCreate()
)

# Load the input data: read a Parquet file from HDFS and expose it as an RDD
# of Row objects for the RDD-based pipeline below.
rdd = (
    spark.read
    .parquet("hdfs://spark-master:9000/user/hadoop/input/TLC_Tripdata_Jan_2024.parquet")
    .rdd
)

# Data cleaning: keep only the trips that the later steps can process.
# - pickup_datetime must be present, because the transformation step calls
#   .date() on it; a single null value would otherwise fail the whole job.
# - base_passenger_fare and trip_miles must be present and strictly positive.
# All checks run in one filter pass; the None checks come first so the numeric
# comparisons are never evaluated against None.
rdd_clean = rdd.filter(
    lambda x: x['pickup_datetime'] is not None
    and x['base_passenger_fare'] is not None
    and x['trip_miles'] is not None
    and x['base_passenger_fare'] > 0
    and x['trip_miles'] > 0
)

# Transformation: reduce every cleaned trip to the three fields the metrics
# need (pickup date, fare, distance).
#
# The result is cached because it is consumed by several independent actions
# further down (the daily aggregation and the overall count / revenue / mean).
# Without cache(), each of those actions would re-read the Parquet input and
# re-run the cleaning and mapping steps from scratch.
rdd_transformed = rdd_clean.map(lambda x: Row(
    date=x['pickup_datetime'].date(),  # calendar date of the pickup timestamp
    fare_amount=float(x['base_passenger_fare']),
    trip_distance=float(x['trip_miles'])
)).cache()

# Daily aggregation: for each pickup date, accumulate
# (trip count, fare sum, distance sum) and derive the per-day metrics.
#
# sortByKey() orders the result by date; reduceByKey alone leaves the rows in
# an arbitrary order, which made the printed and saved output hard to read.
# The division is safe: every key produced by reduceByKey has a count >= 1.
rdd_aggregated = (
    rdd_transformed
    .map(lambda x: (x.date, (1, x.fare_amount, x.trip_distance)))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    .sortByKey()
    .map(lambda kv: Row(
        date=kv[0],
        total_trips=kv[1][0],
        total_revenue=kv[1][1],
        avg_trip_distance=kv[1][2] / kv[1][0],
    ))
)

# Overall metrics, computed in a single pass over the transformed trips
# instead of three separate jobs (count, reduce, mean).
# The accumulator is (trip count, fare sum, distance sum). Starting from an
# explicit zero value also means an empty RDD yields zeros rather than the
# ValueError that reduce() raises on empty input.
total_trips, total_revenue, _total_distance = rdd_transformed.aggregate(
    (0, 0.0, 0.0),
    lambda acc, trip: (acc[0] + 1, acc[1] + trip.fare_amount, acc[2] + trip.trip_distance),
    lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]),
)
# Guard the division so that "no trips survived cleaning" reports 0.0.
avg_trip_distance = _total_distance / total_trips if total_trips else 0.0

# Show the daily metrics: pull the aggregated rows to the driver and print
# them one per line.
daily_rows = rdd_aggregated.collect()
for row in daily_rows:
    print(row)

# Show the overall metrics.
print(f"Total Trips: {total_trips}")
print(f"Total Revenue: {total_revenue}")
print(f"Average Trip Distance: {avg_trip_distance}")

# Persist the daily metrics to HDFS as text: each Row is flattened to a plain
# (date, total_trips, total_revenue, avg_trip_distance) tuple before writing.
# NOTE(review): saveAsTextFile is expected to fail when the target directory
# already exists — confirm that reruns remove or rename the old output first.
output_path = "hdfs://spark-master:9000/output"
daily_tuples = rdd_aggregated.map(
    lambda r: (r.date, r.total_trips, r.total_revenue, r.avg_trip_distance)
)
daily_tuples.saveAsTextFile(f"{output_path}/daily_metrics_rdd")
