In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, avg, count, to_date
# import pandas as pd


# Address (host and port) of the standalone Spark master this session connects to.
spark_master_url = "spark://spark-master:7077"

# Build the Spark session, or reuse one that already exists in this process.
# fs.defaultFS points Hadoop's default filesystem at the HDFS namenode.
spark = (
    SparkSession.builder
    .appName("Spark with Hadoop and Spark Master")
    .master(spark_master_url)
    .config("spark.hadoop.fs.defaultFS", "hdfs://spark-master:9000")
    .getOrCreate()
)

# Load the raw trip records from the Parquet file stored on HDFS.
input_path = "hdfs://spark-master:9000/user/hadoop/input/TLC_Tripdata_Jan_2024.parquet"
df = spark.read.parquet(input_path)

# Data cleaning: keep only trips that have a usable fare and distance.
# dropna() is not redundant with the filter below: besides NULLs it also
# removes NaN from floating-point columns, and Spark orders NaN above every
# number, so a NaN would otherwise satisfy the "> 0" test.
df_clean = df.dropna(subset=["base_passenger_fare", "trip_miles"]).filter(
    (col("base_passenger_fare") > 0) & (col("trip_miles") > 0)
)

# Transformation: project the three columns used downstream, renamed to
# date / fare_amount / trip_distance, with the numeric columns cast to double.
#
# to_date() is called without a pattern so that it follows Spark's
# cast-to-date rules. That works for timestamp values and for date/timestamp
# strings alike, whereas a fixed "yyyy-MM-dd" pattern cannot parse strings
# that also carry a time part.
# NOTE(review): the type of pickup_datetime is not visible here - confirm.
#
# The result is cached because it feeds several independent Spark actions
# later in this script (counts, aggregations, and the Parquet/CSV writes);
# without the cache each of them would re-read and re-clean the source file.
df_transformed = df_clean.select(
    to_date(col("pickup_datetime")).alias("date"),
    col("base_passenger_fare").cast("double").alias("fare_amount"),
    col("trip_miles").cast("double").alias("trip_distance"),
).cache()

# Per-day metrics: trip count, summed fare and mean distance for each date.
# Cached because the (small) result is consumed by several later actions
# (display and the Parquet/CSV writes) and would otherwise be re-aggregated
# from the full cleaned dataset each time.
df_aggregated = df_transformed.groupBy("date").agg(
    count("*").alias("total_trips"),
    _sum("fare_amount").alias("total_revenue"),
    avg("trip_distance").alias("avg_trip_distance"),
).cache()

# Overall metrics across the whole cleaned dataset.
# All three values are computed in a single aggregation (one Spark job)
# instead of three separate full scans. A global aggregation always yields
# exactly one row, so indexing the collected result with [0] is safe; on an
# empty dataset the count is 0 and the sum/average are None.
overall_metrics = df_transformed.agg(
    count("*").alias("total_trips"),
    _sum("fare_amount").alias("total_revenue"),
    avg("trip_distance").alias("avg_trip_distance"),
).collect()[0]
total_trips = overall_metrics["total_trips"]
total_revenue = overall_metrics["total_revenue"]
avg_trip_distance = overall_metrics["avg_trip_distance"]

# Display the per-day metrics in chronological order.
# The rows of a groupBy result have no guaranteed order, and show() prints
# only 20 rows by default, which would cut off a month of daily rows. Sort
# explicitly and raise the limit well above the number of days expected.
# NOTE(review): 100 assumes roughly one month of input (per the input file
# name) - raise it if longer date ranges are processed.
df_aggregated.orderBy("date").show(n=100)

# Display the overall metrics.
print(f"Total Trips: {total_trips}")
print(f"Total Revenue: {total_revenue}")
print(f"Average Trip Distance: {avg_trip_distance}")

# Persist the results under a common HDFS output directory. Every write uses
# "overwrite" mode, so re-running the script replaces earlier output.
output_path = "hdfs://spark-master:9000/output"

# (DataFrame, sub-directory) pairs, in the order they are written.
outputs = (
    (df_aggregated, "daily_metrics"),
    (df_transformed, "cleaned_data"),
)

# Parquet copies.
for frame, folder in outputs:
    frame.write.mode("overwrite").parquet(f"{output_path}/{folder}")

# CSV copies, written with Spark's default CSV options (no header row).
for frame, folder in outputs:
    frame.write.mode("overwrite").csv(f"{output_path}/{folder}_csv")

In [None]:
# Shut down the SparkSession once all work in the cells above has finished.
spark.stop()