# W5M2 - Optimization

## 라이브러리 및 세션 설정

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import isnull, avg, min, date_format
from operator import add

# Build (or reuse) the Spark session for this notebook. It targets the
# master at spark-master:7077 and requests 4gb of memory and 5 cores
# per executor.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .appName('W5M2')
    .config('spark.executor.memory', '4gb')
    .config("spark.executor.cores", "5")
    .getOrCreate()
)


## 데이터 로딩

In [None]:
# Input datasets on HDFS, each paired with the format name that
# load_dataframe expects for it.
TLC_data_path, tlc_ext = (
    'hdfs://spark-master:9000/user/hduser/hdfs_data/fhvhv_tripdata_2023-01.parquet',
    'parquet',
)
weather_data_path, weather_ext = (
    'hdfs://spark-master:9000/user/hduser/hdfs_data/weather.csv',
    'csv',
)

# Root directory under which every result of this notebook is written.
output_dir_path = 'hdfs://spark-master:9000/user/spark_user/W5M2_output/'

def load_dataframe(spark_session, file_path, extension):
    """Read a file into a DataFrame with the reader matching its format.

    Args:
        spark_session: Object exposing ``read.csv`` and ``read.parquet``
            (the notebook passes its ``SparkSession``).
        file_path: Location of the file to read.
        extension: Format name, ``"csv"`` or ``"parquet"``. Matching
            ignores case, surrounding whitespace and leading dots, so
            ``".CSV"`` or ``"Parquet"`` are accepted as well.

    Returns:
        Whatever the selected reader returns for ``file_path``.

    Raises:
        NotImplementedError: If ``extension`` names any other format.
    """
    # Normalise once so callers may pass "CSV", ".csv", " parquet ", etc.
    # str() keeps non-string values (e.g. None) on the NotImplementedError
    # path instead of failing with an AttributeError.
    fmt = str(extension).strip().lower().lstrip(".")

    if fmt == "csv":
        # Treat the first row as column names and let Spark infer types.
        return spark_session.read.csv(file_path, header=True, inferSchema=True)
    if fmt == "parquet":
        return spark_session.read.parquet(file_path)
    raise NotImplementedError("Unsupported file extension.")

# Load the TLC trip data, then print its schema and show the first
# record vertically.
df = load_dataframe(spark_session=spark, file_path=TLC_data_path, extension=tlc_ext)
tlc_schema = df.schema
print("- The schema of the TLC DataFrame - \n", tlc_schema)
df.show(n=1, vertical=True)


## 데이터 클리닝

In [None]:
# Keep only rows without nulls whose driver pay and base passenger fare
# are both strictly positive.
# NOTE(review): how='any' drops a row when ANY column is null, including
# columns this notebook discards later - confirm that is intended.
has_positive_amounts = (df['driver_pay'] > 0) & (df['base_passenger_fare'] > 0)
df = df.na.drop(how='any').filter(has_positive_amounts)
df.show(5)  # Preview the first 5 rows after cleaning

## 데이터 변환

In [None]:
# Derive a 'yyyy-MM-dd' pickup date, keep only the three columns used
# below, and mark the result for caching because several downstream
# DataFrames are built from it.
df = (
    df.withColumn("pickup_date", date_format(df["pickup_datetime"], 'yyyy-MM-dd'))
    .select("pickup_date", "base_passenger_fare", "trip_miles")
    .cache()
)

# Per-day aggregates share one grouping and are ordered by date.
trips_by_day = df.groupBy("pickup_date")
per_day_total_revenue_df = trips_by_day.sum("base_passenger_fare").orderBy("pickup_date")
per_day_avg_trip_miles_df = trips_by_day.mean("trip_miles").orderBy("pickup_date")

# Trips shorter than 10 miles.
short_trip_df = df.filter(df["trip_miles"] < 10)


## 데이터 액션 및 저장

In [None]:
# Run one small action per derived DataFrame so each lazy plan is
# executed and a sample row is printed.
print("Sample Short Trip Data: ", short_trip_df.take(1))
print("Sample Per Day Total Revenue: ", per_day_total_revenue_df.take(1))
print("Sample Per Day Average Trip Miles: ", per_day_avg_trip_miles_df.take(1))

# Write every result as a single-partition CSV under output_dir_path,
# replacing any output left by an earlier run.
#
# df and short_trip_df carry no defined row order, so repartition(1) is
# used for them: its shuffle boundary lets the upstream work (and the
# filling of df's cache) run in parallel, whereas coalesce(1) would pull
# that whole computation into one task.
df.repartition(1).write.mode('overwrite').csv(output_dir_path + "df")
short_trip_df.repartition(1).write.mode('overwrite').csv(output_dir_path + "short_trip_df")

# The per-day aggregates were sorted with orderBy; coalesce(1) merges
# their partitions without another shuffle of the sorted rows.
per_day_total_revenue_df.coalesce(1).write.mode('overwrite').csv(output_dir_path + "per_day_total_revenue_df")
per_day_avg_trip_miles_df.coalesce(1).write.mode('overwrite').csv(output_dir_path + "per_day_avg_trip_miles_df")
