# W5M1 - RDD

## 라이브러리 및 세션 설정

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import isnan, when, count, col, isnull, avg, min
import pyspark.sql.functions as F
from operator import add

# Create (or reuse) the Spark session for this notebook. It connects to the
# master at spark://spark-master:7077 and sets executor memory and cores
# explicitly.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .appName('W5M1')
    .config('spark.executor.memory', '4gb')
    .config("spark.executor.cores", "5")
    .getOrCreate()
)


## 데이터 로딩

In [None]:
# HDFS location of the input parquet file and of the directory that receives
# the results written at the end of the notebook.
input_file_path = "hdfs://spark-master:9000/user/hduser/hdfs_data/fhvhv_tripdata_2023-01.parquet"
output_dir_path = "hdfs://spark-master:9000/user/spark_user/W5M1_output/"

# Input format selector and the label attached to the loaded RDD.
ext = "parquet"
name = "TLC-2023-01"

def load_data_rdd(spark_session, file_path, extension, name):
    """Read a file through the session's DataFrame reader and return its RDD.

    Args:
        spark_session: Session whose ``read`` attribute provides the reader.
        file_path: Location of the file to load.
        extension: Input format selector; ``"csv"`` or ``"parquet"``.
        name: Label assigned to the resulting RDD via ``setName``.

    Returns:
        The ``.rdd`` of the loaded data, after its name has been set.

    Raises:
        NotImplementedError: If ``extension`` is neither ``"csv"`` nor
            ``"parquet"``.
    """
    if extension == "csv":
        data_rdd = spark_session.read.csv(file_path).rdd
    elif extension == "parquet":
        data_rdd = spark_session.read.parquet(file_path).rdd
    else:
        # The exception name must be spelled correctly; a misspelling would
        # surface as a confusing NameError instead of the intended error.
        raise NotImplementedError(
            f"Unsupported file extension: {extension!r} "
            "(expected 'csv' or 'parquet')"
        )
    data_rdd.setName(name)
    return data_rdd

# Load the input as a named RDD, then fetch the first record as a preview.
data_rdd = load_data_rdd(
    spark_session=spark,
    file_path=input_file_path,
    extension=ext,
    name=name,
)
data_rdd.take(1)


## 데이터 클리닝

In [None]:
def remove_row_w_none_val(row):
    """Return ``row`` unchanged when it holds no None value, else None.

    Args:
        row: An iterable of field values.

    Returns:
        ``row`` itself if no element is None; otherwise None.
    """
    if any(val is None for val in row):
        return None
    return row

# Keep only the rows for which remove_row_w_none_val returns a truthy value,
# printing the row count before and after the filter.
print("Before data cleaning: ", data_rdd.count())
data_rdd = data_rdd.filter(remove_row_w_none_val)
print("After data cleaning: ", data_rdd.count())


## 변환 로직

In [None]:
def remove_non_positive_fare(row):
    """Return ``row`` when its base passenger fare is positive, else None.

    Args:
        row: Record exposing a ``base_passenger_fare`` attribute.

    Returns:
        ``row`` itself if ``row.base_passenger_fare > 0``; otherwise None.
    """
    return row if row.base_passenger_fare > 0 else None

# Keep only the rows for which remove_non_positive_fare returns a truthy
# value, printing the row count before and after the filter.
print("Before removing zero or negative fare: ", data_rdd.count())
data_rdd = data_rdd.filter(remove_non_positive_fare)
print("After removing zero or negative fare: ", data_rdd.count())


## 데이터 매핑 및 변환

In [None]:
def extract_and_convert_relevant_columns(row):
    """Build a three-field Row from a trip record.

    Args:
        row: Record exposing ``pickup_datetime``, ``trip_miles`` and
            ``base_passenger_fare`` attributes.

    Returns:
        A ``Row`` with ``pickup_datetime`` set to the result of calling
        ``.date()`` on the original value, and ``trip_miles`` and
        ``base_passenger_fare`` copied unchanged.
    """
    pickup_date = row.pickup_datetime.date()
    return Row(
        pickup_datetime=pickup_date,
        trip_miles=row.trip_miles,
        base_passenger_fare=row.base_passenger_fare,
    )

# Map every record through extract_and_convert_relevant_columns, then fetch
# the first converted record as a preview.
data_rdd = data_rdd.map(extract_and_convert_relevant_columns)
data_rdd.take(1)


## 집계

In [None]:
# Number of records in data_rdd. This is a count, so no unit is printed.
total_number_of_trips = data_rdd.count()
print(f"total_number_of_trips: {total_number_of_trips}")

# Sum of base_passenger_fare over all records.
total_revenue = data_rdd.map(lambda row: row.base_passenger_fare).reduce(add)
print(f"total_revenue: {round(total_revenue, 2)}$")

# Mean of trip_miles over all records. The expression must sit inside braces
# to be evaluated; without them the f-string prints the source text literally.
average_trip_distance = data_rdd.map(lambda row: row.trip_miles).mean()
print(f"average_trip_distance: {round(average_trip_distance, 2)} miles")

# Trips per pickup date. After reduceByKey the elements are (date, count)
# pairs, and sortByKey() with no arguments orders them by key, ascending.
# The first positional parameter of sortByKey is `ascending`, not a key
# function, so no lambda is passed.
number_of_trips_per_day = (
    data_rdd.map(lambda row: (row.pickup_datetime, 1))
    .reduceByKey(add)
    .sortByKey()
)
number_of_trips_per_day.take(20)

# Summed base fare per pickup date, ordered by date ascending.
total_revenue_per_day = (
    data_rdd.map(lambda row: (row.pickup_datetime, row.base_passenger_fare))
    .reduceByKey(add)
    .sortByKey()
)
total_revenue_per_day.take(20)


## 데이터 출력

In [None]:
# Write the three scalar aggregates as "name, value" text lines. coalesce(1)
# reduces the RDD to a single partition before saving.
result = spark.sparkContext.parallelize(
    [
        f"total_number_of_trips, {total_number_of_trips}",
        f"total_revenue, {total_revenue}",
        f"average_trip_distance, {average_trip_distance}",
    ]
)
result.coalesce(1).saveAsTextFile(f"{output_dir_path}result.txt")

# Write the per-day aggregates as pickle output, again from a single partition.
number_of_trips_per_day.coalesce(1).saveAsPickleFile(
    f"{output_dir_path}number_of_trips_per_day"
)
total_revenue_per_day.coalesce(1).saveAsPickleFile(
    f"{output_dir_path}total_revenue_per_day"
)
