In [83]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, count, to_date
from pyspark.sql.utils import AnalysisException
import requests
import os
import pandas as pd

In [84]:
# Spark session 초기화
spark = (
    SparkSession.builder.appName("TLC DataSet Analyze")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.port", "7077")
    .config("spark.ui.port", "4050")
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "15g")
    .getOrCreate()
)

In [85]:
# # 저장할 폴더 경로
# save_folder = "/Users/admin/Desktop/docker/W5M1/parquet_files"
# os.makedirs(save_folder, exist_ok=True)  # 폴더가 없으면 생성

# # 데이터 종류 및 기간 설정
# categories = [
#     "yellow_tripdata"
# ]  # , "green_tripdata", "fhv_tripdata", "fhvhv_tripdata"]

# # 날짜 범위 생성
# months = (
#     pd.date_range(start="2022-05-01", end="2022-08-01", freq="MS")
#     .strftime("%Y-%m")
#     .tolist()
# )

# # URL을 생성하여 데이터를 다운로드 및 저장
# base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
# for category in categories:
#     for month in months:
#         file_name = f"{category}_{month}.parquet"
#         url = f"{base_url}{file_name}"
#         file_path = os.path.join(save_folder, file_name)

#         # 데이터 다운로드
#         response = requests.get(url)

#         # 응답 상태 코드 확인
#         if response.status_code == 200:
#             # 파일을 로컬 폴더에 저장
#             with open(file_path, "wb") as file:
#                 file.write(response.content)
#         else:
#             print(f"Failed to retrieve the data from {url}")

In [86]:
# 데이터 정제
def data_purification(file_name, date):

    # parquet file 읽기
    file_path = f"/Users/admin/Desktop/docker/W5M1/parquet_files/{file_name}_tripdata_{date}.parquet"  # 로컬
    print(f"Attempting to read from: {file_path}")
    try:
        # 파일 읽기
        read_df = spark.read.parquet(file_path)
    except AnalysisException:
        print(f"File not found: {file_path}")
        return None  # 파일이 없으면 None 반환

    # date를 이용해 해당 월의 첫 날과 마지막 날을 계산
    start_date = f"{date}-01"
    end_date = f"{date}-{(pd.Timestamp(start_date) + pd.offsets.MonthEnd(1)).day}"

    # 이상치, 음수값 제거
    read_df = (
        read_df.filter(col("tpep_dropoff_datetime") > col("tpep_pickup_datetime"))
        .filter(col("trip_distance") > 0)
        .filter(col("passenger_count") >= 0)
        .filter(col("fare_amount") >= 0)
        .filter(col("extra") >= 0)
        .filter(col("mta_tax") >= 0)
        .filter(col("tip_amount") >= 0)
        .filter(col("tolls_amount") >= 0)
        .filter(col("improvement_surcharge") >= 0)
        .filter(col("total_amount") >= 0)
        .filter(col("congestion_surcharge") >= 0)
        .filter(col("Airport_fee") >= 0)
        .filter(
            (col("tpep_pickup_datetime") >= start_date)
            & (col("tpep_pickup_datetime") <= end_date)
            & (col("tpep_dropoff_datetime") >= start_date)
            & (col("tpep_dropoff_datetime") <= end_date)
        )
    )

    # 결측치 있는 행 제거
    read_df = read_df.dropna()

    read_df = read_df.withColumn(
        "total_fee",
        (col("total_amount") + col("congestion_surcharge") + col("airport_fee")),
    )

    read_df = read_df.select("tpep_pickup_datetime", "trip_distance", "total_fee")
    return read_df

# TLC 데이터 읽어오기


- job 0 ~ 3 해당
- 분명히 union() 함수가 있는데 지금 job에서는 작동하지 않음
- rdd에 대해 이후에 나올 union도 생각해서 union()자체는 다음으로 미룬듯

In [87]:
# Data 타입, 범위
categories = ["yellow"]  # , "green" , "fhv", "fhvhv"]

# 날짜 범위 생성
months = (
    pd.date_range(start="2022-05-01", end="2022-08-01", freq="MS")
    .strftime("%Y-%m")
    .tolist()
)

# DataFrames 합치기
rdd = None
for category in categories:
    for month in months:
        print(">>>>>", category, month)
        read_df = data_purification(category, month)
        if read_df is not None:  # df가 None이 아닐 때만 결합
            if rdd is None:
                rdd = read_df
            else:
                rdd = rdd.union(read_df)

>>>>> yellow 2022-05
Attempting to read from: /Users/admin/Desktop/docker/W5M1/parquet_files/yellow_tripdata_2022-05.parquet
>>>>> yellow 2022-06
Attempting to read from: /Users/admin/Desktop/docker/W5M1/parquet_files/yellow_tripdata_2022-06.parquet
>>>>> yellow 2022-07
Attempting to read from: /Users/admin/Desktop/docker/W5M1/parquet_files/yellow_tripdata_2022-07.parquet
>>>>> yellow 2022-08
Attempting to read from: /Users/admin/Desktop/docker/W5M1/parquet_files/yellow_tripdata_2022-08.parquet


# 총 여행 수 계산

- job 4~5 해당
- 여기서 job 5에서 union() 실행 후 새로운 Stage로 넘어감
  - union() -> shuffling 일아난 것
- count() - Action

In [88]:
print(f"Total trip count sum: {rdd.count()}")



Total trip count sum: 12356633


                                                                                

# 총 수익 (요금의 합계) 계산

- job 6~7 해당
- 마찬가지로 job 6가 union(), job 7이 union() 이후에 만들어진 새로운 Stage에서 연산 수행

In [89]:
# total_fee 컬럼의 총합 계산
total_fee_sum = rdd.agg(sum("total_fee").alias("total_fee_sum")).collect()[0][
    "total_fee_sum"
]

print(f"Total fee sum: {total_fee_sum}")



Total fee sum: 297069546.05300593


                                                                                

# 평균 여행 거리

- job 8~9 해당
- 마찬가지로 job 8이 union(), job 9가 union() 이후에 만들어진 새로운 Stage에서 연산 수행

In [90]:
# 평균 trip_distance와 trip_duration 계산
average_trip_miles = rdd.agg(
    avg("trip_distance").alias("average_trip_miles"),
).collect()[0]["average_trip_miles"]


print(f"Average trip distance: {average_trip_miles:.2f} miles")



Average trip distance: 3.68 miles


                                                                                

### 하루 단위로 변환

In [91]:
# tpep_pickup_datetime에서 날짜 부분만 추출
groupby_date_rdd = rdd.withColumn("date", to_date(col("tpep_pickup_datetime")))

# 날짜별로 그룹화하여 총 수와 총 total_fee를 계산
groupby_date_rdd = groupby_date_rdd.groupBy("date").agg(
    count("date").alias("total_count"),
    sum("total_fee").alias("total_fee")
)

# 날짜 순으로 정렬
groupby_date_rdd = groupby_date_rdd.orderBy("date")

# 하루의 여행 수 계산

- job 10~13 해당
- 위의 union()과 비슷하게 '하루 단위로 변환'과정을 여기서 job으로 만드는듯
- job 10: union()로 새로운 Stage 생성
- job 11: groupBy로 새로운 Stage 생성
- job 12: orderBy()로 새로운 Stage 생성
- job 13: print()함

In [92]:
for date, total_count, total_fee in groupby_date_rdd.collect():
    print(f"Date: {date}, Total Count: {total_count}")



Date: 2022-05-01, Total Count: 93332
Date: 2022-05-02, Total Count: 89644
Date: 2022-05-03, Total Count: 114348
Date: 2022-05-04, Total Count: 119171
Date: 2022-05-05, Total Count: 125006
Date: 2022-05-06, Total Count: 118909
Date: 2022-05-07, Total Count: 110531
Date: 2022-05-08, Total Count: 97405
Date: 2022-05-09, Total Count: 102645
Date: 2022-05-10, Total Count: 115402
Date: 2022-05-11, Total Count: 121332
Date: 2022-05-12, Total Count: 126287
Date: 2022-05-13, Total Count: 122530
Date: 2022-05-14, Total Count: 118204
Date: 2022-05-15, Total Count: 100066
Date: 2022-05-16, Total Count: 108367
Date: 2022-05-17, Total Count: 117256
Date: 2022-05-18, Total Count: 119464
Date: 2022-05-19, Total Count: 122227
Date: 2022-05-20, Total Count: 117451
Date: 2022-05-21, Total Count: 116530
Date: 2022-05-22, Total Count: 95210
Date: 2022-05-23, Total Count: 105349
Date: 2022-05-24, Total Count: 115522
Date: 2022-05-25, Total Count: 116369
Date: 2022-05-26, Total Count: 115676
Date: 2022-05-27

                                                                                

# 하루의 통 수익 계산

- job 14 해당
- job 14: print()함

In [93]:
for date, total_count, total_fee in groupby_date_rdd.collect():
    print(f"Date: {date}, Total Fee: {total_fee}")

Date: 2022-05-01, Total Fee: 2348678.640000481
Date: 2022-05-02, Total Fee: 2167006.4300011224
Date: 2022-05-03, Total Fee: 2695469.619999525
Date: 2022-05-04, Total Fee: 2779941.249999259
Date: 2022-05-05, Total Fee: 2976566.919998659
Date: 2022-05-06, Total Fee: 2820044.0299989334
Date: 2022-05-07, Total Fee: 2444765.4600002267
Date: 2022-05-08, Total Fee: 2348931.5500006084
Date: 2022-05-09, Total Fee: 2487003.3600002327
Date: 2022-05-10, Total Fee: 2725806.8099994455
Date: 2022-05-11, Total Fee: 2860515.179999027
Date: 2022-05-12, Total Fee: 3034773.3099984694
Date: 2022-05-13, Total Fee: 2978493.229998475
Date: 2022-05-14, Total Fee: 2764630.109999187
Date: 2022-05-15, Total Fee: 2526912.670000028
Date: 2022-05-16, Total Fee: 2651111.2099997187
Date: 2022-05-17, Total Fee: 2963292.359998764
Date: 2022-05-18, Total Fee: 2990094.2199986842
Date: 2022-05-19, Total Fee: 3075315.3999983696
Date: 2022-05-20, Total Fee: 2928241.229998664
Date: 2022-05-21, Total Fee: 2818106.8999989484
Da

In [82]:
# 스파크 세션 정리
spark.stop()