In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("241210_04_yellowtaxi_trip_count").getOrCreate()
spark

24/12/11 16:31:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/11 16:31:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
import os

trip_files = '/trips/*'
zone_file = 'taxi+_zone_lookup.csv'
directory = os.path.join(os.getcwd(), 'data')

In [5]:
trips_df = spark.read.csv(f'file:///{directory}/{trip_files}', inferSchema=True, header=True)

                                                                                

In [6]:
zone_df = spark.read.csv(f'file:///{directory}/{zone_file}', inferSchema=True, header=True)

In [7]:
# # taxi+_zone_lookup.csv 파일 불러오기 
# df = spark.read.format("csv")\
#     .option("header",'true')\
#     .option('inferSchema', 'true')\
#     .load("data/taxi+_zone_lookup.csv")

# # yellow_tripdata_2021-01.csv 파일 불러오기 
# trip_202101_df = spark.read.format("csv")\
#     .option("header", 'true')\
#     .option('inferSchema', 'true')\
#     .load('data/yellow_tripdata_2021-01.csv')

# # yellow_tripdata_2021-02.csv 파일 불러오기 
# trip_202102_df = spark.read.format("csv")\
#     .option("header", 'true')\
#     .option('inferSchema', 'true')\
#     .load('data/yellow_tripdata_2021-02.csv')

In [8]:
trips_df.printSchema()
zone_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [9]:
trips_df.show(5)
zone_df.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       2| 2021-03-01 00:22:02|  2021-03-01 00:23:22|              1|          0.0|         1|                 N|         264|         264|           2|        3.0|  0.5|    0.5|       0.0|         0.0|                  0.3

In [10]:
trips_df.createOrReplaceTempView('trips')
zone_df.createOrReplaceTempView('zone')

In [11]:
query = '''
SELECT
t.VendorID,
To_DATE(t.tpep_pickup_datetime) as pickup_date,
To_DATE(t.tpep_dropoff_datetime) as dropoff_date,
HOUR(t.tpep_pickup_datetime) as pickup_time,
HOUR(t.tpep_dropoff_datetime) as dropoff_time,
t.passenger_count,
t.trip_distance,
t.tip_amount,
t.total_amount,
t.payment_type,
pz.Zone as pickup_zone,
dz.Zone as dropoff_zone,
T.congestion_surcharge  

FROM trips t

LEFT JOIN zone pz ON t.PULocationID = pz.LocationID
LEFT JOIN zone dz ON t.DOLocationID = dz.LocationID
'''
comb_df = spark.sql(query)
comb_df.show()

+--------+-----------+------------+-----------+------------+---------------+-------------+----------+------------+------------+--------------------+--------------------+--------------------+
|VendorID|pickup_date|dropoff_date|pickup_time|dropoff_time|passenger_count|trip_distance|tip_amount|total_amount|payment_type|         pickup_zone|        dropoff_zone|congestion_surcharge|
+--------+-----------+------------+-----------+------------+---------------+-------------+----------+------------+------------+--------------------+--------------------+--------------------+
|       2| 2021-03-01|  2021-03-01|          0|           0|              1|          0.0|       0.0|         4.3|           2|                  NV|                  NV|                 0.0|
|       2| 2021-03-01|  2021-03-01|          0|           0|              1|          0.0|       0.0|         3.8|           2|      Manhattanville|      Manhattanville|                 0.0|
|       2| 2021-03-01|  2021-03-01|          

In [12]:
# comb_df를 view 로 등록 
comb_df.createOrReplaceTempView("comb")

In [13]:
query = '''
SELECT pickup_date, pickup_time
FROM comb
WHERE pickup_time > 0 
'''

spark.sql(query).show()

+-----------+-----------+
|pickup_date|pickup_time|
+-----------+-----------+
| 2021-02-28|         23|
| 2021-02-28|         23|
| 2021-02-28|         23|
| 2021-02-28|         23|
| 2021-02-28|         23|
| 2021-02-28|         23|
| 2021-02-28|         23|
| 2021-03-01|         22|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
+-----------+-----------+
only showing top 20 rows



In [14]:
query = '''
SELECT pickup_date, pickup_time
FROM comb
WHERE pickup_date < '2020-12-31' 
'''

spark.sql(query).show()



+-----------+-----------+
|pickup_date|pickup_time|
+-----------+-----------+
| 2009-01-01|          0|
| 2008-12-31|         23|
| 2009-01-01|          0|
| 2009-01-01|          0|
| 2009-01-01|          0|
| 2009-01-01|          0|
| 2009-01-01|          0|
| 2009-01-01|          1|
| 2009-01-01|          0|
| 2008-12-31|         23|
| 2008-12-31|         23|
| 2008-12-31|         23|
| 2008-12-31|         23|
| 2009-01-01|          0|
| 2009-01-01|          0|
| 2009-01-01|          0|
| 2009-01-01|         16|
| 2009-01-01|         16|
| 2009-01-01|          0|
| 2009-01-01|          0|
+-----------+-----------+
only showing top 20 rows



                                                                                

In [None]:
comb_df.describe()

In [None]:
spark.sql(query).explain()

In [16]:
query1 = '''
SELECT pickup_date, pickup_time
FROM comb
WHERE pickup_date < '2020-12-31'

'''
spark.sql(query1)

DataFrame[pickup_date: date, pickup_time: int]

In [17]:
#실행 계획, 실행 결과 (4040)
query2 = '''
SELECT pickup_date, pickup_time
FROM comb
WHERE pickup_time > 0 and pickup_time <= 12
'''
spark.sql(query2).show()



+-----------+-----------+
|pickup_date|pickup_time|
+-----------+-----------+
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
+-----------+-----------+
only showing top 20 rows



                                                                                

In [31]:
#실행 계획, 실행 결과 (4040)
query3 = '''
select pickup_date , count(*) as trip_count
from comb 
where pickup_time > 0
group by pickup_date
order by pickup_date
'''
spark.sql(query3).show()



+-----------+----------+
|pickup_date|trip_count|
+-----------+----------+
| 2002-12-31|         1|
| 2003-01-05|         1|
| 2004-04-04|         1|
| 2008-12-31|        26|
| 2009-01-01|        46|
| 2020-12-31|        16|
| 2021-01-01|     23649|
| 2021-01-02|     34045|
| 2021-01-03|     25881|
| 2021-01-04|     44210|
| 2021-01-05|     46539|
| 2021-01-06|     49210|
| 2021-01-07|     50013|
| 2021-01-08|     50122|
| 2021-01-09|     39407|
| 2021-01-10|     29275|
| 2021-01-11|     46568|
| 2021-01-12|     49878|
| 2021-01-13|     51045|
| 2021-01-14|     52763|
+-----------+----------+
only showing top 20 rows



                                                                                

In [20]:
# 1. 운행 거리와 요금의 상관 관계 분석
query_1 = '''
SELECT trip_distance, ROUND(AVG(total_amount),2) as avg_total_amount
FROM comb
GROUP BY trip_distance
ORDER BY avg_total_amount DESC
'''
spark.sql(query_1).show() 



+-------------+----------------+
|trip_distance|avg_total_amount|
+-------------+----------------+
|       964.27|          2413.8|
|        427.7|          2292.4|
|       821.54|          2059.3|
|        267.7|          1108.2|
|        260.5|           894.2|
|        271.4|          872.54|
|       344.88|           865.3|
|        323.0|          838.05|
|       215.95|          832.54|
|        282.1|          736.85|
|        270.2|           715.3|
|       161.38|          682.48|
|       258.98|           654.8|
|       170.26|           653.1|
|        99.48|          645.21|
|        220.4|          631.42|
|       247.37|           621.3|
|       224.16|           602.3|
|       103.98|           601.8|
|       277.74|           600.3|
+-------------+----------------+
only showing top 20 rows





In [30]:
# 2. 피크 시간대 요금 분석 - 가장 운행 기록이 많은 시간 대

query_2 = '''
SELECT pickup_time, count(*) as trip_count, Round(SUM(total_amount),2) as Total_amount
FROM comb
GROUP BY pickup_time
ORDER BY trip_count DESC
'''
spark.sql(query_2).show(5) 



+-----------+----------+-------------+
|pickup_time|trip_count| Total_amount|
+-----------+----------+-------------+
|         15|   1091871|2.034362246E7|
|         18|   1087217|2.020262881E7|
|         17|   1085226|2.093609435E7|
|         14|   1084111|1.970667334E7|
|         16|   1043498|2.056582042E7|
+-----------+----------+-------------+
only showing top 5 rows





In [34]:
# 3. 지불 유형별 요금, 팁 분석
query_3 = '''
SELECT payment_type, count(*) AS payment_count,SUM(tip_amount) as total_tips, SUM(total_amount) AS total_amount
FROM comb
GROUP BY payment_type
ORDER BY payment_type
'''
spark.sql(query_3).show()



+------------+-------------+-------------------+-------------------+
|payment_type|payment_count|         total_tips|       total_amount|
+------------+-------------+-------------------+-------------------+
|        null|       834028| 1335652.4599999941|2.656643904000006E7|
|           1|     10716903|3.086741502000679E7|2.024484666710695E8|
|           2|      3308670| 1138.6399999999999|5.102389678972281E7|
|           3|        81434|-1092.5699999999997| 1471426.3099999826|
|           4|        59664|  352.5900000000001| -165336.9399999972|
|           5|            1|                0.0|               17.8|
+------------+-------------+-------------------+-------------------+





In [37]:
# 4. 승차지역/하차지역별 평균거리, 요금

query_4 = '''
SELECT pickup_zone, dropoff_zone, avg(trip_distance) as avg_trip_distance, sum(total_amount) as total_amount
FROM comb
GROUP BY pickup_zone,dropoff_zone
ORDER BY avg_trip_distance DESC
'''
spark.sql(query_4).show()



+--------------------+--------------------+------------------+------------------+
|         pickup_zone|        dropoff_zone| avg_trip_distance|      total_amount|
+--------------------+--------------------+------------------+------------------+
|     Jamaica Estates|      Pelham Parkway|         205266.12|             47.63|
|    Inwood Hill Park|        Saint Albans|         167305.08|              76.0|
|     Oakland Gardens|Flatbush/Ditmas Park|         131123.07|             61.18|
|            Longwood|              Inwood|          77093.46|             19.04|
|   East Williamsburg|Riverdale/North R...|         70968.425|            106.44|
|              Inwood|         Parkchester| 65796.09250000001|128.29000000000002|
|        Borough Park|              Corona|          63336.38|101.24000000000001|
|       South Jamaica|         City Island|          56564.13|             51.08|
| Kingsbridge Heights|Queensbridge/Rave...|          54215.12| 66.53999999999999|
|     Jamaica Es

                                                                                

In [82]:
# 5. 팁의 비율에 따른 거리

query_5 = '''
SELECT 
    (tip_amount/total_amount)*100 as tips_percentage,
    AVG(trip_distance) AS avg_trip_distance
FROM comb
GROUP BY tips_percentage
ORDER BY avg_trip_distance DESC
'''
spark.sql(query_5).show()



+------------------+-----------------+
|   tips_percentage|avg_trip_distance|
+------------------+-----------------+
|17.480508822322527|        233409.53|
| 9.847560975609756|        211716.26|
|  9.90489445604268|        184721.27|
| 5.301524188204108|        160028.23|
|15.181370353784148|         157437.0|
| 2.933411557641537|        139554.99|
|15.209345362017118|        128684.89|
|17.816901408450704|        124862.06|
| 9.696569920844327|        122329.58|
|17.776491024898668|        114098.94|
| 13.83960255500355|        108698.15|
| 15.19947961838682|        102002.08|
| 2.035710224041851|        101478.96|
|11.435726210350584|         98633.14|
| 4.429232937386753|         97981.59|
|18.621420996818664|         95884.15|
|14.044032444959441|         95861.41|
| 24.36785311644387|          94574.8|
|17.501180916391117|         94346.36|
| 2.120669056152927|         94305.28|
+------------------+-----------------+
only showing top 20 rows





In [83]:
# 팁 카테고리 지정 

query_5 = '''
SELECT 
    CASE 
        WHEN tips_percentage < 20 THEN "0~20%"
        WHEN tips_percentage < 40 THEN "20~40%"
        WHEN tips_percentage < 60 THEN "40~60%"
        WHEN tips_percentage < 80 THEN "60~80%"
        ELSE "80% 이상"
    END as tip_category,
    AVG(trip_distance) AS avg_trip_distance
FROM (
    SELECT 
        (tip_amount / total_amount) * 100 as tips_percentage,
        trip_distance
    FROM comb
) AS subquery
GROUP BY tip_category
ORDER BY tip_category
'''

spark.sql(query_5).show()




+------------+------------------+
|tip_category| avg_trip_distance|
+------------+------------------+
|       0~20%| 7.048115131400676|
|      20~40%| 3.240359212749815|
|      40~60%| 7.908702209918381|
|      60~80%| 1.268032936870997|
|    80% 이상|1.0794818385264247|
+------------+------------------+



                                                                                

In [59]:
#6. 지역별 혼잡도 추가 요금 확인
query_6 = '''
SELECT pickup_zone, dropoff_zone, avg(congestion_surcharge) as avg_congestion_surcharge
FROM comb
GROUP BY pickup_zone, dropoff_zone
ORDER BY avg_congestion_surcharge DESC
'''
spark.sql(query_6).show()



+--------------------+--------------------+------------------------+
|         pickup_zone|        dropoff_zone|avg_congestion_surcharge|
+--------------------+--------------------+------------------------+
|Long Island City/...|        Bloomingdale|                     2.5|
|Washington Height...|          Kensington|                     2.5|
|     Hammels/Arverne|                SoHo|                     2.5|
|Long Island City/...|     Highbridge Park|                     2.5|
|     Lenox Hill West|             Oakwood|                     2.5|
|        West Village|South Beach/Donga...|                     2.5|
|        Battery Park|     Windsor Terrace|                     2.5|
|      Yorkville East|Bay Terrace/Fort ...|                     2.5|
|     Carroll Gardens|Upper West Side S...|                     2.5|
|        Bloomingdale|           Chinatown|                     2.5|
|Saint Michaels Ce...|      Midtown Center|                     2.5|
|       Port Richmond|            

                                                                                

In [65]:
spark.stop()