# Try this with Dask First

As with Question 5, I'm going to try this first with Dask and then with Spark.

In [1]:
# List the working directory to confirm the exact name of the CSV file to load.
!ls

03_test.ipynb	       fhvhv_tripdata_2021-01.csv
04_pyspark.ipynb       fhvhv_tripdata_2021-01.csv.1
05_taxi_schema.ipynb   fhvhv_tripdata_2021-02.csv
06_spark_sql.ipynb     fhvhv_tripdata_2021-02_homeworkQ3.ipynb
07_groupby_join.ipynb  fhvhv_tripdata_2021-02_homeworkQ4.ipynb
08_rdds.ipynb	       head.csv
data		       HomeworkQ3.ipynb
download_data.sh       test.py
fhvhv


In [11]:
import dask.dataframe as dd

In [12]:
# Point Dask at the raw FHVHV CSV and have both timestamp columns parsed as datetimes.
# The heavy lifting is deferred until a later cell calls .compute().
filename = 'fhvhv_tripdata_2021-02.csv'
df = dd.read_csv(
    filename,
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)

In [17]:
# Count the rows for each dispatching base; .compute() runs the Dask graph and
# hands back the finished counts.
value_counts_df = (
    df['dispatching_base_num']
    .value_counts()
    .compute()
)
# Materialising the whole frame is left disabled because it took around 5 minutes:
#print(df.compute())

# Answer in Dask

In [18]:
# Display the per-base row counts computed in the previous cell.
value_counts_df

B02510    3233664
B02764     965568
B02872     882689
B02875     685390
B02765     559768
B02869     429720
B02887     322331
B02871     312364
B02864     311603
B02866     311089
B02878     305185
B02682     303255
B02617     274510
B02883     251617
B02884     244963
B02882     232173
B02876     215693
B02879     210137
B02867     200530
B02877     198938
B02835     189031
B02888     169167
B02889     138762
B02836     128978
B02880     115716
B02395     112433
B02870     101945
B02800      84277
B02865      76160
B02512      41043
B02844       3502
B03136       1741
Name: dispatching_base_num, dtype: int64

# Now try with pyspark

In [62]:
import pyspark
from pyspark.sql import SparkSession


In [63]:
# Start (or reuse) a local Spark session that may use every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('fhvhv_tripdata_2021-02_homework')
    .getOrCreate()
)

In [64]:
# Download of the raw CSV, left disabled: the file is already in the working
# directory (see the `!ls` listing near the top of the notebook).
#!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.csv

In [65]:
# Optional line count of the raw CSV, left disabled.
#!wc -l fhvhv_tripdata_2021-02.csv

In [66]:
from pyspark.sql import types

In [67]:
# Explicit schema for the FHVHV trip data: one nullable field per
# (column name, Spark type) pair, in file column order.
schema = types.StructType([
    types.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('hvfhs_license_num', types.StringType()),
        ('dispatching_base_num', types.StringType()),
        ('pickup_datetime', types.TimestampType()),
        ('dropoff_datetime', types.TimestampType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('SR_Flag', types.StringType()),
    )
])

I don't need to repeat the CSV read, repartitioning and Parquet write here, as I've already run them once. I've turned those code cells into the markdown snippets below (kept for reference only) and simply start from reading the Parquet files.

df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv('fhvhv_tripdata_2021-02.csv')

df = df.repartition(24)

df.write.parquet('fhvhv/2021/02/')

In [68]:
# Load the previously written Parquet directory with Spark.
# Note: this rebinds `df`, which until now referred to the Dask dataframe.
df = spark.read.parquet('fhvhv/2021/02/')

In [69]:
# Check the column names and types that came back from the Parquet files.
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



In [84]:
# One way: the DataFrame API.
# Import only `desc` instead of `*`; the wildcard import shadows Python builtins
# such as max, min and sum with Spark column functions.
from pyspark.sql.functions import desc

# Count rows per dispatching base and order the bases from most to fewest rows.
# Keep the DataFrame itself in `df_grouped_desc`: .show() only prints and returns
# None, so chaining it onto the assignment left the variable set to None.
df_grouped_desc = (
    df
    .groupBy('dispatching_base_num')
    .count()
    .sort(desc('count'))
)

# Print the first 20 rows (the default for .show()).
df_grouped_desc.show()




+--------------------+-------+
|dispatching_base_num|  count|
+--------------------+-------+
|              B02510|3233664|
|              B02764| 965568|
|              B02872| 882689|
|              B02875| 685390|
|              B02765| 559768|
|              B02869| 429720|
|              B02887| 322331|
|              B02871| 312364|
|              B02864| 311603|
|              B02866| 311089|
|              B02878| 305185|
|              B02682| 303255|
|              B02617| 274510|
|              B02883| 251617|
|              B02884| 244963|
|              B02882| 232173|
|              B02876| 215693|
|              B02879| 210137|
|              B02867| 200530|
|              B02877| 198938|
+--------------------+-------+
only showing top 20 rows





# Answer in Pyspark

In [85]:
# Same builder call as earlier in the notebook; getOrCreate() hands back the
# session that is already running if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('fhvhv_tripdata_2021-02_homework')
    .getOrCreate()
)

In [86]:
# Schema of the FHVHV trip data: seven nullable fields built from
# (column name, Spark type) pairs. The Parquet read in the next cell does not
# take this object as an argument.
schema = types.StructType([
    types.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('hvfhs_license_num', types.StringType()),
        ('dispatching_base_num', types.StringType()),
        ('pickup_datetime', types.TimestampType()),
        ('dropoff_datetime', types.TimestampType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('SR_Flag', types.StringType()),
    )
])

In [87]:
# Read the Parquet directory for 2021/02 into a Spark DataFrame.
df = spark.read.parquet('fhvhv/2021/02/')

In [89]:
# Register the DataFrame as a SQL temporary view so it can be queried by name.
df.createOrReplaceTempView("fhvhv_2021_02")

# Count rows per dispatching base and order the bases from most to fewest rows.
# The trailing backslashes continue the string literal, so Spark receives the
# whole query as a single line of SQL.
sqlDF = spark.sql("SELECT dispatching_base_num, \
                          count(1) \
                     FROM fhvhv_2021_02 \
                    GROUP BY dispatching_base_num \
                    ORDER BY count(1) DESC")
# Print the first rows of the result (the output below stops at 20).
sqlDF.show()



+--------------------+--------+
|dispatching_base_num|count(1)|
+--------------------+--------+
|              B02510| 3233664|
|              B02764|  965568|
|              B02872|  882689|
|              B02875|  685390|
|              B02765|  559768|
|              B02869|  429720|
|              B02887|  322331|
|              B02871|  312364|
|              B02864|  311603|
|              B02866|  311089|
|              B02878|  305185|
|              B02682|  303255|
|              B02617|  274510|
|              B02883|  251617|
|              B02884|  244963|
|              B02882|  232173|
|              B02876|  215693|
|              B02879|  210137|
|              B02867|  200530|
|              B02877|  198938|
+--------------------+--------+
only showing top 20 rows





In [19]:
# Getting just the max time
#
# No earlier cell registers the `fhvhv_2021_02_trip_duration` view, so on a fresh
# kernel the query below failed because the view did not exist. Build it here
# when it is missing: every column of `df` plus `trip_length_sec`, the drop-off
# time minus the pick-up time in whole seconds. An already-registered view with
# that name is left untouched.
# NOTE(review): the duration formula is reconstructed from the result shown later
# in the notebook (2021-02-11 13:40:44 -> 2021-02-12 10:39:44 = 75540 s); confirm
# it matches the original definition.
TRIP_DURATION_VIEW = "fhvhv_2021_02_trip_duration"
if TRIP_DURATION_VIEW not in [table.name for table in spark.catalog.listTables()]:
    df.selectExpr(
        "*",
        "unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime) AS trip_length_sec",
    ).createOrReplaceTempView(TRIP_DURATION_VIEW)

# Longest trip duration (in seconds) across the whole view.
sqlDF = spark.sql("SELECT MAX(trip_length_sec) \
                     FROM fhvhv_2021_02_trip_duration")
sqlDF.show()



+--------------------+
|max(trip_length_sec)|
+--------------------+
|               75540|
+--------------------+



                                                                                

In [21]:
# Combine the above into another way of finding out: instead of only the maximum
# value, return the full row(s) whose trip_length_sec equals that maximum, using
# a scalar subquery in the WHERE clause.
# NOTE: `==` is accepted by Spark SQL here (the output below shows the query ran);
# standard SQL spells the comparison `=`.
# This cell relies on the `fhvhv_2021_02_trip_duration` view already being registered.

sqlDF = spark.sql("SELECT hvfhs_license_num, dispatching_base_num, \
                          pickup_datetime, dropoff_datetime, trip_length_sec \
                     FROM fhvhv_2021_02_trip_duration \
                    WHERE trip_length_sec == ( \
                        SELECT MAX(trip_length_sec) \
                          FROM fhvhv_2021_02_trip_duration) \
                  ")
sqlDF.show()

[Stage 9:>                                                          (0 + 3) / 3]

+-----------------+--------------------+-------------------+-------------------+---------------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|trip_length_sec|
+-----------------+--------------------+-------------------+-------------------+---------------+
|           HV0005|              B02510|2021-02-11 13:40:44|2021-02-12 10:39:44|          75540|
+-----------------+--------------------+-------------------+-------------------+---------------+



